# make a fresh dataframe with id, embedding, class

#### Importing cancerous parquet file and adding 1 as the class label to all of them

In [1]:
import pandas as pd

In [2]:
import pandas as pd
import numpy as np

# Load DataFrame
df = pd.read_parquet("/home/azureuser/dna_sequencing/Anushka/sampled_embeddings/embeddings_forw_can.parquet")

# Prepare data
embedding_cols = [col for col in df.columns if col.startswith("emb_")]
embeddings = df[embedding_cols].values.astype(np.float32)  # (n_samples, 768)
ids = df["id"].values  # optional: save other columns too

In [3]:
# Save embeddings as .npy (very fast and efficient)
np.save("/home/azureuser/dna_sequencing/model_training/embeddings_forw_can.npy", embeddings)

# Save ids to CSV
pd.DataFrame({"id": ids}).to_csv("/home/azureuser/dna_sequencing/model_training/embeddings_ids.csv", index=False)

In [7]:
import pandas as pd
import numpy as np

# Load DataFrame
df = pd.read_parquet("/home/azureuser/dna_sequencing/Anushka/sampled_embeddings/embeddings_forw_noncan.parquet")

# Prepare data
embedding_cols = [col for col in df.columns if col.startswith("emb_")]
embeddings = df[embedding_cols].values.astype(np.float32)  # (n_samples, 768)
ids = df["id"].values  # optional: save other columns too

In [8]:
# Save embeddings as .npy (very fast and efficient)
np.save("/home/azureuser/dna_sequencing/model_training/embeddings_forw_noncan.npy", embeddings)

# Save ids to CSV
pd.DataFrame({"id": ids}).to_csv("/home/azureuser/dna_sequencing/model_training/embeddings_ids2.csv", index=False)

In [4]:
import numpy as np
import pandas as pd

# === Load embeddings and IDs ===

# Cancerous
emb_cancer = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_forw_can.npy")
ids_cancer = pd.read_csv("/home/azureuser/dna_sequencing/model_training/embeddings_ids.csv")["id"]
labels_cancer = np.ones(len(ids_cancer), dtype=int)  # label 1

# Non-cancerous
emb_noncan = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_forw_noncan.npy")
ids_noncan = pd.read_csv("/home/azureuser/dna_sequencing/model_training/embeddings_ids2.csv")["id"]
labels_noncan = np.zeros(len(ids_noncan), dtype=int)  # label 0

# === Combine all ===
# Stack embeddings
X = np.vstack([emb_cancer, emb_noncan])

# Combine IDs and labels
all_ids = pd.concat([ids_cancer, ids_noncan], ignore_index=True)
all_labels = np.concatenate([labels_cancer, labels_noncan])

# === Final DataFrame ===
df_combined = pd.DataFrame({
    "id": all_ids,
    "label": all_labels,
    "embedding": list(X)  # list of 768-d vectors per row
})

# ✅ Preview
print(df_combined.head())
print(df_combined.shape)

              id  label                                          embedding
0  SRR5177930.19      1  [0.078438416, 0.23246941, -0.09033844, -0.5846...
1  SRR5177930.28      1  [0.8875472, -0.42859563, -0.34571132, -0.30669...
2  SRR5177930.38      1  [-1.5179863, -0.49738654, -0.35797656, 0.02045...
3  SRR5177930.39      1  [0.07762874, -0.4367457, -0.55085236, -1.23886...
4  SRR5177930.58      1  [-1.0721344, -0.41546282, 0.47115257, 0.017932...
(1153628, 3)


In [5]:
# Shuffle the rows of df_combined randomly
df_combined_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined_shuffled.head()

Unnamed: 0,id,label,embedding
0,SRR5177930.17201757,1,"[-0.43428636, 0.632174, -0.4035869, -0.9728157..."
1,SRR6269879.14204815,0,"[-0.22070847, 0.4197767, -0.636437, -0.8004252..."
2,SRR6269879.15909519,0,"[-0.36173493, 0.4392325, -0.24926521, 0.600837..."
3,SRR6269879.38607693,0,"[0.36612132, 1.3904184, -0.49522582, -1.127522..."
4,SRR6269879.16107409,0,"[-1.9264296, 1.6691118, 0.05958501, -1.8638941..."


In [6]:
import numpy as np
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import psutil
import os
import gc
import joblib

# === System-aware configuration ===
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)
print(f"🧠 CPU Cores: {NUM_CORES}")
print(f"💾 Available RAM: {RAM_GB:.2f} GB")

# === Load and Prepare Data ===
df = df_combined_shuffled.copy()
X = np.stack(df["embedding"].values).astype(np.float32)
y = df["label"].values.astype(np.uint8)
del df
gc.collect()

# === Train/Test Split ===
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Define Objective Function ===
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "n_jobs": NUM_CORES - 1,
        "random_state": 42,
        "verbose": 0
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred, average="weighted")

# === Run Optuna Study ===
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, show_progress_bar=True)

# === Output Best Params ===
print(f"\n✅ Best Params: {study.best_params}")
print(f"⭐ Best F1 Score: {study.best_value:.4f}")

# === Train Final Model ===
best_model = RandomForestClassifier(
    **study.best_params,
    n_jobs=NUM_CORES - 1,
    random_state=42,
    verbose=1
)
best_model.fit(X, y)

# === Save Model ===
joblib.dump(best_model, "forw_final_randomforest_optuna_model.joblib")
print("💾 Saved as: forw_final_randomforest_optuna_model.joblib")

  from .autonotebook import tqdm as notebook_tqdm


🧠 CPU Cores: 8
💾 Available RAM: 33.37 GB


[I 2025-06-13 19:14:34,630] A new study created in memory with name: no-name-89c0547b-dac1-4d2f-9dea-516167c1a4b3
Best trial: 0. Best value: 0.638358:   4%|▍         | 1/25 [08:16<3:18:33, 496.41s/it]

[I 2025-06-13 19:22:51,042] Trial 0 finished with value: 0.638358266111603 and parameters: {'n_estimators': 186, 'max_depth': 39, 'max_features': 'log2'}. Best is trial 0 with value: 0.638358266111603.


Best trial: 0. Best value: 0.638358:   8%|▊         | 2/25 [23:49<4:48:46, 753.34s/it]

[I 2025-06-13 19:38:24,231] Trial 1 finished with value: 0.6352057579154752 and parameters: {'n_estimators': 159, 'max_depth': 15, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.638358266111603.


Best trial: 2. Best value: 0.639373:  12%|█▏        | 3/25 [48:41<6:39:53, 1090.61s/it]

[I 2025-06-13 20:03:16,200] Trial 2 finished with value: 0.6393734479503199 and parameters: {'n_estimators': 192, 'max_depth': 50, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  16%|█▌        | 4/25 [1:08:13<6:32:58, 1122.78s/it]

[I 2025-06-13 20:22:48,279] Trial 3 finished with value: 0.6388609648733419 and parameters: {'n_estimators': 152, 'max_depth': 48, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  20%|██        | 5/25 [1:23:27<5:49:11, 1047.59s/it]

[I 2025-06-13 20:38:02,560] Trial 4 finished with value: 0.6381387508734085 and parameters: {'n_estimators': 145, 'max_depth': 17, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  24%|██▍       | 6/25 [1:41:51<5:37:46, 1066.67s/it]

[I 2025-06-13 20:56:26,277] Trial 5 finished with value: 0.637691089870599 and parameters: {'n_estimators': 142, 'max_depth': 46, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  28%|██▊       | 7/25 [1:49:04<4:17:49, 859.42s/it] 

[I 2025-06-13 21:03:39,014] Trial 6 finished with value: 0.6305989152120998 and parameters: {'n_estimators': 53, 'max_depth': 46, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  32%|███▏      | 8/25 [2:05:46<4:16:22, 904.83s/it]

[I 2025-06-13 21:20:21,058] Trial 7 finished with value: 0.6380358862231883 and parameters: {'n_estimators': 128, 'max_depth': 43, 'max_features': 'sqrt'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  36%|███▌      | 9/25 [2:10:39<3:10:15, 713.48s/it]

[I 2025-06-13 21:25:13,801] Trial 8 finished with value: 0.6224405421235991 and parameters: {'n_estimators': 169, 'max_depth': 11, 'max_features': 'log2'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  40%|████      | 10/25 [2:16:41<2:31:13, 604.92s/it]

[I 2025-06-13 21:31:15,648] Trial 9 finished with value: 0.6367364197147197 and parameters: {'n_estimators': 137, 'max_depth': 30, 'max_features': 'log2'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 2. Best value: 0.639373:  44%|████▍     | 11/25 [2:20:31<1:54:24, 490.32s/it]

[I 2025-06-13 21:35:06,108] Trial 10 finished with value: 0.6325590457509592 and parameters: {'n_estimators': 86, 'max_depth': 32, 'max_features': 'log2'}. Best is trial 2 with value: 0.6393734479503199.


Best trial: 11. Best value: 0.639885:  48%|████▊     | 12/25 [2:45:43<2:53:33, 801.07s/it]

[I 2025-06-13 22:00:17,928] Trial 11 finished with value: 0.6398848735216744 and parameters: {'n_estimators': 197, 'max_depth': 50, 'max_features': 'sqrt'}. Best is trial 11 with value: 0.6398848735216744.


Best trial: 12. Best value: 0.640582:  52%|█████▏    | 13/25 [3:11:06<3:23:59, 1019.93s/it]

[I 2025-06-13 22:25:41,462] Trial 12 finished with value: 0.6405818846143012 and parameters: {'n_estimators': 200, 'max_depth': 36, 'max_features': 'sqrt'}. Best is trial 12 with value: 0.6405818846143012.


Best trial: 13. Best value: 0.641092:  56%|█████▌    | 14/25 [3:36:35<3:35:09, 1173.62s/it]

[I 2025-06-13 22:51:10,201] Trial 13 finished with value: 0.6410916915684092 and parameters: {'n_estimators': 200, 'max_depth': 35, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 13. Best value: 0.641092:  60%|██████    | 15/25 [3:48:54<2:53:46, 1042.69s/it]

[I 2025-06-13 23:03:29,464] Trial 14 finished with value: 0.6364050136310268 and parameters: {'n_estimators': 98, 'max_depth': 32, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 13. Best value: 0.641092:  64%|██████▍   | 16/25 [4:09:29<2:45:04, 1100.48s/it]

[I 2025-06-13 23:24:04,156] Trial 15 finished with value: 0.6409674705479788 and parameters: {'n_estimators': 171, 'max_depth': 25, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 13. Best value: 0.641092:  68%|██████▊   | 17/25 [4:29:57<2:31:50, 1138.79s/it]

[I 2025-06-13 23:44:32,027] Trial 16 finished with value: 0.6409003757031632 and parameters: {'n_estimators': 173, 'max_depth': 24, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 13. Best value: 0.641092:  72%|███████▏  | 18/25 [4:43:51<2:02:11, 1047.30s/it]

[I 2025-06-13 23:58:26,369] Trial 17 finished with value: 0.6391961329218678 and parameters: {'n_estimators': 113, 'max_depth': 25, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 13. Best value: 0.641092:  76%|███████▌  | 19/25 [4:51:19<1:26:43, 867.25s/it] 

[I 2025-06-14 00:05:54,173] Trial 18 finished with value: 0.638689812868136 and parameters: {'n_estimators': 177, 'max_depth': 24, 'max_features': 'log2'}. Best is trial 13 with value: 0.6410916915684092.


Best trial: 19. Best value: 0.641241:  80%|████████  | 20/25 [5:10:34<1:19:28, 953.63s/it]

[I 2025-06-14 00:25:09,118] Trial 19 finished with value: 0.641240913702536 and parameters: {'n_estimators': 165, 'max_depth': 20, 'max_features': 'sqrt'}. Best is trial 19 with value: 0.641240913702536.


Best trial: 19. Best value: 0.641241:  84%|████████▍ | 21/25 [5:30:57<1:08:58, 1034.55s/it]

[I 2025-06-14 00:45:32,354] Trial 20 finished with value: 0.6409688555696174 and parameters: {'n_estimators': 184, 'max_depth': 19, 'max_features': 'sqrt'}. Best is trial 19 with value: 0.641240913702536.


Best trial: 19. Best value: 0.641241:  88%|████████▊ | 22/25 [5:49:56<53:17, 1065.72s/it]  

[I 2025-06-14 01:04:30,753] Trial 21 finished with value: 0.6387443281844554 and parameters: {'n_estimators': 184, 'max_depth': 17, 'max_features': 'sqrt'}. Best is trial 19 with value: 0.641240913702536.


Best trial: 22. Best value: 0.641507:  92%|█████████▏| 23/25 [6:08:08<35:47, 1073.81s/it]

[I 2025-06-14 01:22:43,424] Trial 22 finished with value: 0.6415073188742563 and parameters: {'n_estimators': 163, 'max_depth': 20, 'max_features': 'sqrt'}. Best is trial 22 with value: 0.6415073188742563.


Best trial: 23. Best value: 0.64154:  96%|█████████▌| 24/25 [6:26:28<18:01, 1081.52s/it] 

[I 2025-06-14 01:41:02,923] Trial 23 finished with value: 0.6415404721023737 and parameters: {'n_estimators': 161, 'max_depth': 21, 'max_features': 'sqrt'}. Best is trial 23 with value: 0.6415404721023737.


Best trial: 23. Best value: 0.64154: 100%|██████████| 25/25 [6:44:49<00:00, 971.60s/it] 


[I 2025-06-14 01:59:24,614] Trial 24 finished with value: 0.6413229879297145 and parameters: {'n_estimators': 158, 'max_depth': 21, 'max_features': 'sqrt'}. Best is trial 23 with value: 0.6415404721023737.

✅ Best Params: {'n_estimators': 161, 'max_depth': 21, 'max_features': 'sqrt'}
⭐ Best F1 Score: 0.6415


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:  6.0min
[Parallel(n_jobs=7)]: Done 161 out of 161 | elapsed: 24.2min finished


💾 Saved as: forw_final_randomforest_optuna_model.joblib


In [None]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import make_scorer, f1_score
import numpy as np
import psutil
import os

# ==== System Info ====
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)
print(f"🧠 CPU Cores: {NUM_CORES}")
print(f"💾 Available RAM: {RAM_GB:.2f} GB")

# ==== Subsample the data ====
X_small, _, y_small, _ = train_test_split(X, y, train_size=50000, stratify=y, random_state=42)

# ==== Define objective ====
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 100)
    max_depth = trial.suggest_int("max_depth", 10, 30)
    max_features = trial.suggest_categorical("max_features", ["sqrt", "log2"])

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        max_features=max_features,
        n_jobs=1,  # No parallel inside trial
        random_state=42,
    )

    cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
    score = cross_val_score(
        clf, X_small, y_small, 
        scoring=make_scorer(f1_score, average="weighted"),
        cv=cv,
        n_jobs=1  # keep memory usage low
    ).mean()
    
    return score

# ==== Run study ====
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25)

# ==== Results ====
print(f"✅ Best Trial: {study.best_trial.number}")
print(f"🎯 Best Params: {study.best_trial.params}")
print(f"⭐ Best Weighted F1 Score: {study.best_trial.value:.4f}")

  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-13 18:32:10,020] A new study created in memory with name: no-name-8e10c207-d798-4db4-b189-9c41b483333a


🧠 CPU Cores: 8
💾 Available RAM: 21.45 GB


  0%|          | 0/25 [00:00<?, ?it/s]

: 

In [8]:
import gc
import psutil
import os

# Run garbage collection
gc.collect()

# Print available RAM after cleanup
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)
print(f"🧹 RAM cleared. Available RAM: {RAM_GB:.2f} GB")

🧹 RAM cleared. Available RAM: 21.66 GB


In [7]:
import gc
import sys
import os
import psutil

# Clean up everything
def clear_all():
    globals_ = list(globals().keys())
    for name in globals_:
        if not name.startswith("_") and name not in ['clear_all', 'os', 'gc', 'sys', 'psutil']:
            del globals()[name]
    gc.collect()

clear_all()
print("✅ Cleared all variables, calling garbage collector...")

# Extra: kill hanging background processes (be very careful)
def kill_children():
    current = psutil.Process()
    children = current.children(recursive=True)
    for child in children:
        try:
            child.kill()
        except Exception as e:
            print(f"⚠️ Couldn't kill {child}: {e}")
    print("🧨 Killed all child processes.")

kill_children()
gc.collect()

✅ Cleared all variables, calling garbage collector...
🧨 Killed all child processes.


0

Testing Model Performance

In [13]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import psutil
import os
import joblib
import gc
from tqdm import tqdm

# === System-aware config ===
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)

print(f"🧠 Using {NUM_CORES} CPU cores")
print(f"🗂️  Available RAM: {RAM_GB:.2f} GB")

# === Assume df_combined_shuffled is loaded ===
df = df_combined_shuffled

# === Convert to arrays ===
X = np.stack(df["embedding"].values).astype(np.float32)
y = df["label"].to_numpy(dtype=np.uint8)

del df
gc.collect()

# === Split data ===
split_idx = int(0.8 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

del X, y
gc.collect()

🧠 Using 8 CPU cores
🗂️  Available RAM: 27.52 GB


0

In [14]:
import joblib
import numpy as np
from sklearn.metrics import classification_report

# ─── Load the model ───
model_path = "/home/azureuser/dna_sequencing/model_training/forw_final_randomforest_optuna_model.joblib"
model = joblib.load(model_path)

# # ─── Load your test data ───
# # Replace with actual loading if stored in .npy or other format
# X_test = np.load("X_test_forward.npy")  # Embeddings for forward test sequences
# y_test = np.load("y_test_forward.npy")  # Ground truth labels

# ─── Predict and evaluate ───
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, digits=2)

print("Classification Report:\n")
print(report)

[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    1.1s
[Parallel(n_jobs=7)]: Done 161 out of 161 | elapsed:    4.8s finished


Classification Report:

              precision    recall  f1-score   support

           0       0.97      0.98      0.97    110150
           1       0.98      0.97      0.98    120576

    accuracy                           0.98    230726
   macro avg       0.98      0.98      0.98    230726
weighted avg       0.98      0.98      0.98    230726



In [None]:
import joblib
import numpy as np
from sklearn.metrics import classification_report

# ─── Load the model ───
model_path = "/home/azureuser/dna_sequencing/model_training/random_forest_dnabert_model.joblib"
model = joblib.load(model_path)

# # ─── Load your test data ───
# # Replace with actual loading if stored in .npy or other format
# X_test = np.load("X_test_forward.npy")  # Embeddings for forward test sequences
# y_test = np.load("y_test_forward.npy")  # Ground truth labels

# ─── Predict and evaluate ───
# ─── Predict and evaluate ───
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred, digits=2)

print("Classification Report:\n")
print(report)

Classification Report:

              precision    recall  f1-score   support

           0       0.63      0.59      0.61    110150
           1       0.65      0.68      0.67    120576

    accuracy                           0.64    230726
   macro avg       0.64      0.64      0.64    230726
weighted avg       0.64      0.64      0.64    230726



: 