# make a fresh dataframe with id, embedding, class

#### Importing cancerous parquet file and adding 1 as the class label to all of them

In [1]:
import pandas as pd

In [2]:
import pandas as pd
import numpy as np

# Load DataFrame
df = pd.read_parquet("/home/azureuser/dna_sequencing/Anushka/sampled_embeddings/embeddings_backw_can.parquet")

# Prepare data
embedding_cols = [col for col in df.columns if col.startswith("emb_")]
embeddings = df[embedding_cols].values.astype(np.float32)  # (n_samples, 768)
ids = df["id"].values  # optional: save other columns too

In [3]:
# Save embeddings as .npy (very fast and efficient)
np.save("/home/azureuser/dna_sequencing/model_training/embeddings_backw_can.npy", embeddings)

# Save ids to CSV
pd.DataFrame({"id": ids}).to_csv("/home/azureuser/dna_sequencing/model_training/backw_can_embeddings_ids.csv", index=False)

In [4]:
import pandas as pd
import numpy as np

# Load DataFrame
df = pd.read_parquet("/home/azureuser/dna_sequencing/Anushka/sampled_embeddings/embeddings_backw_noncan.parquet")

# Prepare data
embedding_cols = [col for col in df.columns if col.startswith("emb_")]
embeddings = df[embedding_cols].values.astype(np.float32)  # (n_samples, 768)
ids = df["id"].values  # optional: save other columns too

In [5]:
# Save embeddings as .npy (very fast and efficient)
np.save("/home/azureuser/dna_sequencing/model_training/embeddings_backw_noncan.npy", embeddings)

# Save ids to CSV
pd.DataFrame({"id": ids}).to_csv("/home/azureuser/dna_sequencing/model_training/backw_noncan_embeddings_ids.csv", index=False)

In [1]:
import numpy as np
import pandas as pd

# === Load embeddings and IDs ===

# Cancerous
emb_cancer = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_backw_can.npy")
ids_cancer = pd.read_csv("/home/azureuser/dna_sequencing/model_training/backw_can_embeddings_ids.csv")["id"]
labels_cancer = np.ones(len(ids_cancer), dtype=int)  # label 1

# Non-cancerous
emb_noncan = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_backw_noncan.npy")
ids_noncan = pd.read_csv("/home/azureuser/dna_sequencing/model_training/backw_noncan_embeddings_ids.csv")["id"]
labels_noncan = np.zeros(len(ids_noncan), dtype=int)  # label 0

# === Combine all ===
# Stack embeddings
X = np.vstack([emb_cancer, emb_noncan])

# Combine IDs and labels
all_ids = pd.concat([ids_cancer, ids_noncan], ignore_index=True)
all_labels = np.concatenate([labels_cancer, labels_noncan])

# === Final DataFrame ===
df_combined = pd.DataFrame({
    "id": all_ids,
    "label": all_labels,
    "embedding": list(X)  # list of 768-d vectors per row
})

# ✅ Preview
print(df_combined.head())
print(df_combined.shape)

              id  label                                          embedding
0   SRR5177930.6      1  [-0.472982, 0.902241, -0.68760836, -1.0110122,...
1   SRR5177930.9      1  [-0.14832692, 0.1271801, -0.08398379, -1.39160...
2  SRR5177930.11      1  [0.12253284, 1.1420611, 1.2350746, -1.3126705,...
3  SRR5177930.16      1  [-0.28678215, 3.929915, 0.13052358, -0.0365948...
4  SRR5177930.20      1  [-0.93153316, 0.63785285, -0.8242705, -0.75851...
(1151263, 3)


In [2]:
# Shuffle the rows of df_combined randomly
df_combined_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined_shuffled.head()

Unnamed: 0,id,label,embedding
0,SRR6269879.1807337,0,"[-1.0272862, -0.36884394, -1.2709365, -0.75294..."
1,SRR5177930.18806323,1,"[-0.26374084, 1.7980592, 1.2168257, -0.0639829..."
2,SRR5177930.24104586,1,"[-0.43246013, 0.097811565, -0.26134557, 0.1590..."
3,SRR5177930.52104121,1,"[0.20100069, 0.0065315273, 0.087734945, -0.161..."
4,SRR6269879.48804038,0,"[-1.1723969, 0.7453982, 0.5968736, -2.174962, ..."


In [9]:
df_combined_shuffled['label'].value_counts()

label
1    603058
0    548205
Name: count, dtype: int64

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import psutil
import os
import joblib
import gc
from tqdm import tqdm

# === System-aware config ===
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)

print(f"🧠 Using {NUM_CORES} CPU cores")
print(f"🗂️  Available RAM: {RAM_GB:.2f} GB")

# === Assume df_combined_shuffled is loaded ===
df = df_combined_shuffled

# === Convert to arrays ===
X = np.stack(df["embedding"].values).astype(np.float32)
y = df["label"].to_numpy(dtype=np.uint8)

del df
gc.collect()

# === Split data ===
split_idx = int(0.8 * len(X))
X_train, y_train = X[:split_idx], y[:split_idx]
X_test, y_test = X[split_idx:], y[split_idx:]

del X, y
gc.collect()

# === Adaptive hyperparams ===
if RAM_GB > 16:
    total_trees = 200
    max_depth = None
elif RAM_GB > 8:
    total_trees = 100
    max_depth = 30
else:
    total_trees = 50
    max_depth = 20

# === Progressive training with warm_start ===
clf = RandomForestClassifier(
    n_estimators=1,
    warm_start=True,
    max_depth=max_depth,
    n_jobs=max(1, NUM_CORES - 1),
    random_state=42
)

print("🚀 Training RandomForest with progress bar:")
for i in tqdm(range(1, total_trees + 1), desc="🌲 Trees trained"):
    clf.set_params(n_estimators=i)
    clf.fit(X_train, y_train)

# === Save model ===
MODEL_PATH = "/home/azureuser/dna_sequencing/model_training/backw_random_forest_dnabert_model.joblib"
joblib.dump(clf, MODEL_PATH)
print(f"💾 Model saved to: {MODEL_PATH}")

# === Evaluate ===
y_pred = clf.predict(X_test)
print("\n📊 Classification Report:")
print(classification_report(y_test, y_pred))

# === Cleanup ===
del X_train, y_train, X_test, y_test
gc.collect()

🧠 Using 8 CPU cores
🗂️  Available RAM: 24.67 GB
🚀 Training RandomForest with progress bar:


🌲 Trees trained: 100%|██████████| 200/200 [3:33:50<00:00, 64.15s/it]  


💾 Model saved to: /home/azureuser/dna_sequencing/model_training/backw_random_forest_dnabert_model.joblib

📊 Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.85      0.83    109731
           1       0.86      0.82      0.84    120522

    accuracy                           0.84    230253
   macro avg       0.84      0.84      0.84    230253
weighted avg       0.84      0.84      0.84    230253



52

: 

### With hyperparameter tuning

In [3]:
import numpy as np
import pandas as pd
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import psutil
import os
import gc
import joblib

# === System-aware configuration ===
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / (1024 ** 3)
print(f"🧠 CPU Cores: {NUM_CORES}")
print(f"💾 Available RAM: {RAM_GB:.2f} GB")

# === Load and Prepare Data ===
df = df_combined_shuffled.copy()
X = np.stack(df["embedding"].values).astype(np.float32)
y = df["label"].values.astype(np.uint8)
del df
gc.collect()

# === Train/Test Split ===
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Define Objective Function ===
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 200),
        "max_depth": trial.suggest_int("max_depth", 10, 50),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "n_jobs": NUM_CORES - 1,
        "random_state": 42,
        "verbose": 0
    }
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    return f1_score(y_valid, y_pred, average="weighted")

# === Run Optuna Study ===
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=25, show_progress_bar=True)

# === Output Best Params ===
print(f"\n✅ Best Params: {study.best_params}")
print(f"⭐ Best F1 Score: {study.best_value:.4f}")

# === Train Final Model ===
best_model = RandomForestClassifier(
    **study.best_params,
    n_jobs=NUM_CORES - 1,
    random_state=42,
    verbose=1
)
best_model.fit(X, y)

# === Save Model ===
joblib.dump(best_model, "/home/azureuser/dna_sequencing/model_training/backw_final_randomforest_optuna_model.joblib")
print("💾 Saved as: backw_final_randomforest_optuna_model.joblib")

  from .autonotebook import tqdm as notebook_tqdm


🧠 CPU Cores: 8
💾 Available RAM: 9.38 GB


[I 2025-06-15 17:32:59,442] A new study created in memory with name: no-name-e0af7629-9e36-4745-8aa0-a1a8e9771f62
Best trial: 0. Best value: 0.833064:   4%|▍         | 1/25 [25:46<10:18:44, 1546.84s/it]

[I 2025-06-15 17:58:46,285] Trial 0 finished with value: 0.8330642638846015 and parameters: {'n_estimators': 137, 'max_depth': 50, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:   8%|▊         | 2/25 [30:13<5:04:11, 793.53s/it]  

[I 2025-06-15 18:03:12,491] Trial 1 finished with value: 0.8267543307611193 and parameters: {'n_estimators': 77, 'max_depth': 49, 'max_features': 'log2'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  12%|█▏        | 3/25 [59:17<7:30:05, 1227.53s/it]

[I 2025-06-15 18:32:16,485] Trial 2 finished with value: 0.8326223450809356 and parameters: {'n_estimators': 184, 'max_depth': 26, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  16%|█▌        | 4/25 [1:05:17<5:09:51, 885.33s/it] 

[I 2025-06-15 18:38:17,223] Trial 3 finished with value: 0.8279378396315805 and parameters: {'n_estimators': 104, 'max_depth': 48, 'max_features': 'log2'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  20%|██        | 5/25 [1:19:49<4:53:30, 880.50s/it]

[I 2025-06-15 18:52:49,160] Trial 4 finished with value: 0.810863772488977 and parameters: {'n_estimators': 129, 'max_depth': 13, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  24%|██▍       | 6/25 [1:30:39<4:14:00, 802.13s/it]

[I 2025-06-15 19:03:39,161] Trial 5 finished with value: 0.802145347602262 and parameters: {'n_estimators': 104, 'max_depth': 11, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  28%|██▊       | 7/25 [1:33:34<2:59:06, 597.00s/it]

[I 2025-06-15 19:06:33,843] Trial 6 finished with value: 0.8112663973481642 and parameters: {'n_estimators': 70, 'max_depth': 15, 'max_features': 'log2'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  32%|███▏      | 8/25 [1:45:46<3:01:20, 640.01s/it]

[I 2025-06-15 19:18:45,953] Trial 7 finished with value: 0.8258774775435674 and parameters: {'n_estimators': 86, 'max_depth': 19, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  36%|███▌      | 9/25 [2:02:45<3:22:15, 758.46s/it]

[I 2025-06-15 19:35:44,861] Trial 8 finished with value: 0.8024117059212099 and parameters: {'n_estimators': 167, 'max_depth': 11, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 0. Best value: 0.833064:  40%|████      | 10/25 [2:08:39<2:38:22, 633.52s/it]

[I 2025-06-15 19:41:38,601] Trial 9 finished with value: 0.8283756080006696 and parameters: {'n_estimators': 100, 'max_depth': 39, 'max_features': 'log2'}. Best is trial 0 with value: 0.8330642638846015.


Best trial: 10. Best value: 0.833129:  44%|████▍     | 11/25 [2:33:09<3:27:35, 889.71s/it]

[I 2025-06-15 20:06:09,199] Trial 10 finished with value: 0.8331288601783071 and parameters: {'n_estimators': 140, 'max_depth': 37, 'max_features': 'sqrt'}. Best is trial 10 with value: 0.8331288601783071.


Best trial: 11. Best value: 0.833307:  48%|████▊     | 12/25 [3:00:18<4:01:27, 1114.41s/it]

[I 2025-06-15 20:33:17,560] Trial 11 finished with value: 0.8333070863577194 and parameters: {'n_estimators': 151, 'max_depth': 39, 'max_features': 'sqrt'}. Best is trial 11 with value: 0.8333070863577194.


Best trial: 11. Best value: 0.833307:  52%|█████▏    | 13/25 [3:26:43<4:11:24, 1257.06s/it]

[I 2025-06-15 20:59:42,862] Trial 12 finished with value: 0.8329504353067942 and parameters: {'n_estimators': 151, 'max_depth': 35, 'max_features': 'sqrt'}. Best is trial 11 with value: 0.8333070863577194.


Best trial: 13. Best value: 0.834254:  56%|█████▌    | 14/25 [4:02:52<4:40:59, 1532.68s/it]

[I 2025-06-15 21:35:52,431] Trial 13 finished with value: 0.8342537396052184 and parameters: {'n_estimators': 200, 'max_depth': 41, 'max_features': 'sqrt'}. Best is trial 13 with value: 0.8342537396052184.


Best trial: 14. Best value: 0.83431:  60%|██████    | 15/25 [4:38:20<4:45:19, 1711.93s/it] 

[I 2025-06-15 22:11:19,752] Trial 14 finished with value: 0.8343099354596286 and parameters: {'n_estimators': 195, 'max_depth': 42, 'max_features': 'sqrt'}. Best is trial 14 with value: 0.8343099354596286.


Best trial: 15. Best value: 0.834319:  64%|██████▍   | 16/25 [5:13:46<4:35:28, 1836.53s/it]

[I 2025-06-15 22:46:45,649] Trial 15 finished with value: 0.8343186412921124 and parameters: {'n_estimators': 193, 'max_depth': 43, 'max_features': 'sqrt'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 15. Best value: 0.834319:  68%|██████▊   | 17/25 [5:46:50<4:10:48, 1881.09s/it]

[I 2025-06-15 23:19:50,367] Trial 16 finished with value: 0.8333721559511822 and parameters: {'n_estimators': 199, 'max_depth': 30, 'max_features': 'sqrt'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 15. Best value: 0.834319:  72%|███████▏  | 18/25 [6:18:49<3:40:45, 1892.26s/it]

[I 2025-06-15 23:51:48,638] Trial 17 finished with value: 0.8336942397899472 and parameters: {'n_estimators': 175, 'max_depth': 44, 'max_features': 'sqrt'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 15. Best value: 0.834319:  76%|███████▌  | 19/25 [6:28:58<2:30:40, 1506.80s/it]

[I 2025-06-16 00:01:57,503] Trial 18 finished with value: 0.8291934870808564 and parameters: {'n_estimators': 185, 'max_depth': 32, 'max_features': 'log2'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 15. Best value: 0.834319:  80%|████████  | 20/25 [6:59:48<2:14:09, 1609.96s/it]

[I 2025-06-16 00:32:47,871] Trial 19 finished with value: 0.8339714803619228 and parameters: {'n_estimators': 167, 'max_depth': 45, 'max_features': 'sqrt'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 15. Best value: 0.834319:  84%|████████▍ | 21/25 [7:08:19<1:25:20, 1280.10s/it]

[I 2025-06-16 00:41:18,906] Trial 20 finished with value: 0.8277500294351772 and parameters: {'n_estimators': 53, 'max_depth': 25, 'max_features': 'sqrt'}. Best is trial 15 with value: 0.8343186412921124.


Best trial: 21. Best value: 0.834327:  88%|████████▊ | 22/25 [7:45:41<1:18:26, 1568.68s/it]

[I 2025-06-16 01:18:40,566] Trial 21 finished with value: 0.8343273387983957 and parameters: {'n_estimators': 197, 'max_depth': 43, 'max_features': 'sqrt'}. Best is trial 21 with value: 0.8343273387983957.


Best trial: 21. Best value: 0.834327:  92%|█████████▏| 23/25 [8:19:35<56:56, 1708.45s/it]  

[I 2025-06-16 01:52:35,041] Trial 22 finished with value: 0.834262668022684 and parameters: {'n_estimators': 186, 'max_depth': 43, 'max_features': 'sqrt'}. Best is trial 21 with value: 0.8343273387983957.


Best trial: 21. Best value: 0.834327:  96%|█████████▌| 24/25 [8:47:19<28:15, 1695.13s/it]

[I 2025-06-16 02:20:19,100] Trial 23 finished with value: 0.8328683259746621 and parameters: {'n_estimators': 161, 'max_depth': 35, 'max_features': 'sqrt'}. Best is trial 21 with value: 0.8343273387983957.


Best trial: 24. Best value: 0.834745: 100%|██████████| 25/25 [9:24:18<00:00, 1354.33s/it]


[I 2025-06-16 02:57:17,795] Trial 24 finished with value: 0.834744975121074 and parameters: {'n_estimators': 200, 'max_depth': 46, 'max_features': 'sqrt'}. Best is trial 24 with value: 0.834744975121074.

✅ Best Params: {'n_estimators': 200, 'max_depth': 46, 'max_features': 'sqrt'}
⭐ Best F1 Score: 0.8347


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed: 10.1min
[Parallel(n_jobs=7)]: Done 186 tasks      | elapsed: 47.3min
[Parallel(n_jobs=7)]: Done 200 out of 200 | elapsed: 50.7min finished


💾 Saved as: backw_final_randomforest_optuna_model.joblib


In [5]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# === Prepare Data ===
X = np.stack(df_combined_shuffled["embedding"].values).astype(np.float32)
y = df_combined_shuffled["label"].values.astype(np.uint8)

# === Train/Test Split ===
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# === Use Best Found Hyperparameters ===
best_params = {
    "n_estimators": 177,
    "max_depth": 39,
    "max_features": "sqrt",
    "n_jobs": -1,
    "random_state": 42,
    "verbose": 1
}

# === Train the Model ===
model = RandomForestClassifier(**best_params)
model.fit(X_train, y_train)

# === Predict and Report ===
y_pred = model.predict(X_valid)
print("\n📊 Classification Report (Validation Set):")
print(classification_report(y_valid, y_pred, digits=2))

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  5.6min
[Parallel(n_jobs=-1)]: Done 177 out of 177 | elapsed: 25.3min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 177 out of 177 | elapsed:    4.2s finished



📊 Classification Report (Validation Set):
              precision    recall  f1-score   support

           0       0.81      0.85      0.83    109641
           1       0.86      0.82      0.84    120612

    accuracy                           0.83    230253
   macro avg       0.83      0.83      0.83    230253
weighted avg       0.83      0.83      0.83    230253



In [6]:
import joblib

# === Save the Trained Model ===
joblib.dump(model, "/home/azureuser/dna_sequencing/model_training/backw_final_randomforest_optuna_model.joblib")
print("💾 Model saved as: backw_final_randomforest_optuna_model.joblib")

💾 Model saved as: backw_final_randomforest_optuna_model.joblib
