#### Checking Forward Predictions: Fine-Tuned Model

In [3]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# Pipeline constants -- keep these identical to the values used when the
# training embeddings were produced.
MODEL_NAME = "zhihan1996/DNA_bert_6"
KMER = 6
MAX_LENGTH = 512  # Ensure same as your training

# Prefer the GPU when one is visible; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the tokenizer and encoder once so every later call reuses them.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = model.to(device)
model.eval()  # evaluation mode for inference

def embed_single_sequence(sequence: str) -> np.ndarray:
    """
    Embeds a single DNA sequence using the same pipeline as training.

    The sequence is upper-cased, every "N" is removed, and the remainder is
    split into overlapping k-mers (stride 1, width ``KMER``) joined by single
    spaces. The k-mer string is tokenized (padded/truncated to ``MAX_LENGTH``)
    and passed through the module-level ``model`` without gradient tracking.

    Args:
        sequence: Raw nucleotide string.

    Returns:
        A (768,) numpy array embedding from the [CLS] token.

    Raises:
        ValueError: If fewer than ``KMER`` bases remain after removing "N",
            i.e. not even one k-mer can be formed.
        RuntimeError: If the global ``model`` is no longer a torch module
            (other cells in this notebook rebind ``model`` to a joblib-loaded
            estimator).
    """
    # Apply K-mer transformation.
    # NOTE: removing "N" joins the bases on either side of it, so k-mers can
    # span the gap; this is kept as-is so inference mirrors training.
    cleaned = sequence.upper().replace("N", "")
    if len(cleaned) < KMER:
        # Without this guard the tokenizer receives an empty string and the
        # function would return the embedding of an input with no k-mers.
        raise ValueError(
            f"Sequence has {len(cleaned)} usable bases after removing 'N'; "
            f"at least {KMER} are required to form one k-mer."
        )
    kmers = ' '.join(cleaned[i:i + KMER] for i in range(len(cleaned) - KMER + 1))

    # Fail with a clear message if a later cell has overwritten `model`.
    if not isinstance(model, torch.nn.Module):
        raise RuntimeError(
            "Global `model` is not a torch module (got "
            f"{type(model).__name__}); re-run the DNABERT setup cell before "
            "calling embed_single_sequence."
        )

    # Tokenize and convert to tensor
    inputs = tokenizer(kmers, return_tensors='pt', padding='max_length', truncation=True, max_length=MAX_LENGTH)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    # Inference
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        embedding = outputs.last_hidden_state[:, 0, :]  # [CLS] token

    return embedding.squeeze(0).cpu().numpy()  # shape: (768,)


Some weights of the model checkpoint at zhihan1996/DNA_bert_6 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Restore the saved forward random-forest classifier from disk.
import joblib

rf_model = joblib.load(
    "/home/azureuser/dna_sequencing/model_training/forw_final_randomforest_optuna_model.joblib"
)

In [15]:
import pandas as pd
import random
import glob

# Path to the parquet file (adjust if needed)
parquet_path = "/home/azureuser/dna_sequencing/clean_forward_noncan/clean_reads_batch_65.parquet"

# Zero-indexed row used as the spot-check example (2530 -> 2531st row).
SAMPLE_ROW = 2530

# Load the file
df = pd.read_parquet(parquet_path)

# Stop here when the row does not exist. Merely printing a warning would let
# the code below hit a NameError, or silently reuse an `example_seq` left
# over from an earlier run of this cell.
if len(df) <= SAMPLE_ROW:
    raise IndexError(f"⚠️ File has only {len(df)} rows. Index {SAMPLE_ROW} is out of range.")

example_seq = df.iloc[SAMPLE_ROW]['sequence']
print(f"🧬 Sequence (Index {SAMPLE_ROW}):\n{example_seq}")
print(f"🔢 Length of sequence: {len(example_seq)}")

print(f"🧬 Testing Sequence: {example_seq}")

# Embed and predict (the classifier expects a 2-D input: one row per sample)
embedding = embed_single_sequence(example_seq)
prediction = rf_model.predict([embedding])[0]
print(f"🔬 Prediction: {'Cancerous' if prediction == 1 else 'Non-Cancerous'}")

🧬 Sequence (Index 2530):
TAATAAGTTAAATGTTTTGTAGTTTAAGAAATTAATTAAAATCTTAACATTGTTTTGTTTCTTAGTTATTTTGTTGGGATGTGTGGTGATGGCGCAAATG
🔢 Length of sequence: 100
🧬 Testing Sequence: TAATAAGTTAAATGTTTTGTAGTTTAAGAAATTAATTAAAATCTTAACATTGTTTTGTTTCTTAGTTATTTTGTTGGGATGTGTGGTGATGGCGCAAATG
🔬 Prediction: Non-Cancerous


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 161 out of 161 | elapsed:    0.0s finished


In [16]:
# Embed the test sequence
embedding = embed_single_sequence(example_seq)

# Run the forest once and derive both the class and its probability from the
# same output. Columns of predict_proba follow rf_model.classes_, so the
# winning column index is mapped back to a label through classes_ instead of
# using the label itself as a column index.
# NOTE(review): for a random forest this matches rf_model.predict(); confirm
# if rf_model is ever replaced by a different estimator type.
pred_proba = rf_model.predict_proba([embedding])[0]  # probabilities per class
best_col = int(pred_proba.argmax())
pred_class = rf_model.classes_[best_col]

# Print results
# class_names is indexed by label value: 0 -> Non-Cancerous, 1 -> Cancerous.
class_names = ["Non-Cancerous", "Cancerous"]
predicted_label = class_names[int(pred_class)]
confidence = pred_proba[best_col] * 100

print(f"🧬 Input Sequence: {example_seq}")
print(f"🔬 Predicted Class: {predicted_label}")
print(f"📈 Confidence: {confidence:.2f}%")

🧬 Input Sequence: TAATAAGTTAAATGTTTTGTAGTTTAAGAAATTAATTAAAATCTTAACATTGTTTTGTTTCTTAGTTATTTTGTTGGGATGTGTGGTGATGGCGCAAATG
🔬 Predicted Class: Non-Cancerous
📈 Confidence: 81.56%


[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 161 out of 161 | elapsed:    0.0s finished
[Parallel(n_jobs=7)]: Using backend ThreadingBackend with 7 concurrent workers.
[Parallel(n_jobs=7)]: Done  36 tasks      | elapsed:    0.0s
[Parallel(n_jobs=7)]: Done 161 out of 161 | elapsed:    0.0s finished


#### Checking Backward

In [6]:
import numpy as np
import pandas as pd

# === Load embeddings and IDs ===

# Cancerous
emb_cancer = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_backw_can.npy")
ids_cancer = pd.read_csv("/home/azureuser/dna_sequencing/model_training/backw_can_embeddings_ids.csv")["id"]

# Non-cancerous
emb_noncan = np.load("/home/azureuser/dna_sequencing/model_training/embeddings_backw_noncan.npy")
ids_noncan = pd.read_csv("/home/azureuser/dna_sequencing/model_training/backw_noncan_embeddings_ids.csv")["id"]

# Labels are sized from the ID files while the feature rows come from the
# .npy files, so each pair must agree row-for-row. Check per class: offsetting
# mismatches would otherwise attach labels to the wrong embeddings unnoticed.
if len(emb_cancer) != len(ids_cancer):
    raise ValueError(
        f"Cancerous: {len(emb_cancer)} embeddings but {len(ids_cancer)} ids."
    )
if len(emb_noncan) != len(ids_noncan):
    raise ValueError(
        f"Non-cancerous: {len(emb_noncan)} embeddings but {len(ids_noncan)} ids."
    )

labels_cancer = np.ones(len(ids_cancer), dtype=int)  # label 1
labels_noncan = np.zeros(len(ids_noncan), dtype=int)  # label 0

# === Combine all ===
# Stack embeddings (cancerous rows first, then non-cancerous)
X = np.vstack([emb_cancer, emb_noncan])

# Combine IDs and labels in the same order as the stacked embeddings
all_ids = pd.concat([ids_cancer, ids_noncan], ignore_index=True)
all_labels = np.concatenate([labels_cancer, labels_noncan])

# === Final DataFrame ===
df_combined = pd.DataFrame({
    "id": all_ids,
    "label": all_labels,
    "embedding": list(X)  # one embedding vector per row
})

# ✅ Preview
print(df_combined.head())
print(df_combined.shape)


# Shuffle df (fixed seed so the later positional train/test split is repeatable)
df_combined_shuffled = df_combined.sample(frac=1, random_state=42).reset_index(drop=True)
df_combined_shuffled.head()

              id  label                                          embedding
0   SRR5177930.6      1  [-0.472982, 0.902241, -0.68760836, -1.0110122,...
1   SRR5177930.9      1  [-0.14832692, 0.1271801, -0.08398379, -1.39160...
2  SRR5177930.11      1  [0.12253284, 1.1420611, 1.2350746, -1.3126705,...
3  SRR5177930.16      1  [-0.28678215, 3.929915, 0.13052358, -0.0365948...
4  SRR5177930.20      1  [-0.93153316, 0.63785285, -0.8242705, -0.75851...
(1151263, 3)


Unnamed: 0,id,label,embedding
0,SRR6269879.1807337,0,"[-1.0272862, -0.36884394, -1.2709365, -0.75294..."
1,SRR5177930.18806323,1,"[-0.26374084, 1.7980592, 1.2168257, -0.0639829..."
2,SRR5177930.24104586,1,"[-0.43246013, 0.097811565, -0.26134557, 0.1590..."
3,SRR5177930.52104121,1,"[0.20100069, 0.0065315273, 0.087734945, -0.161..."
4,SRR6269879.48804038,0,"[-1.1723969, 0.7453982, 0.5968736, -2.174962, ..."


In [7]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import psutil
import os
import joblib
import gc
from tqdm import tqdm

# === Report the machine's resources ===
NUM_CORES = os.cpu_count()
RAM_GB = psutil.virtual_memory().available / 2 ** 30  # bytes -> GiB

print(f"🧠 Using {NUM_CORES} CPU cores")
print(f"🗂️  Available RAM: {RAM_GB:.2f} GB")

# === Source frame: the shuffled id/label/embedding table built earlier ===
df = df_combined_shuffled

# === Dense feature matrix and label vector ===
X = np.stack(df["embedding"].to_numpy()).astype(np.float32)
y = df["label"].to_numpy(dtype=np.uint8)

# Drop the alias before splitting.
del df
gc.collect()

# === Positional 80/20 split (rows were shuffled upstream) ===
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

# Only the split arrays are needed from here on.
del X, y
gc.collect()

🧠 Using 8 CPU cores
🗂️  Available RAM: 18.10 GB


0

In [19]:
import joblib
import numpy as np
from sklearn.metrics import classification_report

# ─── Restore the first backward random-forest checkpoint ───
# NOTE: this rebinds the global `model`, which embed_single_sequence also
# reads; re-run the DNABERT setup cell before embedding sequences again.
model_path = "/home/azureuser/dna_sequencing/model_training/backw_random_forest_dnabert_model.joblib"
model = joblib.load(model_path)

# X_test / y_test are not loaded here: they must already exist in the kernel
# from the train/test split cell.

# ─── Score the hold-out split ───
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, digits=2)

print("Classification Report:\n", report, sep="\n")

Classification Report:

              precision    recall  f1-score   support

           0       0.81      0.85      0.83    109731
           1       0.86      0.82      0.84    120522

    accuracy                           0.84    230253
   macro avg       0.84      0.84      0.84    230253
weighted avg       0.84      0.84      0.84    230253



In [8]:
import joblib
import numpy as np
from sklearn.metrics import classification_report

# ─── Restore the backward random forest saved as the Optuna "final" model ───
# NOTE: this rebinds the global `model`, which embed_single_sequence also
# reads; re-run the DNABERT setup cell before embedding sequences again.
model_path = "/home/azureuser/dna_sequencing/model_training/backw_final_randomforest_optuna_model.joblib"
model = joblib.load(model_path)

# X_test / y_test are not loaded here: they must already exist in the kernel
# from the train/test split cell.

# ─── Score the hold-out split ───
y_pred = model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, digits=2)

print("Classification Report:\n", report, sep="\n")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 177 out of 177 | elapsed:    3.8s finished


Classification Report:

              precision    recall  f1-score   support

           0       0.96      0.96      0.96    109731
           1       0.96      0.96      0.96    120522

    accuracy                           0.96    230253
   macro avg       0.96      0.96      0.96    230253
weighted avg       0.96      0.96      0.96    230253



In [5]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels. Uses whichever y_pred
# was produced by the most recently executed evaluation cell.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[105265   4466]
 [  4351 116171]]
