In [1]:
import numpy as np
import pandas as pd

# === Load embeddings and IDs ===
# Builds one shuffled DataFrame (id, label, embedding) from the cancerous and
# non-cancerous embedding exports. Label 1 = cancerous, label 0 = non-cancerous.

# Directory holding the embedding matrices (.npy) and their ID tables (.csv).
DATA_DIR = "/home/azureuser/dna_sequencing/model_training"
# Seed for the row shuffle, so re-running the cell yields the same order.
SEED = 42

# Cancerous
emb_cancer = np.load(f"{DATA_DIR}/embeddings_backw_can.npy")
ids_cancer = pd.read_csv(f"{DATA_DIR}/backw_can_embeddings_ids.csv")["id"]

# Non-cancerous
emb_noncan = np.load(f"{DATA_DIR}/embeddings_backw_noncan.npy")
ids_noncan = pd.read_csv(f"{DATA_DIR}/backw_noncan_embeddings_ids.csv")["id"]

# Labels below are sized from the ID tables, not from the embedding matrices.
# Validate each class separately: a surplus row in one class and a missing row
# in the other would leave the totals equal and silently shift labels/IDs
# against the stacked embeddings.
if len(emb_cancer) != len(ids_cancer):
    raise ValueError(
        f"Cancer embeddings/IDs are misaligned: {len(emb_cancer)} embedding rows "
        f"vs {len(ids_cancer)} ids"
    )
if len(emb_noncan) != len(ids_noncan):
    raise ValueError(
        f"Non-cancer embeddings/IDs are misaligned: {len(emb_noncan)} embedding rows "
        f"vs {len(ids_noncan)} ids"
    )

labels_cancer = np.ones(len(ids_cancer), dtype=int)   # label 1
labels_noncan = np.zeros(len(ids_noncan), dtype=int)  # label 0

# === Combine all ===
# Cancerous rows first, then non-cancerous; the same order is used for the
# embeddings, the IDs and the labels so that row i refers to the same sample
# in all three.
X = np.vstack([emb_cancer, emb_noncan])
all_ids = pd.concat([ids_cancer, ids_noncan], ignore_index=True)
all_labels = np.concatenate([labels_cancer, labels_noncan])

# === Final DataFrame ===
df_combined = pd.DataFrame({
    "id": all_ids,
    "label": all_labels,
    "embedding": list(X),  # one row of X (an embedding vector) per record
})

# Shuffle df: frac=1 draws every row once, i.e. a seeded permutation; the
# index is rebuilt so it runs 0..n-1 in the new order.
df_combined_shuffled = df_combined.sample(frac=1, random_state=SEED).reset_index(drop=True)

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score
import tensorflow as tf

# === Configuration ===
# Directory holding the embeddings, their ID tables and the saved model.
DATA_DIR = "/home/azureuser/dna_sequencing/model_training"
SEED = 42                 # used for both the row shuffle and the stratified split
VAL_FRACTION = 0.2        # share of rows held out as the validation set
DECISION_THRESHOLD = 0.5  # predicted probability above this => class 1

# === Load embeddings and labels ===
# Label 1 = cancerous, label 0 = non-cancerous.
emb_cancer = np.load(f"{DATA_DIR}/embeddings_backw_can.npy")
ids_cancer = pd.read_csv(f"{DATA_DIR}/backw_can_embeddings_ids.csv")["id"]

emb_noncan = np.load(f"{DATA_DIR}/embeddings_backw_noncan.npy")
ids_noncan = pd.read_csv(f"{DATA_DIR}/backw_noncan_embeddings_ids.csv")["id"]

# Labels are sized from the ID tables, so each table must match its embedding
# matrix row-for-row. Check per class: offsetting mismatches would keep the
# totals equal and silently mislabel rows around the class boundary.
if len(emb_cancer) != len(ids_cancer):
    raise ValueError(
        f"Cancer embeddings/IDs are misaligned: {len(emb_cancer)} embedding rows "
        f"vs {len(ids_cancer)} ids"
    )
if len(emb_noncan) != len(ids_noncan):
    raise ValueError(
        f"Non-cancer embeddings/IDs are misaligned: {len(emb_noncan)} embedding rows "
        f"vs {len(ids_noncan)} ids"
    )

labels_cancer = np.ones(len(ids_cancer), dtype=int)
labels_noncan = np.zeros(len(ids_noncan), dtype=int)

# copy=False avoids duplicating the stacked matrix when it is already float32;
# the resulting values and dtype are the same either way.
X = np.vstack([emb_cancer, emb_noncan]).astype(np.float32, copy=False)
y = np.concatenate([labels_cancer, labels_noncan])
ids = pd.concat([ids_cancer, ids_noncan], ignore_index=True)

# === Shuffle and Split ===
# The seeded shuffle followed by the seeded stratified split is kept exactly
# as-is: changing either step would select a different validation set.
df = pd.DataFrame({"id": ids, "label": y, "embedding": list(X)})
df_shuffled = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Re-assemble the per-row vectors into one 2-D matrix. np.stack already
# returns float32 here, so copy=False skips a second full-size copy.
X_all = np.stack(df_shuffled["embedding"].values).astype(np.float32, copy=False)
y_all = df_shuffled["label"].values.astype(np.uint8)

# Intended to reproduce the validation split used when the model was trained.
# NOTE(review): this only holds if training used the same shuffle seed,
# test_size, stratification and random_state — confirm against the training code.
_, X_val, _, y_val = train_test_split(
    X_all, y_all, test_size=VAL_FRACTION, stratify=y_all, random_state=SEED
)

# === Load best model ===
model = tf.keras.models.load_model(f"{DATA_DIR}/best_model.keras")

# === Predict and Evaluate ===
# ravel() flattens the prediction array to 1-D.
# NOTE(review): assumes the model emits a single probability per sample — confirm.
y_pred_probs = model.predict(X_val).ravel()
y_pred = (y_pred_probs > DECISION_THRESHOLD).astype(int)

print("📊 Classification Report:")
print(classification_report(y_val, y_pred))
# AUC is computed from the raw probabilities, not the thresholded labels.
print(f"📈 AUC-ROC: {roc_auc_score(y_val, y_pred_probs):.4f}")


2025-06-15 12:00:24.889977: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-06-15 12:02:37.305490: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


[1m7196/7196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 20ms/step
📊 Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.84      0.86    109641
           1       0.86      0.90      0.88    120612

    accuracy                           0.87    230253
   macro avg       0.88      0.87      0.87    230253
weighted avg       0.87      0.87      0.87    230253

📈 AUC-ROC: 0.9457
