In [1]:
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
# ---------------------------------------------------------------------
# 1. Load merged ClinVar + CADD dataset
# ---------------------------------------------------------------------
# low_memory=False makes pandas infer every column's dtype from the whole
# file rather than chunk by chunk, so a column can no longer end up holding
# a mix of parsed numbers and raw strings.
# NOTE(review): the warning echo saved with this cell's output looks like a
# mixed-type DtypeWarning from chunked parsing - confirm it disappears.
df = pd.read_csv("clinvar_data/clinvar_STRICT_with_CADD.csv", low_memory=False)

# Ensure chromosome is string: the split below selects rows by comparing
# CHROM against string chromosome names ("8", "18", "21", ...).
df["CHROM"] = df["CHROM"].astype(str)

print("Loaded dataset with", len(df), "variants.")

Loaded dataset with 262509 variants.


  df = pd.read_csv("clinvar_data/clinvar_STRICT_with_CADD.csv")


In [4]:
# ---------------------------------------------------------------------
# 2. Chromosome-based splitting
# ---------------------------------------------------------------------
# Whole chromosomes are assigned to exactly one split. Only chromosomes
# named "1".."22" are considered; rows whose CHROM is anything else end up
# in none of the three frames.
val_chroms = ["8"]
test_chroms = ["18", "21"]
held_out_chroms = set(val_chroms) | set(test_chroms)
train_chroms = [str(n) for n in range(1, 23) if str(n) not in held_out_chroms]


def rows_on_chromosomes(frame, chroms):
    """Return the rows of `frame` whose CHROM is in `chroms`, re-indexed from 0."""
    on_chroms = frame["CHROM"].isin(chroms)
    return frame[on_chroms].reset_index(drop=True)


train_df = rows_on_chromosomes(df, train_chroms)
val_df = rows_on_chromosomes(df, val_chroms)
test_df = rows_on_chromosomes(df, test_chroms)

print("Training variants:", len(train_df))
print("Validation variants:", len(val_df))
print("Test variants:", len(test_df))

Training variants: 227989
Validation variants: 8803
Test variants: 8461


In [9]:
# ---------------------------------------------------------------------
# 3. Select features + label
#    For the baseline, CADD_phred is used as the only predictor.
# ---------------------------------------------------------------------
feature_cols = ["CADD_phred"]   # baseline single-feature model
label_col = "CLNSIG"            # ClinVar label

# ---------------------------------------------------------------------
# Drop rows with missing feature values (LogReg requires no NaNs)
# ---------------------------------------------------------------------
# This must run BEFORE the X/y matrices are built: otherwise X_* would still
# contain the NaN rows (the fit fails on a fresh kernel) and would be longer
# than the filtered *_df frames that later cells pair with the predictions.
train_df = train_df.dropna(subset=feature_cols).reset_index(drop=True)
val_df   = val_df.dropna(subset=feature_cols).reset_index(drop=True)
test_df  = test_df.dropna(subset=feature_cols).reset_index(drop=True)

# Encode labels: the encoder is fitted on the training labels only and then
# reused unchanged for the validation and test frames.
le = LabelEncoder()
train_df["label"] = le.fit_transform(train_df[label_col])
val_df["label"]   = le.transform(val_df[label_col])
test_df["label"]  = le.transform(test_df[label_col])

# Feature matrices / label vectors, row-aligned with the filtered frames.
X_train, y_train = train_df[feature_cols], train_df["label"]
X_val,   y_val   = val_df[feature_cols],   val_df["label"]
X_test,  y_test  = test_df[feature_cols],  test_df["label"]

# Sanity checks: no missing feature values reach the model, and every
# matrix has exactly one row per row of its source frame.
assert not X_train.isna().to_numpy().any(), "NaNs left in training features"
assert not X_val.isna().to_numpy().any(), "NaNs left in validation features"
assert not X_test.isna().to_numpy().any(), "NaNs left in test features"
assert len(X_train) == len(train_df)
assert len(X_val) == len(val_df)
assert len(X_test) == len(test_df)

print("After dropping NaNs:")
print("Training variants:", len(train_df))
print("Validation variants:", len(val_df))
print("Test variants:", len(test_df))

After dropping NaNs:
Training variants: 160889
Validation variants: 6235
Test variants: 6317


In [10]:
# ---------------------------------------------------------------------
# 4. Train baseline Logistic Regression classifier
# ---------------------------------------------------------------------
from sklearn.linear_model import LogisticRegression

# Hyper-parameters of the baseline, gathered in one place.
logreg_params = {
    "solver": "lbfgs",
    "class_weight": "balanced",
    "max_iter": 5000,
}

# fit() returns the fitted estimator, so construction and training chain.
clf = LogisticRegression(**logreg_params).fit(X_train, y_train)
print("\nLogistic Regression training complete.\n")


Logistic Regression training complete.



In [11]:
# ---------------------------------------------------------------------
# 5. Validation on chromosome 8
# ---------------------------------------------------------------------
# Score for ROC AUC is the second column of predict_proba (encoded class 1).
val_prob = clf.predict_proba(X_val)[:, 1]
val_pred = clf.predict(X_val)

# Compute every metric first, then print them as one report.
val_accuracy = accuracy_score(y_val, val_pred)
val_auc = roc_auc_score(y_val, val_prob)
val_report = classification_report(y_val, val_pred, target_names=le.classes_)

print("=== VALIDATION (Chromosome 8) ===")
print("Accuracy:", val_accuracy)
print("ROC AUC:", val_auc)
print(val_report)

=== VALIDATION (Chromosome 8) ===
Accuracy: 0.9292702485966319
ROC AUC: 0.9867197292731855
              precision    recall  f1-score   support

      Benign       1.00      0.93      0.96      5874
  Pathogenic       0.45      0.96      0.61       361

    accuracy                           0.93      6235
   macro avg       0.72      0.94      0.79      6235
weighted avg       0.97      0.93      0.94      6235



In [12]:
# ---------------------------------------------------------------------
# 6a. Final test on chromosomes 18 & 21 together
# ---------------------------------------------------------------------
# test_pred / test_prob are reused later when the predictions are saved.
test_prob = clf.predict_proba(X_test)[:, 1]   # second column = encoded class 1
test_pred = clf.predict(X_test)

# Compute every metric first, then print them as one report.
test_accuracy = accuracy_score(y_test, test_pred)
test_auc = roc_auc_score(y_test, test_prob)
test_report = classification_report(y_test, test_pred, target_names=le.classes_)

print("\n=== TEST SET (Chromosomes 18 & 21) ===")
print("Accuracy:", test_accuracy)
print("ROC AUC:", test_auc)
print(test_report)


=== TEST SET (Chromosomes 18 & 21) ===
Accuracy: 0.9368371062213076
ROC AUC: 0.9889834174400092
              precision    recall  f1-score   support

      Benign       1.00      0.93      0.97      6006
  Pathogenic       0.44      0.98      0.60       311

    accuracy                           0.94      6317
   macro avg       0.72      0.96      0.78      6317
weighted avg       0.97      0.94      0.95      6317



In [13]:
# ---------------------------------------------------------------------
# 6b. Evaluate chromosome 18 and 21 separately
# ---------------------------------------------------------------------

def _print_metrics(banner, y_true, y_pred, y_score):
    """Print accuracy, ROC AUC and the per-class report under `banner`."""
    print(banner)
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("ROC AUC:", roc_auc_score(y_true, y_score))
    print(classification_report(y_true, y_pred, target_names=le.classes_))


# Per-chromosome subsets of the test frame, each re-indexed from 0
is_chr18 = test_df["CHROM"] == "18"
is_chr21 = test_df["CHROM"] == "21"
test_chr18 = test_df[is_chr18].reset_index(drop=True)
test_chr21 = test_df[is_chr21].reset_index(drop=True)

X_18 = test_chr18[feature_cols]
y_18 = test_chr18["label"]
X_21 = test_chr21[feature_cols]
y_21 = test_chr21["label"]

# Predictions (hard labels, plus the second predict_proba column as score)
pred_18 = clf.predict(X_18)
prob_18 = clf.predict_proba(X_18)[:, 1]
pred_21 = clf.predict(X_21)
prob_21 = clf.predict_proba(X_21)[:, 1]

# Results
_print_metrics("\n=== TEST: Chromosome 18 ===", y_18, pred_18, prob_18)
_print_metrics("\n=== TEST: Chromosome 21 ===", y_21, pred_21, prob_21)


=== TEST: Chromosome 18 ===
Accuracy: 0.9422476586888657
ROC AUC: 0.9917352376716334
              precision    recall  f1-score   support

      Benign       1.00      0.94      0.97      3679
  Pathogenic       0.43      0.98      0.59       165

    accuracy                           0.94      3844
   macro avg       0.71      0.96      0.78      3844
weighted avg       0.97      0.94      0.95      3844


=== TEST: Chromosome 21 ===
Accuracy: 0.9284270117266478
ROC AUC: 0.9855787627081728
              precision    recall  f1-score   support

      Benign       1.00      0.93      0.96      2327
  Pathogenic       0.45      0.97      0.62       146

    accuracy                           0.93      2473
   macro avg       0.72      0.95      0.79      2473
weighted avg       0.97      0.93      0.94      2473



In [15]:
# ---------------------------------------------------------------------
# 7. Save test predictions (combined + separate chromosomes)
# ---------------------------------------------------------------------

# One entry per output file:
#   (frame to annotate, encoded predictions, class-1 probabilities,
#    CSV path, confirmation message)
exports = [
    (
        test_df, test_pred, test_prob,
        "baseline_predictions_testset_combined.csv",
        "Saved combined test predictions to: baseline_predictions_testset_combined.csv",
    ),
    (
        test_chr18, pred_18, prob_18,
        "baseline_predictions_chr18.csv",
        "Saved chr18 predictions to: baseline_predictions_chr18.csv",
    ),
    (
        test_chr21, pred_21, prob_21,
        "baseline_predictions_chr21.csv",
        "Saved chr21 predictions to: baseline_predictions_chr21.csv",
    ),
]

for frame, encoded_pred, class1_prob, out_path, done_msg in exports:
    # Columns are added to the frame itself, so test_df / test_chr18 /
    # test_chr21 keep them after this cell has run.
    frame["predicted_label"] = le.inverse_transform(encoded_pred)  # back to CLNSIG names
    frame["prob_pathogenic"] = class1_prob
    frame.to_csv(out_path, index=False)
    print(done_msg)

Saved combined test predictions to: baseline_predictions_testset_combined.csv
Saved chr18 predictions to: baseline_predictions_chr18.csv
Saved chr21 predictions to: baseline_predictions_chr21.csv
