In [2]:
from google.colab import drive
drive.mount('/content/drive')

!ls "/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases"

Mounted at /content/drive
'3-4-UMAP based GNN classifier.py'
 synthetic_patients_internal_test_cohort.csv
 synthetic_patients_internal_test_cohort.jsonl
 synthetic_patients_training_cohort.csv
 synthetic_patients_training_cohort.jsonl
'test-cohort-3-4-UMAP-GNN validation.py'
'test-cohort-llama-density-clusters-case-aggregated-vectors (1).gsheet'
 test-cohort-llama-density-clusters-case-aggregated-vectors.csv
 test-cohort-llama-density-clusters-case-aggregated-vectors.gsheet
 test-cohort-llama-density-clusters-case-aggregated-vectors.jsonl
 test-cohort-llama-density-UMAP-GNN-Validation-output.log
 XGBoost.ipynb


# Step 1 Set up helpers And Load Data

In [15]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
from scipy.sparse import lil_matrix, csr_matrix, hstack

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, top_k_accuracy_score

from xgboost import XGBClassifier

In [16]:
# Paths to your new density-clustering datasets
train_path = Path("/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/synthetic_patients_training_cohort.jsonl")
test_path  = Path("/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/synthetic_patients_internal_test_cohort.jsonl")

def load_jsonl(path: Path) -> pd.DataFrame:
    records = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

df_train = load_jsonl(train_path)
df_test  = load_jsonl(test_path)

print("Train shape:", df_train.shape)
print("Test shape:", df_test.shape)
df_train.head(1)

Train shape: (132800, 6)
Test shape: (34860, 6)


Unnamed: 0,patient_id,diagnosis,symptom_completeness_pct,symptoms_clusters,symptoms_vectors_100d,cohort_type
0,"2,4-DIENOYL-CoA REDUCTASE DEFICIENCY; DECRD_P20","2,4-DIENOYL-CoA REDUCTASE DEFICIENCY; DECRD",60.0,"[4529, 852, 2001, 68, 4530, 308, 1187, 1087, 2...","[[-0.27515015, 0.61182046, -1.103897, -1.42158...",training_cohort


# Step 2: Build a cluster vocabulary (from training set only)



We map cluster IDs → column indices so we get a fixed-length multi-hot vector for each patient.

In [17]:
# Collect all unique cluster IDs from training data
cluster_set = set()
for clusters in df_train["symptoms_clusters"]:
    cluster_set.update(clusters)

cluster_list = sorted(cluster_set)
cluster2idx = {cid: i for i, cid in enumerate(cluster_list)}

n_clusters = len(cluster2idx)
print("Number of unique clusters (train):", n_clusters)

Number of unique clusters (train): 5362


In [18]:
# save this file for reproducibility
with open("density_cluster_vocab.json", "w") as f:
    json.dump({"cluster_list": cluster_list}, f)

# Step 3: Feature Builder
We’ll construct:


*   X_clusters : multi-hot over cluster IDs (sparse)
*   X_embed    : mean of symptoms_vectors_100d per patient (dense 100-d)
*   X_extra    : symptom_completeness_pct (scaled to 0–1)

Then we hstack them into a single sparse matrix for XGBoost.

In [19]:
def build_features(df: pd.DataFrame, cluster2idx: dict) -> csr_matrix:
    n_samples = len(df)
    n_clusters = len(cluster2idx)

    # multi-hot cluster features (sparse)
    X_clusters = lil_matrix((n_samples, n_clusters), dtype=np.float32)

    # infer embedding dim from first row
    first_vecs = df.iloc[0]["symptoms_vectors_100d"]
    embedding_dim = len(first_vecs[0])
    X_embed = np.zeros((n_samples, embedding_dim), dtype=np.float32)

    # extra scalar feature: completeness
    X_compl = np.zeros((n_samples, 1), dtype=np.float32)

    for i, row in df.iterrows():
        # multi-hot over clusters
        for cid in row["symptoms_clusters"]:
            j = cluster2idx.get(cid)
            if j is not None:
                X_clusters[i, j] = 1.0

        # mean embedding over all symptoms for this patient
        vecs = np.array(row["symptoms_vectors_100d"], dtype=np.float32)  # (num_symptoms, 100)
        X_embed[i] = vecs.mean(axis=0)

        # completeness in [0,1]
        X_compl[i, 0] = float(row["symptom_completeness_pct"]) / 100.0

    # Convert to csr and hstack dense features as sparse
    X_clusters = X_clusters.tocsr()
    X_dense = np.hstack([X_embed, X_compl])  # shape: (n_samples, 100 + 1)
    X_dense_sparse = csr_matrix(X_dense)

    X = hstack([X_clusters, X_dense_sparse], format="csr")
    return X

Build X_train and X_test

In [20]:
X_train = build_features(df_train, cluster2idx)
X_test  = build_features(df_test, cluster2idx)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X_train shape: (132800, 5463)
X_test shape: (34860, 5463)


# Step 4: Encode labels

In [21]:
le = LabelEncoder()
y_train = le.fit_transform(df_train["diagnosis"])
y_test  = le.transform(df_test["diagnosis"])

num_classes = len(le.classes_)
print("Number of classes:", num_classes)

Number of classes: 1660


# Step 5: Train XGBoost

In [22]:
import xgboost as xgb
print("XGBoost version:", xgb.__version__)

XGBoost version: 3.1.2


TypeError: TrainingCheckPoint.__init__() got an unexpected keyword argument 'iterations'

In [26]:
import os, re, glob, json, pickle
import numpy as np
from pathlib import Path
from collections import Counter
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost import callback

# ========= CONFIG =========
SAVE_DIR = Path("/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

NUM_CLASS = len(le.classes_)     # from your LabelEncoder on y_train
N_BOOST_ROUNDS = 1000            # cap on total trees
EARLY_STOPPING_ROUNDS = 100      # stop if no improvement
CHECKPOINT_EVERY = 250           # save every N rounds
RANDOM_STATE = 42
N_SPLITS = 5
# ==========================

# ---- Version-agnostic periodic saver ----
class PeriodicSaver(callback.TrainingCallback):
    """Save booster every `every` rounds to SAVE_DIR / f'{name}_{iter:04d}.json'."""
    def __init__(self, save_dir: Path, name: str, every: int):
        self.save_dir = Path(save_dir)
        self.name = name
        self.every = int(every)

    def after_iteration(self, model, epoch: int, evals_log) -> bool:
        it = epoch + 1
        if it % self.every == 0:
            path = self.save_dir / f"{self.name}_{it:04d}.json"
            model.save_model(str(path))
            print(f"[checkpoint] saved {path}")
        return False  # continue training

def latest_checkpoint_for_fold(save_dir: Path, fold: int):
    """Return path to latest checkpoint (json) for this fold, or None."""
    pattern = str(save_dir / f"xgb_fold{fold}_*.json")
    candidates = glob.glob(pattern)
    if not candidates:
        return None
    best, best_iter = None, -1
    rx = re.compile(rf"xgb_fold{fold}_(\d+)\.json$")
    for p in candidates:
        m = rx.search(os.path.basename(p))
        if m:
            it = int(m.group(1))
            if it > best_iter:
                best_iter, best = it, p
    return best

# Adjust folds if the rarest class is too small
min_per_class = min(Counter(y_train_enc).values())
actual_splits = min(N_SPLITS, max(2, min_per_class))
print(f"Using {actual_splits}-fold CV (min per class = {min_per_class})")

kf = StratifiedKFold(n_splits=actual_splits, shuffle=True, random_state=RANDOM_STATE)
fold_accs = []

# Common XGBoost params
params = {
    "objective": "multi:softprob",
    "num_class": NUM_CLASS,
    "eval_metric": "mlogloss",
    "max_depth": 4,
    "learning_rate": 0.1,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "tree_method": "hist",
    "device": "cuda",
    "seed": RANDOM_STATE,
}

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train, y_train_enc), 1):
    print(f"\n===== Fold {fold} =====")
    X_tr, X_va = X_train[train_idx], X_train[val_idx]
    y_tr, y_va = y_train_enc[train_idx], y_train_enc[val_idx]

    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dvalid = xgb.DMatrix(X_va, label=y_va)
    evals = [(dtrain, "train"), (dvalid, "valid")]

    resume_path = latest_checkpoint_for_fold(SAVE_DIR, fold)
    if resume_path:
        print(f"➡️  Resuming from checkpoint: {resume_path}")
    else:
        print("➡️  No checkpoint found; starting fresh.")

    saver_cb = PeriodicSaver(SAVE_DIR, name=f"xgb_fold{fold}", every=CHECKPOINT_EVERY)

    # Train (resume if checkpoint exists)
    booster = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=N_BOOST_ROUNDS,
        evals=evals,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        callbacks=[saver_cb],
        xgb_model=resume_path if resume_path else None,
        verbose_eval=True
    )

    # Save final model (JSON, portable)
    final_json = SAVE_DIR / f"xgb_fold{fold}_final.json"
    booster.save_model(str(final_json))
    print(f"✅ Saved final model: {final_json}")

    # Predict on this fold's holdout and score
    y_prob = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1) if booster.best_iteration is not None else None)
    y_pred = np.argmax(y_prob, axis=1)
    acc = accuracy_score(y_va, y_pred)
    fold_accs.append(acc)
    print(f"Fold {fold} accuracy: {acc:.4f}")
    if booster.best_iteration is not None:
        print(f"Best iteration (early stop): {booster.best_iteration}")

mean_acc, std_acc = np.mean(fold_accs), np.std(fold_accs)
print(f"\nMean CV accuracy: {mean_acc:.4f} ± {std_acc:.4f}")
print(f"All checkpoints & final models in: {SAVE_DIR}")


Using 5-fold CV (min per class = 80)

===== Fold 1 =====
➡️  No checkpoint found; starting fresh.
[0]	train-mlogloss:5.86840	valid-mlogloss:5.96094
[1]	train-mlogloss:4.41167	valid-mlogloss:4.59711
[2]	train-mlogloss:3.15195	valid-mlogloss:3.42456
[3]	train-mlogloss:2.59658	valid-mlogloss:2.89889
[4]	train-mlogloss:2.23119	valid-mlogloss:2.55833
[5]	train-mlogloss:1.95089	valid-mlogloss:2.29762
[6]	train-mlogloss:1.72572	valid-mlogloss:2.08731
[7]	train-mlogloss:1.53929	valid-mlogloss:1.91219
[8]	train-mlogloss:1.38104	valid-mlogloss:1.76249
[9]	train-mlogloss:1.24494	valid-mlogloss:1.63267
[10]	train-mlogloss:1.12656	valid-mlogloss:1.51955
[11]	train-mlogloss:1.02236	valid-mlogloss:1.41806
[12]	train-mlogloss:0.93034	valid-mlogloss:1.32792
[13]	train-mlogloss:0.84830	valid-mlogloss:1.24639
[14]	train-mlogloss:0.77484	valid-mlogloss:1.17253
[15]	train-mlogloss:0.70880	valid-mlogloss:1.10521
[16]	train-mlogloss:0.64935	valid-mlogloss:1.04387
[17]	train-mlogloss:0.59559	valid-mlogloss:0.

# Step 6: Testing Validation

In [27]:
import xgboost as xgb
from pathlib import Path
import numpy as np

SAVE_DIR = Path("/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints")

# find all final models
final_models = sorted(SAVE_DIR.glob("xgb_fold*_final.json"))
print(f"Found {len(final_models)} fold models")

Found 5 fold models


In [31]:
dval = xgb.DMatrix(X_test, label=y_test)

In [32]:
probs = []
for path in final_models:
    booster = xgb.Booster()
    booster.load_model(str(path))
    best_it = getattr(booster, "best_iteration", None)
    y_prob = booster.predict(dval, iteration_range=(0, best_it + 1) if best_it is not None else None)
    probs.append(y_prob)

# average across folds
y_prob_mean = np.mean(probs, axis=0)
y_pred = np.argmax(y_prob_mean, axis=1)

In [33]:
# print(y_prob_mean)
print(y_prob_mean.shape)
print(y_pred.shape)

(34860, 1660)
(34860,)


In [34]:
from sklearn.metrics import accuracy_score, confusion_matrix

val_acc = accuracy_score(y_test, y_pred)
print(f"Final validation accuracy (ensemble of 5 folds): {val_acc:.4f}")

# (Optional) confusion matrix if you want to inspect per-class results
# cm = confusion_matrix(y_val_enc, y_pred)
# print(cm)


Final validation accuracy (ensemble of 5 folds): 0.9898


# Step 7: External Validation for PubMed and Clinical

In [41]:
import json
from pathlib import Path
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# -------------------------------------------------------------------
# 7.1 Load external JSONL cohort
# -------------------------------------------------------------------
external_path = Path("/content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/test-cohort-llama-density-clusters-case-aggregated-vectors.jsonl")

def load_jsonl(path: Path) -> pd.DataFrame:
    records = []
    with path.open("r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            records.append(json.loads(line))
    return pd.DataFrame(records)

df_ext = load_jsonl(external_path)
print("External cohort shape:", df_ext.shape)
print("External columns:", df_ext.columns.tolist())

# Check case group distribution (assuming column 'case_group')
print("\ncase_group value counts:")
print(df_ext["case_group"].value_counts())


External cohort shape: (208, 5)
External columns: ['case_number', 'diagnosis', 'case_group', 'list_of_cluster_label_final', 'list_of_umap_vectors_100d']

case_group value counts:
case_group
2    107
1    101
Name: count, dtype: int64


In [42]:
# Keep only diagnoses that exist in training label space
mask_known = df_ext["diagnosis"].isin(le.classes_)
if not mask_known.all():
    n_dropped = (~mask_known).sum()
    print(f"⚠ Dropping {n_dropped} external samples with unseen diagnoses.")
    df_ext = df_ext[mask_known].reset_index(drop=True)




⚠ Dropping 16 external samples with unseen diagnoses.


In [45]:
print("External cohort after dropping unseen diagnoses:", df_ext.shape)

External cohort after dropping unseen diagnoses: (192, 5)


In [44]:
print(df_ext.columns.tolist())
print(df_ext.iloc[0])

['case_number', 'diagnosis', 'case_group', 'list_of_cluster_label_final', 'list_of_umap_vectors_100d']
case_number                                                                    1
diagnosis                           CHROMOSOME 22q11.2 DELETION SYNDROME, DISTAL
case_group                                                                     2
list_of_cluster_label_final              [20, 31, 108, 159, 303, 707, 931, 4592]
list_of_umap_vectors_100d      [[0.019151507, 2.960222, -0.036849596, -0.5038...
Name: 0, dtype: object


In [46]:
def build_features_external(df: pd.DataFrame, cluster2idx: dict) -> csr_matrix:
    n_samples = len(df)
    n_clusters = len(cluster2idx)

    # sparse multi-hot for clusters
    X_clusters = lil_matrix((n_samples, n_clusters), dtype=np.float32)

    # infer embedding dim from first row of list_of_umap_vectors_100d
    first_vec = df.iloc[0]["list_of_umap_vectors_100d"]
    arr = np.array(first_vec, dtype=np.float32)
    if arr.ndim == 1:
        embedding_dim = arr.shape[0]
    elif arr.ndim == 2:
        embedding_dim = arr.shape[1]
    else:
        raise ValueError(f"Unexpected shape for list_of_umap_vectors_100d: {arr.shape}")

    X_embed = np.zeros((n_samples, embedding_dim), dtype=np.float32)
    X_compl = np.ones((n_samples, 1), dtype=np.float32)  # set completeness = 1.0 for all

    for i, row in df.iterrows():
        # multi-hot clusters from list_of_cluster_label_final
        for cid in row["list_of_cluster_label_final"]:
            j = cluster2idx.get(cid)
            if j is not None:       # ignore clusters never seen in training
                X_clusters[i, j] = 1.0

        # UMAP vectors (flat 100-d or list of 100-d vectors)
        vec = np.array(row["list_of_umap_vectors_100d"], dtype=np.float32)
        if vec.ndim == 1:
            mean_vec = vec
        elif vec.ndim == 2:
            mean_vec = vec.mean(axis=0)
        else:
            raise ValueError(f"Row {i}: unexpected shape for list_of_umap_vectors_100d: {vec.shape}")
        X_embed[i] = mean_vec

        # X_compl[i, 0] is already 1.0

    X_clusters = X_clusters.tocsr()
    X_dense = np.hstack([X_embed, X_compl])   # (n_samples, 100 + 1)
    X_dense_sparse = csr_matrix(X_dense)

    X = hstack([X_clusters, X_dense_sparse], format="csr")
    return X

# Build external features
X_ext = build_features_external(df_ext, cluster2idx)
print("X_ext shape:", X_ext.shape)

# Encode labels
y_ext = le.transform(df_ext["diagnosis"])

# DMatrix for XGBoost
d_ext = xgb.DMatrix(X_ext, label=y_ext)

X_ext shape: (192, 5463)


In [47]:
probas_list = []

for fold in range(1, actual_splits + 1):
    model_path = SAVE_DIR / f"xgb_fold{fold}_final.json"
    if not model_path.exists():
        raise FileNotFoundError(f"Model for fold {fold} not found: {model_path}")
    print(f"Loading fold {fold} model from: {model_path}")

    booster = xgb.Booster()
    booster.load_model(str(model_path))

    y_proba_fold = booster.predict(d_ext)
    probas_list.append(y_proba_fold)

# Average probabilities across folds
y_proba_ext = np.mean(probas_list, axis=0)   # (n_ext, NUM_CLASS)
y_pred_ext  = np.argmax(y_proba_ext, axis=1)

Loading fold 1 model from: /content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints/xgb_fold1_final.json
Loading fold 2 model from: /content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints/xgb_fold2_final.json
Loading fold 3 model from: /content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints/xgb_fold3_final.json
Loading fold 4 model from: /content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints/xgb_fold4_final.json
Loading fold 5 model from: /content/drive/MyDrive/Stanford/CS229/Final Project/llama+udmp-densityclustering_synthetic cases/xgb_checkpoints/xgb_fold5_final.json


In [50]:
results = {}
NUM_CLASS = len(le.classes_)  # already defined earlier

for group_value, group_name in [(1, "PubMed (case_group=1)"),
                                (2, "Stanford (case_group=2)")]:
    mask = (df_ext["case_group"] == group_value).values
    if mask.sum() == 0:
        print(f"\n⚠ No samples for group {group_value}, skipping.")
        continue

    y_true_g = y_ext[mask]
    y_proba_g = y_proba_ext[mask]

    # labels must match the column order of y_proba_g
    labels_all = np.arange(NUM_CLASS)

    top1  = accuracy_score(y_true_g, np.argmax(y_proba_g, axis=1))
    top10 = top_k_accuracy_score(y_true_g, y_proba_g, k=10, labels=labels_all)
    top20 = top_k_accuracy_score(y_true_g, y_proba_g, k=20, labels=labels_all)

    results[group_value] = {
        "top1": top1,
        "top10": top10,
        "top20": top20,
    }

    print(f"\n=== {group_name} ===")
    print(f"Samples: {mask.sum()}")
    print(f"Top-1  accuracy: {top1:.4f}")
    print(f"Top-10 accuracy: {top10:.4f}")
    print(f"Top-20 accuracy: {top20:.4f}")



=== PubMed (case_group=1) ===
Samples: 90
Top-1  accuracy: 0.0222
Top-10 accuracy: 0.0667
Top-20 accuracy: 0.0889

=== Stanford (case_group=2) ===
Samples: 102
Top-1  accuracy: 0.0196
Top-10 accuracy: 0.1176
Top-20 accuracy: 0.1765
