# Donor-level Splits (GSE131907)


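Create deterministic (fixed-seed) train/val/test partitions at the patient level, using the tokenisation metadata. Patients are split roughly 80/10/10, and every cell from a given patient is assigned to the same partition.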

In [None]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project folder so that the relative paths used in later cells resolve.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): the recorded output of this cell shows
# /content/drive/MyDrive/geneformer-tumor-classification, which differs from the
# path below — confirm which project folder is current.
%cd /content/drive/MyDrive/singlecell-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [20]:
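# Disabled step: uncomment to uninstall jax / jaxlib / ml-dtypes before installing
# the requirements. The reason is not recorded here — TODO: document it or delete this cell.
#%pip uninstall -y jax jaxlib ml-dtypes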

In [21]:
# Dependencies
!pip install --quiet -r requirements.txt

In [22]:
from pathlib import Path
import numpy as np
import pandas as pd


In [23]:
# Input/output locations, all relative to the current working directory.
TOKEN_DIR = Path("gse131907/processed/tokens")
META_PATH = TOKEN_DIR.joinpath("gse131907_tokens_metadata.tsv")
SPLIT_NPZ = TOKEN_DIR.joinpath("splits_by_patient.npz")
SPLIT_SUMMARY = TOKEN_DIR.joinpath("splits_summary.tsv")

# The metadata table is tab-separated; show the first rows as a sanity check.
meta = pd.read_csv(META_PATH, delimiter="\t")
print("Loaded metadata: {}".format(META_PATH))
display(meta.head())


Loaded metadata: gse131907/processed/tokens/gse131907_tokens_metadata.tsv


Unnamed: 0,Patient,Sample,Class,BinaryClass,token_length
0,LN_05,LN_05,nLN,Normal,1690
1,NS_13,NS_13,mBrain,Tumor,611
2,LUNG_N18,LUNG_N18,nLung,Normal,1113
3,LUNG_N18,LUNG_N18,nLung,Normal,1100
4,LN_04,LN_04,nLN,Normal,948


In [28]:
# Patient-level split: shuffle the unique patient ids with a fixed seed, then
# cut the shuffled list into train / val / test. Every row of a patient ends up
# in exactly one partition.
#
# NOTE(review): the split is not stratified by Class. In the recorded run the
# seven test patients cover only the PE, nLung and mBrain classes — confirm this
# is acceptable for evaluation.
SEED = 42
TRAIN_FRAC = 0.8
VAL_FRAC = 0.1  # the test partition receives whatever remains

# Patient ids are compared as strings everywhere below, so a column parsed with
# a non-string dtype cannot silently produce empty partitions.
# The order of unique() follows the row order of the metadata file, so the
# split is reproducible only for the same file and seed.
patients = meta["Patient"].astype(str).unique().tolist()
rng = np.random.default_rng(SEED)
rng.shuffle(patients)  # in-place shuffle of the Python list

n = len(patients)  # 58 in the recorded run
n_train = int(n * TRAIN_FRAC)  # 46
n_val = int(n * VAL_FRAC)    # 5
train_patients = patients[:n_train]
val_patients = patients[n_train:n_train + n_val]
test_patients = patients[n_train + n_val:]  # 7

assert train_patients and val_patients and test_patients, (
    f"Too few patients ({n}) to form non-empty train/val/test partitions"
)


def idx_for(group):
    """Return the ``meta`` index labels (int64) of all rows whose patient is in ``group``.

    ``group`` is an iterable of patient ids given as strings; the ``Patient``
    column is cast to ``str`` before the membership test.
    """
    in_group = meta["Patient"].astype(str).isin(list(group))
    return meta.index[in_group].to_numpy(dtype=np.int64)


train_idx = idx_for(train_patients)
val_idx = idx_for(val_patients)
test_idx = idx_for(test_patients)

# The three patient lists partition `patients`, so every row must be assigned once.
n_assigned = len(train_idx) + len(val_idx) + len(test_idx)
assert n_assigned == len(meta), (
    f"Split covers {n_assigned} rows but the metadata has {len(meta)}"
)

# Write to the path constant defined with the other paths instead of repeating
# the literal here.
np.savez(SPLIT_NPZ,
         train_idx=train_idx, val_idx=val_idx, test_idx=test_idx,
         train_patients=train_patients, val_patients=val_patients, test_patients=test_patients)

# Per-patient / per-class row counts tagged with the assigned partition.
# NOTE: the "Human-readable summary" cell further down writes to SPLIT_SUMMARY
# as well, using a count column named `n` instead of `count`.
split_of = {p: "train" for p in train_patients}
split_of.update({p: "val" for p in val_patients})
split_of.update({p: "test" for p in test_patients})

summary = meta.groupby(["Patient", "Class"]).size().reset_index(name="count")
summary["split"] = summary["Patient"].astype(str).map(split_of)
summary = summary[["split", "Patient", "Class", "count"]]
summary.to_csv(SPLIT_SUMMARY, sep="\t", index=False)

print("Patients:", len(patients), "train/val/test =", len(train_patients), len(val_patients), len(test_patients))
print("Cells:", len(train_idx), len(val_idx), len(test_idx))

Patients: 58 train/val/test = 46 5 7
Cells: 164672 17956 25878


In [25]:
# Human-readable summary for quick inspection: number of rows contributed by
# each (split, Patient, Class) combination.
in_train = meta.index.isin(train_idx)
in_val = meta.index.isin(val_idx)
# 'train' is tested first, then 'val'; every remaining row is labelled 'test'.
split_label = np.select([in_train, in_val], ['train', 'val'], default='test')

labelled = meta.assign(split=split_label)
group_sizes = labelled.groupby(['split', 'Patient', 'Class']).size()
summary = group_sizes.rename('n').reset_index()

summary.to_csv(SPLIT_SUMMARY, sep='\t', index=False)
print("Wrote {}".format(SPLIT_SUMMARY))
display(summary.head(12))


Wrote gse131907/processed/tokens/splits_summary.tsv


Unnamed: 0,split,Patient,Class,n
0,test,EFFUSION_11,PE,2943
1,test,LUNG_N09,nLung,2528
2,test,LUNG_N18,nLung,4628
3,test,NS_04,mBrain,1904
4,test,NS_07,mBrain,5730
5,test,NS_13,mBrain,4845
6,test,NS_19,mBrain,3300
7,train,BRONCHO_58,tL/B,2813
8,train,EBUS_06,tL/B,2303
9,train,EBUS_10,mLN,5144
