# Donor-level Splits (GSE144735)


Create deterministic train/val/test partitions at the patient level using the tokenisation metadata.

In [None]:
# Colab-only setup: mount Google Drive into the runtime and switch into the
# project folder, so the relative paths used by later cells (requirements.txt,
# gse144735/...) resolve against it.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: change the kernel's working directory to the project directory on Drive.
%cd /content/drive/MyDrive/geneformer-tumor-classification

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/geneformer-tumor-classification


In [None]:
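# Currently disabled; uncomment to uninstall jax, jaxlib and ml-dtypes from the kernel environment.
#%pip uninstall -y jax jaxlib ml-dtypes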

[0m

In [None]:
# Dependencies
!pip install --quiet -r requirements.txt

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd


In [None]:
# Input/output locations, all relative to the current working directory.
TOKEN_DIR = Path("gse144735/processed/tokens")
META_PATH = TOKEN_DIR.joinpath("gse144735_tokens_metadata.tsv")  # input: tokenisation metadata
SPLIT_NPZ = TOKEN_DIR.joinpath("splits_by_patient.npz")          # output: split indices + donor ids
SPLIT_SUMMARY = TOKEN_DIR.joinpath("splits_summary.tsv")         # output: per-split count table

# The metadata file is tab-separated; the default RangeIndex is kept, so index
# labels coincide with row positions.
meta = pd.read_csv(META_PATH, delimiter="\t")
print(f"Loaded metadata: {META_PATH}")
display(meta.head(n=5))


Loaded metadata: gse144735/processed/tokens/gse144735_tokens_metadata.tsv


Unnamed: 0,Patient,Class,Sample,token_length
0,KUL01,Tumor,KUL01-T,2048
1,KUL01,Tumor,KUL01-T,627
2,KUL01,Tumor,KUL01-T,2048
3,KUL01,Tumor,KUL01-T,2048
4,KUL01,Tumor,KUL01-T,2048


In [None]:
# Deterministic 4/1/1 split over the 6 donors.
#
# Every row of `meta` is assigned to exactly one of train/val/test according to
# its 'Patient' value, so no donor contributes rows to more than one split.
# The result depends on the seed and on the order in which donors first appear
# in the metadata file (Series.unique() preserves first-appearance order).
N_TRAIN, N_VAL, N_TEST = 4, 1, 1

rng = np.random.default_rng(42)
patients = meta['Patient'].unique()

# The slicing below assumes exactly N_TRAIN + N_VAL + N_TEST donors; with more,
# the extra donors would silently fall outside every split.
assert len(patients) == N_TRAIN + N_VAL + N_TEST, (
    f"expected {N_TRAIN + N_VAL + N_TEST} donors, found {len(patients)}"
)

rng.shuffle(patients)  # in-place permutation of the donor ids
train_p = patients[:N_TRAIN]
val_p = patients[N_TRAIN:N_TRAIN + N_VAL]
test_p = patients[N_TRAIN + N_VAL:]

# Row labels of `meta` belonging to each split.
train_idx = meta.index[meta.Patient.isin(train_p)].to_numpy()
val_idx   = meta.index[meta.Patient.isin(val_p)].to_numpy()
test_idx  = meta.index[meta.Patient.isin(test_p)].to_numpy()

# Sanity check: every row landed in some split (the donor groups are disjoint
# by construction, so equal totals imply a full partition).
assert train_idx.size + val_idx.size + test_idx.size == len(meta), \
    "some metadata rows were not assigned to any split"

# Donor ids are cast to fixed-width unicode before saving: an object-dtype
# array would be pickled inside the .npz and could then only be read back with
# np.load(..., allow_pickle=True).
np.savez(SPLIT_NPZ,
         train_idx=train_idx, val_idx=val_idx, test_idx=test_idx,
         train_patients=np.asarray(train_p).astype(str),
         val_patients=np.asarray(val_p).astype(str),
         test_patients=np.asarray(test_p).astype(str))

print({k: v.size for k, v in {'train': train_idx, 'val': val_idx, 'test': test_idx}.items()})
print({'train_patients': train_p.tolist(), 'val_patients': val_p.tolist(), 'test_patients': test_p.tolist()})


{'train': 13085, 'val': 8266, 'test': 6063}
{'train_patients': ['KUL28', 'KUL21', 'KUL31', 'KUL30'], 'val_patients': ['KUL19'], 'test_patients': ['KUL01']}


In [None]:
# Human-readable summary for quick inspection: number of rows per
# (split, Patient, Class), written as a TSV next to the split file.
split_names = ['train', 'val', 'test']
split_masks = [meta.index.isin(idx) for idx in (train_idx, val_idx, test_idx)]

# np.select picks the first matching mask per row. Rows that belong to no split
# are labelled 'unassigned' instead of being silently counted as 'test', so a
# broken split shows up in the summary table.
split_label = np.select(split_masks, split_names, default='unassigned')

summary = (
    meta.assign(split=split_label)
        .groupby(['split', 'Patient', 'Class'])
        .size()
        .rename('n')
        .reset_index()
)
summary.to_csv(SPLIT_SUMMARY, sep='\t', index=False)
print(f"Wrote {SPLIT_SUMMARY}")
display(summary.head(12))


Wrote gse144735/processed/tokens/splits_summary.tsv


Unnamed: 0,split,Patient,Class,n
0,test,KUL01,Border,2129
1,test,KUL01,Normal,2012
2,test,KUL01,Tumor,1922
3,train,KUL21,Border,1741
4,train,KUL21,Normal,1340
5,train,KUL21,Tumor,2149
6,train,KUL28,Border,406
7,train,KUL28,Normal,908
8,train,KUL28,Tumor,428
9,train,KUL30,Border,766
