In [1]:
import pandas as pd
import os

In [None]:
# Root folder of the Trident-processed TCGA benchmark files read by the cells below
# (train.csv, test.csv and the 20x_256px_0px_overlap/features_uni_v1 embeddings).
# Two ways to obtain it:
#   * install Trident on Kaggle and use Kaggle's GPU to extract the features from TCGA
#     (see this notebook: https://www.kaggle.com/code/yuvalfriedmann/submission), or
#   * download the TCGA dataset for this benchmark directly from
#     https://www.kaggle.com/datasets/yuvalfriedmann/tcga-glioma-molecular-classification
# The value below is a placeholder: replace it with your local path before running.
root_benchmark_files = '.../path/to/trident_processed'

# Prepare DataFrames (From Trident Preprocessing)

In [3]:
# Folder (relative to root_benchmark_files) that holds one embedding file per slide.
# The previous version scanned this same folder twice, once labelled "train" and once
# "test", so the second pass could only rewrite identical entries; it is scanned once now.
# NOTE(review): if the test embeddings are meant to live in a different folder, scan
# that folder as well and merge the result into path_dict.
FEATURES_SUBDIR = '20x_256px_0px_overlap/features_uni_v1'

# Map slide id -> embedding path. The id is the file name up to its first ".".
# The context manager closes the directory handle even if building the dict fails.
with os.scandir(os.path.join(root_benchmark_files, FEATURES_SUBDIR)) as entries:
    path_dict = {entry.name.split(".")[0]: entry.path for entry in entries}

# Gather all paths into a two-column frame (`id`, `path`) used for the merges below.
df_paths = pd.Series(path_dict, name='path').rename_axis('id').reset_index()

In [4]:
def _load_split(csv_name):
    """Read one split CSV and return a frame with columns id, case_id, target, path.

    ``target`` is the ``mIDH`` column cast to int and ``id`` is a copy of ``slide_id``.
    ``path`` comes from a left join against ``df_paths`` on ``id``, so a slide without
    a matching embedding keeps its row with a missing path.
    """
    split = pd.read_csv(os.path.join(root_benchmark_files, csv_name))
    labelled = split.assign(
        target=split['mIDH'].astype(int),
        id=split['slide_id'],
    )
    return labelled[['id', 'case_id', 'target']].merge(df_paths, on='id', how='left')


# Train and test splits go through the same preparation.
df_train = _load_split("train.csv")
df_test = _load_split("test.csv")

In [5]:
# Persist both splits (index column omitted); these are the files passed via --csv
# to train_kfold.py and inference.py in the cells below.
for split_frame, csv_path in (
    (df_train, "./train_tcga_IDH.csv"),
    (df_test, "./test_tcga_IDH.csv"),
):
    split_frame.to_csv(csv_path, index=False)

# Run KFold Training

In [6]:
# Train on train_tcga_IDH.csv (written by the cell above) with train_kfold.py.
# --k 3 matches the fold0/fold1/fold2 folders listed after training; results are
# written under --results_dir (./kfold_tcga_IDH).
# NOTE(review): --embedding_size 1024 presumably has to match the width of the
# features_uni_v1 embeddings; confirm against the extracted files.
! python train_kfold.py --csv train_tcga_IDH.csv --format h5 --cache --workers 4 --k 3 --track_samples --seed 1 \
    --results_dir ./kfold_tcga_IDH  --embedding_size 1024 --bag_size 4096 --mlp_dim 32 --D 32 --dropout 0.3 --epochs 5 \
    --lr 0.0001 --wd 0.001


GPU is ON
{
    "csv": "train_tcga_IDH.csv",
    "k": 3,
    "track_samples": true,
    "seed": 1,
    "no_cuda": false,
    "early_stopping": false,
    "workers": 4,
    "results_dir": "./kfold_tcga_IDH",
    "cache": true,
    "format": "h5",
    "embedding_size": 1024,
    "bag_size": 4096,
    "case_bag": false,
    "mlp_dim": 32,
    "D": 32,
    "dropout": 0.3,
    "no_gate": false,
    "w_loss": false,
    "epochs": 5,
    "lr": 0.0001,
    "wd": 0.001,
    "t": 1,
    "l2": null,
    "accum": 1,
    "threshold": "ROC"
}
Slide counts:
 {1: 51, 0: 49}
Case counts:
 {0: 49, 1: 51}
FOLD 0                                                                          
--------------------------------
===== Epoch 0
train: loss=0.6678 AUC=0.6702
valid: loss=0.5867 AUC=0.9481
===== Epoch 1
train: loss=0.5387 AUC=0.9198
valid: loss=0.4780 AUC=0.9741
===== Epoch 2
train: loss=0.4301 AUC=0.9492
valid: loss=0.4020 AUC=0.9778
===== Epoch 3
train: loss=0.3512 AUC=0.9519
valid: loss=0.3519 AUC=0.

In [10]:
# Training output: list the artifacts found in run folder 2025-03-23_17-07-54
# (per-fold folders, metric plots, config.json and slides.csv in the listing below).
# NOTE(review): the run folder name is hard-coded; presumably each training run
# creates a new timestamped folder, so update it after re-running training.
! ls ./kfold_tcga_IDH/train_tcga_IDH/2025-03-23_17-07-54

auc.png		       config.json  fold0  fold2     ROC.png
balanced_accuracy.png  f1.png	    fold1  loss.png  slides.csv


# Infer on Internal Test set (TCGA n=25)

In [11]:
# Score the held-out TCGA split (test_tcga_IDH.csv) with inference.py; outputs go to
# --target_root (./test_tcga_IDH), as the "Results saved to" line in the log confirms.
# --fold 1 presumably selects the checkpoint trained on fold 1. TODO confirm in inference.py.
# NOTE(review): --results_dir points at run 2025-03-23_15-47-29, whereas the listing
# cell above shows run 2025-03-23_17-07-54; confirm which training run is intended.
! python inference.py --csv test_tcga_IDH.csv --target_root ./test_tcga_IDH --fold 1 --results_dir \
    ./kfold_tcga_IDH/train_tcga_IDH/2025-03-23_15-47-29

Config:
{
    "csv": "test_tcga_IDH.csv",
    "results_dir": "./kfold_tcga_IDH/train_tcga_IDH/2025-03-23_15-47-29",
    "target_root": "./test_tcga_IDH",
    "device": "cuda:0",
    "fold": 1,
    "workers": 0,
    "seed": 1,
    "case_bag": false,
    "no_attn": false,
    "heatmap": false,
    "patches": null,
    "raw": null,
    "ext": "tiff"
}
Slide counts:
 {0: 13, 1: 12}
Case counts:
 {0: 13, 1: 12}
loaded ckpt
 best epoch: -1 threshold: 0.49
Predict: 100%|██████████████████████████████████| 25/25 [00:16<00:00,  1.54it/s]
log_loss
0.48504702484639667

roc_auc_score
0.8717948717948718

balanced_accuracy_score
0.8044871794871795

f1_score
0.8148148148148148

confusion_matrix
[[ 9  4]
 [ 1 11]]

Results saved to ./test_tcga_IDH


# Infer on External Test set (proprietary n=25)

In [14]:
# Run the same inference script on the external cohort CSV (test_proprietary_IDH.csv).
# No cell visible in this notebook creates that file, so it must be supplied separately.
# NOTE(review): the recorded output below echoes "test_sheba_IDH.csv" and
# "./test_tcga_to_sheba_IDH", i.e. it was produced with different file names than this
# command; re-run the cell or reconcile the names so the printed metrics are traceable.
! python inference.py --csv test_proprietary_IDH.csv --target_root ./test_tcga_to_proprietary_IDH --fold 1 --results_dir \
    ./kfold_tcga_IDH/train_tcga_IDH/2025-03-23_15-47-29

Config:
{
    "csv": "test_sheba_IDH.csv",
    "results_dir": "./kfold_tcga_IDH/train_tcga_IDH/2025-03-23_15-47-29",
    "target_root": "./test_tcga_to_sheba_IDH",
    "device": "cuda:0",
    "fold": 1,
    "workers": 0,
    "seed": 1,
    "case_bag": false,
    "no_attn": false,
    "heatmap": false,
    "patches": null,
    "raw": null,
    "ext": "tiff"
}
Slide counts:
 {0: 13, 1: 12}
Case counts:
 {0: 13, 1: 12}
loaded ckpt
 best epoch: -1 threshold: 0.49
Predict: 100%|██████████████████████████████████| 25/25 [00:22<00:00,  1.14it/s]
log_loss
0.45080904343507816

roc_auc_score
0.9102564102564102

balanced_accuracy_score
0.8814102564102564

f1_score
0.88

confusion_matrix
[[11  2]
 [ 1 11]]

Results saved to ./test_tcga_to_sheba_IDH


In [15]:
# Inference output: list the files found in the external-cohort --target_root folder
# used by the inference cell above.
! ls ./test_tcga_to_proprietary_IDH

attention_scores.npy  predictions.csv  slides.csv
