In [4]:
import os
import numpy as np
import pandas as pd
#import openslide
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,confusion_matrix, balanced_accuracy_score
#import seaborn as sns

In [5]:
## Load the processed TCGA table.
## The spreadsheet can be downloaded from https://zenodo.org/records/11469546
train_df = pd.read_excel(
    io='tcga_processed.xlsx',
)
train_df

Unnamed: 0,patient ID,sample ID,2016_subtype,2016_grade,2016_molecular,2016_idh,2016_mgmt
0,TCGA-P5-A737,TCGA-P5-A737-01,oligodendroglioma,G2,idhmut-codel,idhmut,methylated
1,TCGA-E1-A7YI,TCGA-E1-A7YI-01,astrocytoma,G3,idhmut-non-codel,idhmut,methylated
2,TCGA-R8-A73M,TCGA-R8-A73M-01,oligodendroglioma,G2,idhmut-codel,idhmut,methylated
3,TCGA-QH-A6CV,TCGA-QH-A6CV-01,astrocytoma,G3,idhwt,idhwt,unmethylated
4,TCGA-TM-A7C5,TCGA-TM-A7C5-01,oligodendroglioma,G2,idhmut-codel,idhmut,methylated
...,...,...,...,...,...,...,...
613,TCGA-06-5414,TCGA-06-5414-01,glioblastoma,G4,idhwt,idhwt,unmethylated
614,TCGA-DB-5273,TCGA-DB-5273-01,astrocytoma,G3,idhmut-non-codel,idhmut,unmethylated
615,TCGA-06-0143,TCGA-06-0143-01,glioblastoma,G4,idhwt,idhwt,unmethylated
616,TCGA-27-1837,TCGA-27-1837-01,glioblastoma,G4,idhwt,idhwt,methylated


## Create train/val splits on TCGA, stratified by subtype

In [7]:
# Distinct values of the stratification label. Reads `train_df` (the table loaded
# from tcga_processed.xlsx) instead of `data_csv`, which no cell in this notebook defines.
set(train_df['2016_subtype'].values) # 3 subtypes

{'astrocytoma', 'glioblastoma', 'oligodendroglioma'}

In [9]:
# train
## Generate the train/validation splits with 5 different random seeds.
## One file is written per seed: data_split/tcga_split_5f/tcga_split_5f_s<seed index>.npy
## Each saved "splits" holds the 5 folds of a stratified 5-fold split;
## each fold is [train_slide_list, train_slide_label, val_slide_list, val_slide_label].
##
## `train_df` is the table loaded from tcga_processed.xlsx. It is used directly:
## the former `train_df = data_csv` alias referenced a name that no cell defines,
## so the cell failed with NameError on a fresh kernel.
random_seed = [2,3,5,7,11]
split_dir = 'data_split/tcga_split_5f/'
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(split_dir, exist_ok=True)

# NOTE(review): folds are drawn over rows keyed by 'patient ID'. If a patient can
# appear in more than one row, the same patient could land in both train and val --
# confirm the IDs are unique in the spreadsheet.
train_slide_list = train_df['patient ID'].values
train_label_list = train_df['2016_subtype'].values
print(f"train, ast:{sum(train_label_list=='astrocytoma')}, gbm:{sum(train_label_list=='glioblastoma')}, oli:{sum(train_label_list=='oligodendroglioma')}")

for s_idx, seed in enumerate(random_seed):
    ## begin split
    splits = []
    # shuffle=True is required for random_state to have any effect on the folds.
    skf = StratifiedKFold(n_splits=5,random_state=seed,shuffle=True)
    for train_idx,val_idx in skf.split(train_slide_list,train_label_list):
        train_x = train_slide_list[train_idx].tolist()
        train_y = train_label_list[train_idx].tolist()

        val_x = train_slide_list[val_idx].tolist()
        val_y = train_label_list[val_idx].tolist()

        splits.append([train_x,train_y,val_x,val_y])
    # The folds are ragged Python lists, hence the object dtype (stored pickled).
    np.save(f'{split_dir}tcga_split_5f_s{s_idx}.npy', np.array(splits, dtype=object))

train, ast:266, gbm:196, oli:156


In [18]:
# Reload the folds written for the first seed (index 0).
# allow_pickle=True is passed because the array holds Python objects;
# only load files you trust this way.
splits = np.load(
    file='data_split/tcga_split_5f/tcga_split_5f_s0.npy',
    allow_pickle=True,
)

In [19]:
# Unpack the first fold into its four parts.
(
    train_slides,
    train_labels,
    val_slides,
    val_labels,
) = splits[0]

In [20]:
# Display the training identifiers of the first fold (the full list is printed).
train_slides

['TCGA-E1-A7YI',
 'TCGA-QH-A6CV',
 'TCGA-TM-A7C5',
 'TCGA-41-4097',
 'TCGA-DU-7298',
 'TCGA-12-0826',
 'TCGA-06-6390',
 'TCGA-DU-6395',
 'TCGA-RR-A6KA',
 'TCGA-TM-A84B',
 'TCGA-14-0787',
 'TCGA-DH-A66B',
 'TCGA-HT-7687',
 'TCGA-HT-8010',
 'TCGA-DH-5141',
 'TCGA-HT-8105',
 'TCGA-HT-8018',
 'TCGA-RY-A847',
 'TCGA-TQ-A7RI',
 'TCGA-S9-A6TU',
 'TCGA-DB-A4X9',
 'TCGA-CS-6188',
 'TCGA-QH-A6X8',
 'TCGA-QH-A6X3',
 'TCGA-DB-A64R',
 'TCGA-19-1386',
 'TCGA-S9-A7R1',
 'TCGA-02-0024',
 'TCGA-FG-A6J3',
 'TCGA-HT-7854',
 'TCGA-06-0882',
 'TCGA-HT-7468',
 'TCGA-06-0139',
 'TCGA-DB-A64P',
 'TCGA-WY-A85D',
 'TCGA-DU-A6S7',
 'TCGA-12-1089',
 'TCGA-DU-A5TP',
 'TCGA-S9-A7QW',
 'TCGA-12-1602',
 'TCGA-DB-A75P',
 'TCGA-QH-A6CU',
 'TCGA-12-1598',
 'TCGA-S9-A6U6',
 'TCGA-19-5950',
 'TCGA-TM-A84F',
 'TCGA-27-2518',
 'TCGA-E1-A7Z6',
 'TCGA-06-1087',
 'TCGA-E1-A7Z4',
 'TCGA-DU-A6S8',
 'TCGA-HT-7483',
 'TCGA-RY-A845',
 'TCGA-02-0075',
 'TCGA-FG-A6IZ',
 'TCGA-DU-5872',
 'TCGA-HT-7467',
 'TCGA-HT-8114',
 'TCGA-S9-A6WG

In [21]:
# Display the training labels of the first fold (the full list is printed).
train_labels

['astrocytoma',
 'astrocytoma',
 'oligodendroglioma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'oligodendroglioma',
 'oligodendroglioma',
 'oligodendroglioma',
 'oligodendroglioma',
 'astrocytoma',
 'oligodendroglioma',
 'oligodendroglioma',
 'astrocytoma',
 'astrocytoma',
 'astrocytoma',
 'oligodendroglioma',
 'astrocytoma',
 'oligodendroglioma',
 'glioblastoma',
 'oligodendroglioma',
 'glioblastoma',
 'astrocytoma',
 'astrocytoma',
 'glioblastoma',
 'oligodendroglioma',
 'glioblastoma',
 'oligodendroglioma',
 'astrocytoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'oligodendroglioma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'glioblastoma',
 'astrocytoma',
 'oligodendroglioma',
 'astrocytoma',
 'astrocytoma',
 'glioblastoma',
 'oligodendroglioma',
 'astrocytoma'