In [1]:
import dill

# Load the two annotation lookup tables (name -> id and id -> name, judging by the file names).
# NOTE(review): dill.load can execute arbitrary code while unpickling; only load
# annotation files that come from a trusted source.
# The `with` blocks close each file handle deterministically instead of leaving
# the open handle for the garbage collector.
with open("./../annotation/name_to_id.dill", "rb") as f:
    name_to_id = dill.load(f)
with open("./../annotation/id_to_name.dill", "rb") as f:
    id_to_name = dill.load(f)

from sklearn.model_selection import StratifiedGroupKFold
import random
import numpy as np
import pyarrow.parquet as pq
import pandas as pd

# Read the M-value matrix through pyarrow, which is faster than the plain pandas reader.
file_path = './../data/GEO/preprocessed/training_Mvalues.parquet'
table = pq.read_table(file_path)

# Index the matrix by probe and keep only the part of every column label that
# precedes the first underscore.
Mv = table.to_pandas()
Mv = Mv.set_index('probe')
Mv.columns = Mv.columns.str.split('_').str[0]

# Sample annotation, one row per sample, indexed by the 'Sample' column.
meta = pd.read_csv(
    './../annotation/training_meta.csv',
    header=0,
    index_col='Sample',
)

# Put the datasets into a reproducible random order (fixed seed).
random.seed(9)
groups = meta['Dataset'].unique()
random.shuffle(groups)

# Reorder the samples so that rows of the same dataset sit together, following
# the shuffled dataset order, then align the transposed matrix to that order.
meta = (
    meta.reset_index()
    .rename(columns={'index': 'Sample'})
    .set_index("Dataset")
    .loc[groups]
    .reset_index()
    .set_index("Sample")
)
Mv = Mv.T.loc[meta.index]

print(Mv.shape, meta.shape)

# Persist the aligned matrix and annotation together.
with open('./../data/GEO/preprocessed/training.dill', 'wb') as f:
    dill.dump([Mv, meta], f)

(10351, 297598) (10351, 10)


In [2]:
def _subset_for_system(system_name):
    """Select the samples whose 'training.ID' is listed under ``system_name``.

    Looks the name up in ``name_to_id``, filters ``meta`` on it, takes the
    matching rows of ``Mv``, prints both shapes and returns
    ``(M-value rows, meta rows)``.
    """
    system_meta = meta[meta['training.ID'].isin(name_to_id[system_name])]
    system_Mv = Mv.loc[system_meta.index]
    print(system_Mv.shape, system_meta.shape)
    return system_Mv, system_meta


excretory_Mv, excretory_meta = _subset_for_system('excretory system')

sensory_Mv, sensory_meta = _subset_for_system('sensory system')

cardio_Mv, cardio_meta = _subset_for_system('cardiovascular system')

nervous_Mv, nervous_meta = _subset_for_system('nervous system')

(6, 297598) (6, 10)
(4, 297598) (4, 10)
(10, 297598) (10, 10)
(42, 297598) (42, 10)


In [3]:
# Withhold the excretory- and sensory-system samples from the automatic split.
in_withheld_Mv = Mv.index.isin(excretory_Mv.index) | Mv.index.isin(sensory_Mv.index)
in_withheld_meta = meta.index.isin(excretory_meta.index) | meta.index.isin(sensory_meta.index)
Mv = Mv[~in_withheld_Mv]
meta = meta[~in_withheld_meta]

# Shuffling stays off: with shuffle=True (random_state=9) some folds ended up
# with no training samples for some labels.
sgkf = StratifiedGroupKFold(
    n_splits=3,
    shuffle=False,
    random_state=None,
)
sgkf.get_n_splits(Mv, meta['training.ID'], meta['Dataset'])
print(sgkf)

StratifiedGroupKFold(n_splits=3, random_state=None, shuffle=False)


In [4]:
# Build every fold as a list:
#   [train M-values, train meta, validation M-values, validation meta]
# stratified on 'training.ID' and grouped by 'Dataset'.
fold_Mvs = dict()
fold_selectors = dict()  # never filled in this cell; kept in case other cells read it

for i, (train_index, test_index) in enumerate(sgkf.split(Mv, meta['training.ID'], meta['Dataset'])):
    # split() yields integer positions while Mv/meta are indexed by sample
    # label, so every lookup goes through .iloc. Indexing the Series with
    # meta['Dataset'][train_index] relied on the deprecated positional
    # fallback of Series.__getitem__.
    rest_Mv = Mv.iloc[train_index]
    rest_meta = meta.iloc[train_index]
    holdout_Mv = Mv.iloc[test_index]
    holdout_meta = meta.iloc[test_index]

    print(f"fold {i}:")
    print(f"\ttrain: len={len(train_index)}, groups={len(rest_meta['Dataset'].unique())}")
    print(f"\tvalidation:  len={len(test_index)}, groups={len(holdout_meta['Dataset'].unique())}")
    print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")

    fold_Mvs[i] = [rest_Mv, rest_meta, holdout_Mv, holdout_meta]

# Sanity check: no dataset may appear on both sides of a fold (an empty set is expected).
print('\n...print if any overlap GSE between training and validation...')
for fold, fold_data in fold_Mvs.items():
    print(f"fold{fold}: {set(fold_data[1]['Dataset']).intersection(fold_data[3]['Dataset'])}")

fold 0:
	train: len=6778, groups=140
	validation:  len=3563, groups=68
	train and validation shapes: ((6778, 297598), (3563, 297598))
fold 1:
	train: len=6980, groups=143
	validation:  len=3361, groups=65
	train and validation shapes: ((6980, 297598), (3361, 297598))
fold 2:
	train: len=6924, groups=133
	validation:  len=3417, groups=75
	train and validation shapes: ((6924, 297598), (3417, 297598))

...print if any overlap GSE between training and validation...
fold0: set()
fold1: set()
fold2: set()


In [7]:
print('print cardio series distribution across folds, add excretory system to that with one')
# Number of distinct datasets with cardiovascular-system samples on each side of every fold.
cardio_ids = name_to_id['cardiovascular system']
for fold, fold_data in fold_Mvs.items():
    for side_label, side_meta in (("training", fold_data[1]), ("holdout", fold_data[3])):
        is_cardio = side_meta['training.ID'].isin(cardio_ids)
        n_series = side_meta.loc[is_cardio, 'Dataset'].nunique()
        print(f"fold{fold}: {side_label} {n_series}")

print cardio series distribution across folds, add excretory system to that with one
fold0: training 2
fold0: holdout 0
fold1: training 1
fold1: holdout 1
fold2: training 1
fold2: holdout 1


In [9]:
print('print kidney series distribution across folds, add excretory system to that with one')
# Number of distinct datasets with kidney samples: training side first, then validation side.
kidney_ids = name_to_id['kidney']
for fold, fold_data in fold_Mvs.items():
    for side_meta in (fold_data[1], fold_data[3]):
        is_kidney = side_meta['training.ID'].isin(kidney_ids)
        n_series = side_meta.loc[is_kidney, 'Dataset'].nunique()
        print(f"fold{fold} {n_series}")

print kidney series distribution across folds, add excretory system to that with one
fold0 2
fold0 2
fold1 3
fold1 1
fold2 3
fold2 1


In [10]:
# Place the excretory-system samples by hand: training side of folds 0 and 1,
# validation side of fold 2.
# Slots of each fold list: 0 = train M-values, 1 = train meta,
#                          2 = validation M-values, 3 = validation meta.
for fold_id, Mv_slot, meta_slot in ((0, 0, 1), (1, 0, 1), (2, 2, 3)):
    for slot, extra in ((Mv_slot, excretory_Mv), (meta_slot, excretory_meta)):
        current = fold_Mvs[fold_id][slot]
        # Append only the rows that are not in the fold yet, so re-running this
        # cell cannot duplicate samples in the folds.
        missing = extra[~extra.index.isin(current.index)]
        if len(missing):
            fold_Mvs[fold_id][slot] = pd.concat([current, missing])

In [11]:
# Report the train/validation shapes of every fold and the combined sample count.
for i, (rest_Mv, rest_meta, holdout_Mv, holdout_meta) in fold_Mvs.items():
    n_in_fold = len(rest_Mv) + len(holdout_Mv)
    print(f"fold {i}:")
    print(f"\ttrain and validation shapes: {rest_Mv.shape, holdout_Mv.shape}")
    print(f"\ttotal number of samples in fold: {n_in_fold}")

fold 0:
	train and validation shapes: ((6784, 297598), (3563, 297598))
	total number of samples in fold: 10347
fold 1:
	train and validation shapes: ((6986, 297598), (3361, 297598))
	total number of samples in fold: 10347
fold 2:
	train and validation shapes: ((6924, 297598), (3423, 297598))
	total number of samples in fold: 10347


In [12]:
print('print nasal cavity epithelium series distribution across folds, add sensory system to that with one')
# Number of distinct datasets with nasal cavity epithelium samples: training side first, then validation side.
nasal_ids = name_to_id['nasal cavity epithelium']
for fold, fold_data in fold_Mvs.items():
    for side_meta in (fold_data[1], fold_data[3]):
        is_nasal = side_meta['training.ID'].isin(nasal_ids)
        n_series = side_meta.loc[is_nasal, 'Dataset'].nunique()
        print(f"fold{fold}: {n_series}")

print nasal cavity epithelium series distribution across folds, add sensory system to that with one
fold0: 2
fold0: 2
fold1: 3
fold1: 1
fold2: 3
fold2: 1


In [13]:
# Place the sensory-system samples by hand: training side of folds 0 and 1,
# validation side of fold 2.
# Slots of each fold list: 0 = train M-values, 1 = train meta,
#                          2 = validation M-values, 3 = validation meta.
for fold_id, Mv_slot, meta_slot in ((0, 0, 1), (1, 0, 1), (2, 2, 3)):
    for slot, extra in ((Mv_slot, sensory_Mv), (meta_slot, sensory_meta)):
        current = fold_Mvs[fold_id][slot]
        # Append only the rows that are not in the fold yet, so re-running this
        # cell cannot duplicate samples in the folds.
        missing = extra[~extra.index.isin(current.index)]
        if len(missing):
            fold_Mvs[fold_id][slot] = pd.concat([current, missing])

In [15]:
# Re-check the fold sizes: shapes of both sides plus their combined sample count.
for i, (rest_Mv, rest_meta, holdout_Mv, holdout_meta) in fold_Mvs.items():
    shapes = (rest_Mv.shape, holdout_Mv.shape)
    total = shapes[0][0] + shapes[1][0]
    print(f"fold {i}:")
    print(f"\ttrain and validation shapes: {shapes}")
    print(f"\ttotal number of samples in fold: {total}")

fold 0:
	train and validation shapes: ((6788, 297598), (3563, 297598))
	total number of samples in fold: 10351
fold 1:
	train and validation shapes: ((6990, 297598), (3361, 297598))
	total number of samples in fold: 10351
fold 2:
	train and validation shapes: ((6924, 297598), (3427, 297598))
	total number of samples in fold: 10351


In [16]:
# Stack the validation meta of the three folds and show any sample label that
# occurs more than once.
all_holdout = pd.concat([fold_Mvs[fold_id][3] for fold_id in (0, 1, 2)])
is_repeated = all_holdout.index.duplicated()
print(f"total number of holdout: {len(all_holdout)}")
print(f"duplicates?: {all_holdout[is_repeated]}")

total number of holdout: 10351
duplicates?: Empty DataFrame
Columns: [Dataset, Unnamed: 0, Annotated.tissue, UBERON.ID, UBERON.Name, Display.Name, merged.ID, training.ID, File, FileSeries]
Index: []


In [17]:
# Write the complete fold dictionary to disk.
with open('./../data/GEO/preprocessed/training_folds.dill', 'wb') as f:
    dill.dump(fold_Mvs, f)