In [1]:
import pandas as pd
import numpy as np
import pickle
import copy
import math
import os

In [2]:
def store_data(data, file_path):
    """Pickle `data` into the file at `file_path`.

    The file is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file_path, 'wb') as handle:
        pickle.dump(data, handle)

## Not doing cross validation
def get_frac_split(meta_df, matching_field, ind_column, num_folds=0, random_state=None):
    """Split the samples of each class into `num_folds` train/test folds.

    For every distinct value of `ind_column`, the distinct `matching_field`
    values of that class are shuffled and cut into `num_folds` consecutive
    chunks of ``len(samples) // num_folds`` items. Chunk ``i`` becomes the test
    set of fold ``i``; all other samples of the class go to that fold's train set.

    Notes:
        * Samples left over when the class size is not a multiple of
          `num_folds` are never placed in a test set; they stay in the train
          set of every fold.
        * With ``num_folds=1`` the single chunk covers every sample, so the
          train list is empty and everything ends up in 'test'.

    Args:
        meta_df: DataFrame holding at least `matching_field` and `ind_column`.
        matching_field: Column with the sample identifiers to distribute.
        ind_column: Column with the class indicator used for stratification.
        num_folds: Number of folds; must be >= 1.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle (and therefore the split) is reproducible.

    Returns:
        dict mapping fold index -> {'train': list, 'test': list}.

    Raises:
        ValueError: If `num_folds` is smaller than 1.
    """
    if num_folds < 1:
        raise ValueError('num_folds must be >= 1, got %s' % (num_folds,))

    # Shuffle rows; `sample` returns a new frame, so `meta_df` is left untouched.
    df = meta_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Get unique classes (sorted, so classes are processed in a stable order).
    unique_classes = np.unique(meta_df[ind_column])

    folds = {i: {'train': list(), 'test': list()} for i in range(num_folds)}

    for class_ in unique_classes:
        # Distinct samples of this class, kept in shuffled order. np.unique
        # would sort them again and silently undo the shuffle above.
        slides = df.loc[df[ind_column] == class_, matching_field].drop_duplicates().tolist()

        # Integer division: floor(n * (1 / k)) can under-count because of
        # floating point rounding (e.g. 49 * (1 / 49) < 1).
        test_size = len(slides) // num_folds

        # Iterate through chunks and add samples to fold.
        for i in range(num_folds):
            test_sample = slides[i*test_size:(i+1)*test_size]
            held_out = set(test_sample)
            # List filter instead of set difference keeps a deterministic order.
            train_sample = [slide for slide in slides if slide not in held_out]
            folds[i]['train'].extend(train_sample)
            folds[i]['test'].extend(test_sample)

    return folds

def get_folds(meta_df, matching_field, ind_column, num_folds=0, valid_set=False):
    """Build train/valid/test folds on top of `get_frac_split`.

    The outer call to `get_frac_split` yields a train/test split per fold.
    Each fold's train samples are then split again with the same `num_folds`,
    and fold 0 of that inner split supplies the final 'train' and 'valid'
    lists (inner 'test' becomes 'valid').

    `valid_set` is accepted but never read: a 'valid' list is always produced.

    Returns:
        dict mapping fold index -> {'test': list, 'train': list, 'valid': list}.
    """
    # Outer split: train/test per fold.
    folds = get_frac_split(meta_df, matching_field, ind_column, num_folds=num_folds)

    for fold_idx in range(num_folds):
        current = folds[fold_idx]

        # Restrict the metadata to rows whose sample is in this fold's train pool.
        in_train_pool = meta_df[matching_field].isin(current['train'])
        inner_split = get_frac_split(
            meta_df[in_train_pool], matching_field, ind_column, num_folds=num_folds
        )

        # Drop the outer train list, then re-add train/valid from inner fold 0.
        current.pop('train')
        current['train'] = inner_split[0]['train']
        current['valid'] = inner_split[0]['test']

    return folds

# Verify: This should all be empty.
def sanity_check_overlap(folds, num_folds):
    """Print any samples shared between splits that should be disjoint.

    Two checks are run over folds ``0 .. num_folds - 1``:
      1. Within each fold, train/valid, train/test and valid/test must not
         share samples; each non-empty intersection is printed as a set.
      2. Across folds, the test sets must not share samples; each overlapping
         pair of folds is printed once as ``Fold i-j <set>``.

    Nothing is printed when every check passes. Passing ``num_folds=0``
    checks nothing.

    Args:
        folds: Mapping fold index -> dict with 'train', 'valid' and 'test'.
        num_folds: Number of folds to inspect.
    """
    # For each fold, no overlap between its own splits.
    for i in range(num_folds):
        fold = folds[i]
        for first, second in (('train', 'valid'), ('train', 'test'), ('valid', 'test')):
            result = set(fold[first]).intersection(set(fold[second]))
            if len(result) > 0:
                print(result)

    # No overlap between test sets of all folds. This runs once, outside the
    # per-fold loop, and visits each unordered pair of folds a single time.
    for i in range(num_folds):
        for j in range(i + 1, num_folds):
            result = set(folds[i]['test']).intersection(set(folds[j]['test']))
            if len(result) > 0:
                print('Fold %s-%s' % (i, j), result)

# Fit for legacy code.
def fit_format(folds):
    """Convert fold lists into the tuple format expected by the legacy code.

    Every sample in the 'train', 'valid' and 'test' lists of each fold is
    wrapped as ``(sample, None, None)``.

    Args:
        folds: Either a mapping fold key -> {'train', 'valid', 'test'} or a
            sequence of such dicts (indexed by position).

    Returns:
        dict with the same fold keys (or positions, for a sequence), each
        mapping to {'train': [...], 'valid': [...], 'test': [...]} of
        3-tuples.

    Raises:
        KeyError: If a fold lacks one of 'train', 'valid' or 'test'.
    """
    # Use the mapping's own keys instead of assuming they are 0..n-1; fall
    # back to positional indices for plain sequences.
    fold_items = folds.items() if hasattr(folds, 'items') else enumerate(folds)

    slides_folds = dict()
    for key, fold in fold_items:
        slides_folds[key] = dict()
        for split in ('train', 'valid', 'test'):
            slides_folds[key][split] = [(slide, None, None) for slide in fold[split]]

    return slides_folds


### Modified JM Code ###

In [30]:

# Input: metadata CSV that is read into `meta_df` further down.
# NOTE(review): absolute, user-specific cluster path — adjust before running elsewhere.
meta_csv    = '/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/metadata/LUADLUSC_lungsubtype_overall_survival.csv'
# Output: destination of the pickled fold dictionary written by store_data.
pickle_path = '/gpfs/data/pmedlab/Users/mottej02/dl_project/pipeline/Histomorphological-Phenotype-Learning/results/VICReg_0/TCGAFFPE_LUADLUSC_5x_60pc_250K_fold_file.pkl'

# Read meta data file, rename column.
#meta_df  = pd.read_csv(meta_csv)
#print(meta_df.columns)

#cancer_type = meta_df['luad'].values
#del meta_df['luad']
#meta_df['luad'] = cancer_type

# Create mapping for cancer types and integers.
#mapping_cancers = dict(zip(np.unique(cancer_type), range(len(np.unique(cancer_type)))))

# Map new columns for integer indicator.
#meta_df['cancer_type_ind'] = meta_df['luad'].astype(str).map(mapping_cancers)

#print(meta_df.columns)

In [31]:
# Read the metadata CSV and show which columns it provides.
meta_df = pd.read_csv(meta_csv)
print(meta_df.columns)

#meta_df.drop('cancer_type_ind', axis=1, inplace=True)
#print(meta_df.columns)

# Add 'cancer_type_ind' as an integer copy of 'luad' (astype(int), not a
# string version); it is passed as `ind_column` when the folds are built.
meta_df['cancer_type_ind'] = meta_df['luad'].astype(int)

Index(['slides', 'participants', 'luad', 'os_event_ind', 'os_event_data'], dtype='object')


In [32]:
# Preview the first 100 rows to eyeball the new 'cancer_type_ind' column.
meta_df.head(100)

Unnamed: 0,slides,participants,luad,os_event_ind,os_event_data,cancer_type_ind
0,TCGA-05-4245-01Z-00-DX1,TCGA-05-4245,1,0.0,24.000000,1
1,TCGA-05-4250-01Z-00-DX1,TCGA-05-4250,1,1.0,3.978082,1
2,TCGA-05-4382-01Z-00-DX1,TCGA-05-4382,1,0.0,19.956164,1
3,TCGA-05-4396-01Z-00-DX1,TCGA-05-4396,1,1.0,9.961644,1
4,TCGA-05-4398-01Z-00-DX1,TCGA-05-4398,1,0.0,47.046575,1
...,...,...,...,...,...,...
95,TCGA-49-4494-01Z-00-DX1,TCGA-49-4494,1,1.0,35.539726,1
96,TCGA-49-4494-01Z-00-DX2,TCGA-49-4494,1,1.0,35.539726,1
97,TCGA-49-4494-01Z-00-DX3,TCGA-49-4494,1,1.0,35.539726,1
98,TCGA-49-4494-01Z-00-DX4,TCGA-49-4494,1,1.0,35.539726,1


In [34]:
# Count missing class labels before they are used for stratification.
missing_count = meta_df['luad'].isna().sum()
print(f"Number of missing values in 'luad': {missing_count}")

# Compare the raw label with the derived integer indicator.
print(meta_df[['luad','cancer_type_ind']].head())

# Build the label -> indicator mapping from the columns that actually exist.
# The only other assignment of `mapping_cancers` is commented out, so
# referencing it directly fails with NameError on a fresh kernel.
label_pairs = meta_df[['luad', 'cancer_type_ind']].drop_duplicates().sort_values('luad')
mapping_cancers = dict(zip(label_pairs['luad'].tolist(), label_pairs['cancer_type_ind'].tolist()))
print("Mapping dictionary:", mapping_cancers)

Number of missing values in 'luad': 0
   luad  cancer_type_ind
0     1                1
1     1                1
2     1                1
3     1                1
4     1                1
Mapping dictionary: {0: 0, 1: 1}


In [37]:
# One fold count shared by the split and the overlap check, so the check
# always inspects exactly the folds that were built.
# NOTE(review): with num_folds=1 the single test chunk in get_frac_split spans
# every slide of each class, so 'train' and 'valid' come back empty and all
# slides land in 'test' — confirm this is the intended layout.
num_folds   = 1
folds       = get_folds(meta_df, matching_field='slides', ind_column='cancer_type_ind', num_folds=num_folds, valid_set=False)
final_folds = fit_format(folds)

# If no output, all good. (Passing num_folds=0 here would check nothing.)
sanity_check_overlap(folds, num_folds=num_folds)

store_data(final_folds, pickle_path)