In [89]:
import pandas as pd
import json
from sklearn.model_selection import StratifiedKFold, train_test_split

# Loading Tharun Thompson Data:

In [90]:
# Patient-level metadata table. The preview below shows the columns patient, diagnose,
# Slices, Resolution (20x/40x) and classification; it appears to hold one row per
# patient (TODO confirm patient ids are unique).
df_basic = pd.read_csv('data/classification.csv')
# Preview the first rows to check the file loaded with the expected columns.
df_basic.head()

Unnamed: 0,patient,diagnose,Slices,Resolution (20x/40x),classification
0,1,PTC,10,40,PTC-like
1,2,NIFTP,2,40,PTC-like
2,3,FA,10,40,non-PTC-like
3,4,PTC,10,40,PTC-like
4,5,FA,10,40,non-PTC-like


In [91]:
# Image-level table. The preview below shows several rows per patient, each carrying the
# patient metadata plus an image filename (img_lbl) and a numeric bi_class column.
# It is used later to map patient ids to their image filenames.
tt_df = pd.read_csv('data/tt_class_detailed.csv')
# Preview the first rows to check the file loaded with the expected columns.
tt_df.head()

Unnamed: 0,patient,diagnose,Slices,Resolution (20x/40x),classification,img_lbl,bi_class
0,1,PTC,10,40,PTC-like,1a.jpeg,1.0
1,1,PTC,10,40,PTC-like,1b.jpeg,1.0
2,1,PTC,10,40,PTC-like,1c.jpeg,1.0
3,1,PTC,10,40,PTC-like,1d.jpeg,1.0
4,1,PTC,10,40,PTC-like,1e.jpeg,1.0


# Stratified K-Fold:

In [92]:
# Passed as random_state to every StratifiedKFold in this notebook, so the shuffled
# splits are the same on each run.
seed = 42

In [114]:
# Nested, patient-level stratified cross-validation:
#   * outer loop: 5 folds  -> roughly 80% train+val / 20% test
#   * inner loop: 4 folds of the train+val part -> roughly 60% train / 20% val
# Patients are split first (stratified on `diagnose`) and only then expanded to
# image filenames, so images are assigned to a subset through their patient id.


def _image_names_for(patient_ids):
    """Return the `img_lbl` values in `tt_df` belonging to the given patient ids.

    Filenames are grouped in the order of `patient_ids`; within one patient they
    keep their row order in `tt_df`.
    """
    return [
        fname
        for pid in patient_ids
        for fname in tt_df.loc[tt_df['patient'] == pid, 'img_lbl']
    ]


outer_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for outer_split_idx, (train_val_index, test_index) in enumerate(outer_split.split(df_basic.patient, df_basic.diagnose)):
    print(f"Outer split Number: {outer_split_idx}")

    # split() yields positional indices, so rows are selected by position (iloc)
    # rather than by index label.
    test_patients = df_basic.iloc[test_index].patient.tolist()

    # Extract the filenames of the patients in the test set:
    test_fnames = _image_names_for(test_patients)

    # Patients left for train/validation; the index is reset so that the inner
    # split's positional indices line up with this frame.
    tv_patients = df_basic.iloc[train_val_index].reset_index(drop=True)

    # Dividing the train/val 80% split further into 60% train, 20% validation (using n_splits = 4):
    inner_split = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)

    for inner_split_idx, (train_index, val_index) in enumerate(inner_split.split(tv_patients.patient, tv_patients.diagnose)):
        print(f"---- Inner split Number: {inner_split_idx}")

        # Both subsets must be plain lists of patient ids. Iterating a DataFrame
        # yields its column labels, which would match no patient and leave the
        # training filename list empty.
        train_patients = tv_patients.iloc[train_index].patient.tolist()
        val_patients = tv_patients.iloc[val_index].patient.tolist()

        # Extract filenames for training and validation sets:
        train_fnames = _image_names_for(train_patients)
        val_fnames = _image_names_for(val_patients)

        # Save splits to .json file (disabled; uncomment to write the files):
        # save_loc = "data/cv_splits/cv_split_%s_%s.json" % (outer_split_idx, inner_split_idx)
        # split_patients = {"train":train_fnames, "val": val_fnames, "test": test_fnames}
        #
        # with open(save_loc, 'w', encoding="utf-8") as outfile:
        #     json.dump(split_patients, outfile, ensure_ascii=False, indent=2)
        

Outer split Number: 0
---- Inner split Number: 0
---- Inner split Number: 1
---- Inner split Number: 2
---- Inner split Number: 3
Outer split Number: 1
---- Inner split Number: 0
---- Inner split Number: 1
---- Inner split Number: 2
---- Inner split Number: 3
Outer split Number: 2
---- Inner split Number: 0
---- Inner split Number: 1
---- Inner split Number: 2
---- Inner split Number: 3
Outer split Number: 3
---- Inner split Number: 0
---- Inner split Number: 1
---- Inner split Number: 2
---- Inner split Number: 3
Outer split Number: 4
---- Inner split Number: 0
---- Inner split Number: 1
---- Inner split Number: 2
---- Inner split Number: 3


# Extracting Full Training and Validation Set:
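- For full training of the model we extract 80% training / 20% validation splits at the patient level: one split per fold of a 5-fold split stratified on diagnosis, with no separate test set held out.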

In [98]:
# Five stratified 80/20 patient-level splits (train / validation only, no test set).
data_split = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for split_idx, (train_index, val_index) in enumerate(
    data_split.split(df_basic.patient, df_basic.diagnose)
):
    print(f"Outer split Number: {split_idx}")

    # Map the fold's row indices back to patient ids.
    train_patients = df_basic.loc[df_basic.index.isin(train_index), 'patient'].tolist()
    print("---- Train Patients: ", len(train_patients))
    val_patients = df_basic.loc[df_basic.index.isin(val_index), 'patient'].tolist()
    print("---- Validation Patients: ", len(val_patients))

    # Expand each patient id to its image filenames (training and validation sets),
    # keeping the filenames grouped in patient order.
    train_fnames = [
        fname
        for pid in train_patients
        for fname in tt_df.loc[tt_df['patient'] == pid, 'img_lbl']
    ]
    val_fnames = [
        fname
        for pid in val_patients
        for fname in tt_df.loc[tt_df['patient'] == pid, 'img_lbl']
    ]

    # Save splits to .json file (disabled; uncomment to write the files):
    # save_loc = "data/full_data_splits/data_split_%s.json" % (split_idx)
    # split_patients = {"train":train_fnames, "val": val_fnames}
    #
    # with open(save_loc, 'w', encoding="utf-8") as outfile:
    #     json.dump(split_patients, outfile, ensure_ascii=False, indent=2)

Outer split Number: 0
---- Train Patients:  124
---- Validation Patients:  32
Outer split Number: 1
---- Train Patients:  125
---- Validation Patients:  31
Outer split Number: 2
---- Train Patients:  125
---- Validation Patients:  31
Outer split Number: 3
---- Train Patients:  125
---- Validation Patients:  31
Outer split Number: 4
---- Train Patients:  125
---- Validation Patients:  31
