In [1]:
import pathlib
from natsort import natsorted
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt

### 1. Get HPO-annotated image names (N=171)

In [2]:
hpo_terms_from_csv_files = []
image_names = []
for syndrome in ['22q11DS', 'Angelman', 'KS', 'NS', 'WS']:
    df = pd.read_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata', 'annotated-hpo-terms', '%s.csv'%syndrome))
    hpo_terms_from_csv_files += df.keys()[2:-1].to_list()
    image_names += df['image_name'].to_list()

df = pd.read_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata', 'facedet.csv'))

print('I do not have these images in the database:')
missing_images = [i for i in image_names if i not in df['image_name'].to_list()]
print(missing_images)

I do not have these images in the database:
['22q11DSSlide150.png', 'KSSlide133.png', 'NSSlide6.png', 'WSSlide316.png']


### 2. Seperate these samples from the main data frame

In [3]:
testset_images = [image_name for image_name in image_names if image_name in df['image_name'].to_list()]
testset_indices = [idx for idx, image_name in enumerate(df['image_name'].to_list()) if image_name in testset_images]

print('==> ', df.shape)

# HPO-annotated 171 images
df_testset = df.iloc[testset_indices,:].reset_index(drop=True)
n_testset = df_testset.shape[0]

df = df.drop(index=testset_indices).reset_index(drop=True)
print('==> ', df.shape)

==>  (3544, 16)
==>  (3373, 16)


### 3. Get names of related images (any kinship relation--to be used to create partitions in cross-validation)

In [4]:
labels_txt = [f.split('Slide')[0] for f in df['image_name']]
categories = np.unique(labels_txt)
labels = np.array([np.argwhere(categories==labels_txt[i]).squeeze() for i in range(0, len(labels_txt))])
image_names = np.array([f.replace('.png','') for f in df['image_name']])

file1 = open(pathlib.Path(pathlib.Path.cwd(), 'metadata','related_samples.csv').as_posix(), 'r')
lines = file1.readlines()
related_samples = []
for i in range(1, len(lines)):
    related_samples.append(lines[i].replace('\n','').replace(' ','').replace('\'','').split(','))  


# image_names
# labels
# groups
groups = -np.ones(shape=(len(image_names),))

for idx, sample in enumerate(related_samples):
    
    for item in sample:
        #print(idx, sample, item)
        ii = np.argwhere(image_names==item)
        groups[ii] = idx
        
j = np.max(groups) + 1
for idx in range(0, groups.shape[0]):
    if groups[idx]==-1:
        groups[idx] = j
        j += 1

### 4. Create 5-folds (StratifiedGroupKFold)

In [5]:
from sklearn.model_selection import StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5, random_state=42, shuffle=True)
sgkf.get_n_splits(image_names, labels, groups)

folds = []

for i, (train_index, test_index) in enumerate(sgkf.split(image_names, labels, groups)):
    
    print(i, len(train_index), len(test_index))
    print( [round(sum(labels[train_index]==i)/labels[train_index].size, 3) for i in range(0, 11)] )
    print( [round(sum(labels[test_index]==i)/labels[test_index].size, 3) for i in range(0, 11)] )
    print('\n')
    
    fold = []
    for j in range(0, image_names.shape[0]):
        if j in list(train_index):
            fold.append('train')
        elif j in list(test_index):
            fold.append('val')

    folds.append(fold)
    
df = pd.concat([df, pd.DataFrame({'fold-1':folds[0], 'fold-2':folds[1], 'fold-3':folds[2], 'fold-4':folds[3], 'fold-5':folds[4]})] , axis=1)

0 2698 675
[0.166, 0.125, 0.091, 0.036, 0.105, 0.063, 0.086, 0.031, 0.032, 0.069, 0.052]
[0.161, 0.124, 0.093, 0.034, 0.104, 0.062, 0.089, 0.031, 0.03, 0.064, 0.056]


1 2701 672
[0.165, 0.126, 0.091, 0.036, 0.104, 0.063, 0.087, 0.031, 0.031, 0.067, 0.053]
[0.164, 0.121, 0.094, 0.036, 0.106, 0.061, 0.088, 0.03, 0.031, 0.071, 0.051]


2 2696 677
[0.165, 0.124, 0.091, 0.036, 0.104, 0.063, 0.088, 0.031, 0.032, 0.068, 0.052]
[0.165, 0.126, 0.093, 0.035, 0.106, 0.064, 0.084, 0.031, 0.028, 0.068, 0.056]


3 2701 672
[0.164, 0.124, 0.092, 0.036, 0.104, 0.063, 0.087, 0.031, 0.031, 0.068, 0.053]
[0.168, 0.126, 0.089, 0.034, 0.104, 0.062, 0.085, 0.031, 0.033, 0.067, 0.051]


4 2696 677
[0.165, 0.124, 0.092, 0.035, 0.105, 0.062, 0.086, 0.031, 0.03, 0.068, 0.053]
[0.167, 0.126, 0.087, 0.038, 0.102, 0.065, 0.089, 0.031, 0.034, 0.068, 0.05]




### 5 .Combine both data frames

In [6]:
df_testset = pd.concat([df_testset, pd.DataFrame({'fold-1':n_testset*['test'], 
                                                  'fold-2':n_testset*['test'], 
                                                  'fold-3':n_testset*['test'], 
                                                  'fold-4':n_testset*['test'], 
                                                  'fold-5':n_testset*['test']})] , axis=1)

df = pd.concat([df, df_testset], axis=0).reset_index(drop=True)
df.to_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata', 'partitions.csv'), index=False)

### 6. Check class distribution across folds (in stratified cross validation, we should preserve the original class distribution)

In [7]:
categories = ['22q11DS', 'Angelman', 'BWS', 'CdLS', 'Down', 'KS', 'NS', 'PWS', 'RSTS1', 'Unaffected', 'WHS', 'WS']
print(categories)

for fold in ['fold-1','fold-2','fold-3','fold-4','fold-5']:
    df = pd.read_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata','partitions.csv'))
    df = df[df[fold]=='train'].reset_index(drop=True)
    
    labels = [f.split('Slide')[0] for f in list(df['image_name']) ]
    labels = np.array([np.argwhere(np.array(categories)==f).squeeze() for f in labels])
    print([round(np.sum(labels==i)/len(labels), 3) for i in range(0, 12)], '\t', len(labels))

df = pd.read_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata','partitions.csv'))
labels = [f.split('Slide')[0] for f in list(df['image_name']) ]
labels = np.array([np.argwhere(np.array(categories)==f).squeeze() for f in labels])
print('')
print([round(np.sum(labels==i)/len(labels), 3) for i in range(0, 12)], '\t', len(labels))


df = pd.read_csv(pathlib.Path(pathlib.Path.cwd(), 'metadata','partitions.csv'))
df = df[df['fold-1']=='test'].reset_index(drop=True)
labels = [f.split('Slide')[0] for f in list(df['image_name']) ]
labels = np.array([np.argwhere(np.array(categories)==f).squeeze() for f in labels])
print('')
print([round(np.sum(labels==i)/len(labels), 3) for i in range(0, 12)], '\t', len(labels))

['22q11DS', 'Angelman', 'BWS', 'CdLS', 'Down', 'KS', 'NS', 'PWS', 'RSTS1', 'Unaffected', 'WHS', 'WS']
[0.166, 0.125, 0.091, 0.036, 0.105, 0.063, 0.086, 0.031, 0.032, 0.069, 0.052, 0.146] 	 2698
[0.165, 0.126, 0.091, 0.036, 0.104, 0.063, 0.087, 0.031, 0.031, 0.067, 0.053, 0.147] 	 2701
[0.165, 0.124, 0.091, 0.036, 0.104, 0.063, 0.088, 0.031, 0.032, 0.068, 0.052, 0.148] 	 2696
[0.164, 0.124, 0.092, 0.036, 0.104, 0.063, 0.087, 0.031, 0.031, 0.068, 0.053, 0.147] 	 2701
[0.165, 0.124, 0.092, 0.035, 0.105, 0.062, 0.086, 0.031, 0.03, 0.068, 0.053, 0.148] 	 2696

[0.167, 0.129, 0.087, 0.034, 0.099, 0.069, 0.092, 0.029, 0.03, 0.064, 0.05, 0.149] 	 3544

[0.199, 0.211, 0.0, 0.0, 0.0, 0.199, 0.199, 0.0, 0.0, 0.0, 0.0, 0.193] 	 171
