In [None]:
import numpy as np
import math
import os
from tqdm.notebook import tqdm
import shutil
import pandas as pd
import glob
import xlsxwriter
import random
from matplotlib import pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

## For generating the five test set configurations

In [None]:
# Clinical sheet: only the ID and the two variables used for stratification are kept.
df_clinical = pd.read_excel('/workspace/list_of_patients_with_clinical_information.xlsx', sheet_name=0)
df_clinical = df_clinical[['PET_ID', 'Sex', 'Symptom_Onset_Age']]

source_path = '/workspace/folder_containing_all_images_and_masks'
target_path = '/workspace/folder_to_split_images_and_masks_per_fold'

# Split into test and train using 5 folds
# there are two files (one for the image and another for the mask) per patient,
# so after sorting, every second entry is kept to get one file name per patient.
# NOTE(review): this assumes the folder holds exactly two files per patient and
# nothing else; a stray or missing file shifts every later pair - confirm.
total_list = sorted(os.listdir(source_path))[0::2]

# Drop the last 11 characters of each kept file name.
# NOTE(review): 11 == len('_img.nii.gz'), presumably the image suffix - confirm.
X = [s[:-11] for s in total_list]

# Binary label taken from the file name: '_no_' -> 0, '_yes_' -> 1.
# An unlabelled file must stop the run: skipping it would leave Y shorter than X
# and pair every later label with the wrong patient.
Y = []
for file_name in total_list:
    if '_no_' in file_name:
        Y.append(0)
    elif '_yes_' in file_name:
        Y.append(1)
    else:
        raise ValueError(
            f"Cannot derive a label from {file_name!r}: "
            "expected '_no_' or '_yes_' in the file name"
        )

# PET_ID = first two underscore-separated tokens of the file stem.
X_sub = [x.split('_')[0]+'_'+x.split('_')[1] for x in X]


# Create a combined stratification array
clinical_dict = df_clinical.set_index('PET_ID').to_dict()
sex_dict = clinical_dict['Sex']
age_dict = clinical_dict['Symptom_Onset_Age']

# Report every PET_ID that has no clinical row at once instead of failing on the first.
missing_ids = sorted(set(X_sub) - set(sex_dict))
if missing_ids:
    raise KeyError(f"PET_IDs without clinical information: {missing_ids}")

sex_values = [sex_dict[pet_id] for pet_id in X_sub]
age_values = [age_dict[pet_id] for pet_id in X_sub]


# Bin the age values to reduce the number of unique age values
age_bins = pd.cut(age_values, bins=10, labels=False)

# One stratum per (label, sex, age bin) combination.
stratification_labels = [f"{label}_{sex}_{age}" for label, sex, age in zip(Y, sex_values, age_bins)]


# Perform the split: five stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# Every file in the source folder is a candidate for copying.
subj_list = os.listdir(source_path)

for fold_no, (train_index, test_index) in enumerate(skf.split(X, stratification_labels)):
    print(f"Fold {fold_no}:")
    print(f"  Train: index={train_index}")
    print(f"  Test:  index={test_index}")

    # Create Fold_<n>/ with its train/ and test/ sub-folders before copying anything.
    # os.mkdir raises if a folder already exists, so an old split is never
    # silently mixed with a new one.
    fold_dir = os.path.join(target_path, f'Fold_{fold_no}')
    train_dir = os.path.join(fold_dir, 'train')
    test_dir = os.path.join(fold_dir, 'test')
    for new_dir in (fold_dir, train_dir, test_dir):
        os.mkdir(new_dir)

    # Copy every file whose name contains the subject stem into the matching split.
    # NOTE(review): this is a substring match, so it assumes no subject stem occurs
    # inside another subject's file name - confirm against the naming scheme.
    for split_dir, split_index in ((train_dir, train_index), (test_dir, test_index)):
        for subject_pos in split_index:
            subject_stem = X[subject_pos]
            for file_name in subj_list:
                if subject_stem in file_name:
                    shutil.copy(os.path.join(source_path, file_name),
                                os.path.join(split_dir, file_name))

                
## check if the splitting went well
# Counts the '_no_' / '_yes_' files in each outer fold. Every patient contributes
# two files (image and mask), so file counts are halved to report patients.
for x in range(5):
    train_yes, train_no, test_yes, test_no = 0, 0, 0, 0

    fold_dir = os.path.join(target_path, f'Fold_{x}')
    train_list = os.listdir(os.path.join(fold_dir, 'train'))
    test_list = os.listdir(os.path.join(fold_dir, 'test'))

    for file_name in train_list:
        if '_no_' in file_name:
            train_no += 1
        elif '_yes_' in file_name:
            train_yes += 1

    for file_name in test_list:
        if '_no_' in file_name:
            test_no += 1
        elif '_yes_' in file_name:
            test_yes += 1

    print(f"Fold_{x}")
    print('train_yes: ', train_yes/2, ' train_no: ', train_no/2)
    # These are the held-out test counts; they were previously printed as "valid_*".
    print('test_yes: ', test_yes/2, ' test_no: ', test_no/2)

## For generating five folds per test set, resulting in 5 x 5 = 25 folds

In [None]:
# One shuffling seed per outer fold. The seeds are drawn from a dedicated RNG with
# a fixed seed so that the 25 folds are identical after every
# "Restart Kernel -> Run All"; sampling the unseeded global `random` state gave a
# different split on each run.
INNER_SEED_SOURCE = 42
random_seed = random.Random(INNER_SEED_SOURCE).sample(range(0, 1024), 5)
print('I am the random seeds: ', random_seed)

outer_fold_list = ['Fold_0', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4']

# The clinical look-up tables do not depend on the fold, so they are built once.
clinical_dict = df_clinical.set_index('PET_ID').to_dict()
sex_dict = clinical_dict['Sex']
age_dict = clinical_dict['Symptom_Onset_Age']

## for each fold or subset from above, split them again into five folds for five-fold cross-validation for each test set
## please refer to Figure 6 from the published paper
for current_fold, current_seed in zip(outer_fold_list, random_seed):

    # Only the training part of the outer fold is split again.
    original_path = os.path.join(target_path, current_fold, 'train')

    # Two files per patient: after sorting, every second entry gives one name per patient.
    # NOTE(review): assumes exactly two files per patient and nothing else in the folder - confirm.
    total_list = sorted(os.listdir(original_path))[0::2]

    # Drop the last 11 characters of each kept file name.
    # NOTE(review): 11 == len('_img.nii.gz'), presumably the image suffix - confirm.
    X = [s[:-11] for s in total_list]

    # Binary label from the file name. An unlabelled file must stop the run:
    # skipping it would leave Y shorter than X and misalign every later label.
    Y = []
    for file_name in total_list:
        if '_no_' in file_name:
            Y.append(0)
        elif '_yes_' in file_name:
            Y.append(1)
        else:
            raise ValueError(
                f"Cannot derive a label from {file_name!r}: "
                "expected '_no_' or '_yes_' in the file name"
            )

    # PET_ID = first two underscore-separated tokens of the file stem.
    X_sub = [x.split('_')[0]+'_'+x.split('_')[1] for x in X]

    # Create a combined stratification array
    sex_values = [sex_dict[pet_id] for pet_id in X_sub]
    age_values = [age_dict[pet_id] for pet_id in X_sub]

    # Bin the age values to reduce the number of unique age values
    age_bins = pd.cut(age_values, bins=10, labels=False)
    stratification_labels = [f"{label}_{sex}_{age}" for label, sex, age in zip(Y, sex_values, age_bins)]

    # Perform the split
    skf = StratifiedKFold(n_splits=5, random_state=current_seed, shuffle=True)
    print(skf)

    subj_list = os.listdir(original_path)

    for i, (train_index, valid_index) in enumerate(skf.split(X, stratification_labels)):
        print(f"Fold {i}:")
        print(f"  Train: index={train_index}")
        print(f"  Valid:  index={valid_index}")

        # os.mkdir raises if a folder already exists, so an old split is never
        # silently mixed with a new one.
        inner_dir = os.path.join(target_path, current_fold, f'Fold_{i}')
        inner_train_dir = os.path.join(inner_dir, 'train')
        inner_valid_dir = os.path.join(inner_dir, 'valid')
        for new_dir in (inner_dir, inner_train_dir, inner_valid_dir):
            os.mkdir(new_dir)

        # Copy every file whose name contains the subject stem into the matching split.
        # NOTE(review): substring match - assumes no subject stem occurs inside
        # another subject's file name; confirm against the naming scheme.
        for split_dir, split_index in ((inner_train_dir, train_index), (inner_valid_dir, valid_index)):
            for subject_pos in split_index:
                for s in subj_list:
                    if X[subject_pos] in s:
                        shutil.copy(os.path.join(original_path, s),
                                    os.path.join(split_dir, s))

    ## Let's check if everything went fine
    # File counts are halved because each patient contributes two files.
    for x in range(5):
        train_yes, train_no, valid_yes, valid_no = 0, 0, 0, 0

        # Named *_files so the list of outer folds being iterated is not shadowed.
        train_files = os.listdir(os.path.join(target_path, current_fold, f'Fold_{x}', 'train'))
        valid_files = os.listdir(os.path.join(target_path, current_fold, f'Fold_{x}', 'valid'))

        for y in train_files:
            if '_no_' in y:
                train_no += 1
            elif '_yes_' in y:
                train_yes += 1

        for z in valid_files:
            if '_no_' in z:
                valid_no += 1
            elif '_yes_' in z:
                valid_yes += 1

        print(f"Fold_{x}")
        print('train_yes: ', train_yes/2, ' train_no: ', train_no/2)
        print('valid_yes: ', valid_yes/2, ' valid_no: ', valid_no/2)
    
    

## For preparing the clinical variables data

In [None]:
## sex variable was binarized into 0 or 1
def getTF(x):
    """Binarize the sex code.

    Args:
        x: sex code, expected to be 'M' or 'F'.

    Returns:
        0.0 for 'M', 1.0 for 'F'.

    Raises:
        ValueError: if x is neither 'M' nor 'F' (this includes missing values).
    """
    if x == 'M':
        return 0.0
    if x == 'F':
        return 1.0
    # A bare `raise` with no active exception only yields
    # "RuntimeError: No active exception to reraise"; name the bad value instead.
    raise ValueError(f"Unexpected sex value {x!r}; expected 'M' or 'F'")

In [None]:
df_cli = pd.read_excel('/workspace/list_of_patients_with_clinical_information.xlsx', sheet_name=0)

# Keep only the columns used below. The explicit .copy() makes df_cli an independent
# frame *before* any column is assigned, so the assignments cannot act on a view.
df_cli = df_cli[['PET_ID', 'Sex', 'Symptom_Onset_Age', 'Symptom_Onset_Age_60', 'Gap_in_3months', 'HY_rnd',
                 'RT_RA', 'RT_LA', 'RT_RL', 'RT_LL', 'FT_R', 'FT_L', 'LA_R', 'LA_L', 'RG_RA', 'RG_LA']].copy()

# Rows with at least one missing value, captured before imputation.
df_with_na = df_cli[df_cli.isna().any(axis=1)]

target_list = ['HY_rnd', 'RT_RA', 'RT_LA', 'RT_RL', 'RT_LL', 'FT_R', 'FT_L', 'LA_R', 'LA_L', 'RG_RA', 'RG_LA']

## impute the missing values with the median value
# NOTE(review): the median (and the min/max used for scaling further down) is computed
# over every patient in the sheet, i.e. before any train/valid/test split - confirm
# this is intended.
for var in target_list:
    df_cli[var] = df_cli[var].fillna(df_cli[var].median())

# The same rows after imputation, for inspection. df_with_na.index holds index
# *labels*, so the look-up is label-based (.loc) instead of positional (.iloc).
df_cli_filled_na = df_cli.loc[df_with_na.index, target_list]


# Per-domain sums of the item columns.
df_cli['Tremor_total'] = df_cli['RT_RA'] + df_cli['RT_LA'] + df_cli['RT_RL'] + df_cli['RT_LL']
df_cli['Brady_total'] = df_cli['FT_R'] + df_cli['FT_L'] + df_cli['LA_R'] + df_cli['LA_L']
df_cli['Rigid_total'] = df_cli['RG_RA'] + df_cli['RG_LA']


cont_list = ['Symptom_Onset_Age', 'Symptom_Onset_Age_60', 'Gap_in_3months', 'HY_rnd', 'Tremor_total', 'Brady_total', 'Rigid_total']
up_list = ['RT_RA', 'RT_LA', 'RT_RL', 'RT_LL', 'FT_R', 'FT_L', 'LA_R', 'LA_L', 'RG_RA', 'RG_LA']
cat_list = ['Sex']
id_list = ['PET_ID']

## sex to either 0 or 1
df_cli['Sex_Z'] = df_cli['Sex'].map(getTF)

## updrs 0~4 --> 0~2
for var in up_list:
    df_cli[var+'_Z'] = df_cli[var] / 2


# Continuous variables are min-max scaled to [0, 2]; the scaler is refitted per column.
scaler = MinMaxScaler(feature_range=(0,2))
for var in cont_list:
    # fit_transform returns an (n, 1) array; flatten it so a single column is assigned.
    df_cli[var+'_Z'] = scaler.fit_transform(df_cli[[var]]).ravel()


## drop the original columns and keep the normalized columns
# cat_list + cont_list + up_list is exactly the set of raw columns; PET_ID is kept.
df_z = df_cli.drop(columns=cat_list + cont_list + up_list)
display(df_z)

## For generating corresponding excel files with clinical variables for each data configuration

In [None]:
target_path = '/workspace/folder_to_split_images_and_masks_per_fold'

fold_list = ['Fold_0', 'Fold_1', 'Fold_2', 'Fold_3', 'Fold_4']


def _export_clinical_rows(split_dir, xlsx_path):
    """Write the df_z rows of the patients imaged in split_dir to xlsx_path.

    The patients are the '*_img.nii.gz' files found in split_dir; the PET_ID is
    the first two underscore-separated tokens of the file name once its last 11
    characters have been dropped.

    Returns:
        (pet_ids, rows): the list of PET_IDs found and the matching df_z rows.
    """
    pet_ids = []
    for img_path in glob.glob(os.path.join(split_dir, '*_img.nii.gz')):
        tokens = os.path.basename(img_path)[:-11].split('_')
        pet_ids.append(tokens[0] + '_' + tokens[1])

    rows = df_z[df_z['PET_ID'].isin(pet_ids)]
    rows.to_excel(xlsx_path, index=False)
    return pet_ids, rows


for mother_fold in fold_list:

    # exist_ok=True keeps the cell re-runnable: the Excel files below are simply
    # overwritten, so an existing output folder is not an error.
    clinical_dir = os.path.join(target_path, mother_fold, 'clinical_var')
    os.makedirs(clinical_dir, exist_ok=True)

    ## for each fold
    for daughter_fold in fold_list:
        daughter_dir = os.path.join(clinical_dir, daughter_fold)
        os.makedirs(daughter_dir, exist_ok=True)

        ## train
        img_train_list, df_train = _export_clinical_rows(
            os.path.join(target_path, mother_fold, daughter_fold, 'train'),
            os.path.join(daughter_dir, 'train.xlsx'))

        ## valid
        img_valid_list, df_valid = _export_clinical_rows(
            os.path.join(target_path, mother_fold, daughter_fold, 'valid'),
            os.path.join(daughter_dir, 'valid.xlsx'))

    ## for the test set
    test_dir = os.path.join(clinical_dir, 'test')
    os.makedirs(test_dir, exist_ok=True)

    img_test_list, df_test = _export_clinical_rows(
        os.path.join(target_path, mother_fold, 'test'),
        os.path.join(test_dir, 'test.xlsx'))