Data balancing

In [2]:
import numpy as np
import random
import pandas as pd
import os
import random
import logging
import torch
from monai import transforms
from monai.data import Dataset, DataLoader
from monai.config import print_config
import nibabel as nib
from IPython.display import clear_output
from monai.data import MetaTensor
from monai.utils import set_determinism
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from monai.transforms import Compose, RandAxisFlip, RandShiftIntensity, OneOf, EnsureChannelFirst, Orientation, Spacing,  RandRotate, RandFlip, ToTensor, SpatialPad, ScaleIntensity, CropForeground,Resize, NormalizeIntensity, RandScaleIntensity


1. Random undersampling

In [3]:
# Undersample data to the second max label
# applied at the beginning of the algorithm
def undersample_data(dataset, features, verbose):
    """Randomly undersample the 'CN' class to the size of the second largest class.

    Args:
        dataset: DataFrame holding a 'label' column and every column in `features`.
        features: list of column names carried over into the result.
        verbose: when equal to True, print the label counts before and after.

    Returns:
        A new DataFrame with the `features` columns plus 'label', re-indexed from 0.
    """
    label_counts = dataset['label'].value_counts()
    # nlargest(2) keeps the two biggest class sizes; the last of them is the runner-up.
    runner_up_size = label_counts.nlargest(2).iloc[-1]

    sampler = RandomUnderSampler(
        sampling_strategy={'CN': runner_up_size},
        random_state=12,
    )
    kept_features, kept_labels = sampler.fit_resample(dataset[features], dataset['label'])

    balanced = pd.DataFrame(kept_features, columns=features)
    balanced['label'] = kept_labels

    if verbose == True:
        print("Original label distribution:")
        print(dataset['label'].value_counts())

        print("\nResampled label distribution:")
        print(balanced['label'].value_counts())

    return balanced.reset_index(drop=True)

# Undersampling applied at the end to balance the final classes
def undersample_final_dataset(dataset, features, verbose):
    """Randomly undersample with RandomUnderSampler(sampling_strategy='all').

    Args:
        dataset: DataFrame holding a 'label' column and every column in `features`.
        features: list of column names carried over into the result.
        verbose: when equal to True, print the label counts before and after.

    Returns:
        A new DataFrame with the `features` columns plus 'label', re-indexed from 0.
    """
    sampler = RandomUnderSampler(sampling_strategy='all', random_state=12)
    kept_features, kept_labels = sampler.fit_resample(dataset[features], dataset['label'])

    balanced = pd.DataFrame(kept_features, columns=features)
    balanced['label'] = kept_labels

    if verbose == True:
        print("Original label distribution:")
        print(dataset['label'].value_counts())
        print("\nResampled label distribution:")
        print(balanced['label'].value_counts())

    return balanced.reset_index(drop=True)

In [4]:
import os, re

# Create a data dictionary with the scan_type considered (T1w, T2w) and the associated label
# take the last scan if more than one scan exists for each scan_type 
# this function is used to get only one scan_type path. To obtain both the scan paths refer to get_both_scans()

# Default roots of the raw OASIS-3 / OASIS-4 scan folders (overridable per call).
_OASIS3_MRI_DIR = 'C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS3/oasis_3/mri'
_OASIS4_DATA_DIR = 'C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS4/oasis_4/data'

# Extracts the 4-digit day counter from a session folder name (e.g. "..._d0123").
_SESSION_DAYS_REGEX = re.compile(r"d(\d{4})")


def _direct_scan_candidates(oasis_id, days, scan_type):
    """Return the run-less file names to probe, in priority order (first hit wins)."""
    return [
        f'sub-{oasis_id}_ses-d{days}_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_{scan_type}.nii.gz',
        f'sub-{oasis_id}_ses-d{days}_echo-2_{scan_type}.nii.gz',
        f'sub-{oasis_id}_ses-d{days}_echo-1_{scan_type}.nii.gz',

        f'sub-{oasis_id}_sess-d{days}_echo-2_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_echo-1_{scan_type}.nii.gz',

        f'sub-{oasis_id}_sess-d{days}_acq-TSE_echo-2_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_acq-TSE_echo-1_{scan_type}.nii.gz',

        f'sub-{oasis_id}_ses-d{days}_acq-TSE_echo-2_{scan_type}.nii.gz',
        f'sub-{oasis_id}_ses-d{days}_acq-TSE_echo-1_{scan_type}.nii.gz',

        f'sub-{oasis_id}_ses-d{days}_acq-TSE_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_acq-TSE_{scan_type}.nii.gz',
    ]


def _run_scan_candidates(oasis_id, days, run_token, scan_type):
    """Return the file names to probe for a given run number, in priority order."""
    return [
        f'sub-{oasis_id}_ses-d{days}_run-{run_token}_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_run-{run_token}_{scan_type}.nii.gz',

        f'sub-{oasis_id}_ses-d{days}_echo-2_run-{run_token}_{scan_type}.nii.gz',
        f'sub-{oasis_id}_ses-d{days}_echo-1_run-{run_token}_{scan_type}.nii.gz',

        f'sub-{oasis_id}_ses-d{days}_acq-TSE_run-{run_token}_{scan_type}.nii.gz',
        f'sub-{oasis_id}_sess-d{days}_acq-TSE_run-{run_token}_{scan_type}.nii.gz',
    ]


def _find_scan_file(folder_path, oasis_id, days, scan_type):
    """Locate the `scan_type` volume of one session folder.

    Run-less names are tried first. If none exists, the folder is scanned for
    `_run-<n>_<scan_type>` files and the highest run number is used.

    Returns:
        (found_path, reported_path): `found_path` is the existing file or None;
        `reported_path` is the path to mention in "not found" diagnostics.
    """
    direct_names = _direct_scan_candidates(oasis_id, days, scan_type)
    for name in direct_names:
        candidate = os.path.join(folder_path, name)
        if os.path.exists(candidate):
            return candidate, candidate

    run_regex = re.compile(rf'_run-(\d+)_{re.escape(scan_type)}')
    runs = []
    for name in os.listdir(folder_path):
        if name.startswith('.'):
            continue
        run_match = run_regex.search(name)
        if run_match:
            # Keep the token as written on disk so its zero padding is reproduced exactly.
            runs.append((int(run_match.group(1)), run_match.group(1)))

    if not runs:
        return None, os.path.join(folder_path, direct_names[0])

    # choose the most recent run
    run_token = max(runs, key=lambda run: run[0])[1]
    run_names = _run_scan_candidates(oasis_id, days, run_token, scan_type)
    for name in run_names:
        candidate = os.path.join(folder_path, name)
        if os.path.exists(candidate):
            return candidate, candidate
    return None, os.path.join(folder_path, run_names[0])


def get_scan_dictionary(dataset, scan_type, verbose,
                        path_o3=_OASIS3_MRI_DIR, path_o4=_OASIS4_DATA_DIR):
    """Build one record per session that has a `scan_type` volume on disk.

    Rows are read by position, so `dataset` does not need a 0..n-1 index.

    Args:
        dataset: DataFrame with columns 'OASISID', 'folder_scan', 'label',
            'ternary_label' and 'CDRTOT'.
        scan_type: suffix of the file to look for (e.g. 'T1w' or 'T2w').
        verbose: when truthy, print the scans that were not found and a summary.
        path_o3: root folder used for IDs starting with 'OAS3'.
        path_o4: root folder used for IDs starting with 'OAS4'.

    Returns:
        List of dicts with keys 'image', 'label', 'folder_scan', 'ternary_label', 'CDR'.
    """
    roots = {'3': path_o3, '4': path_o4}
    folders_not_founded = []
    data_dict = []

    for i in range(len(dataset)):
        oasis_id = dataset['OASISID'].iloc[i]
        folder_scan = dataset['folder_scan'].iloc[i]

        # check if the patient belongs to OASIS3 or OASIS4
        type_match = re.match(r'OAS(\d)', str(oasis_id))
        root = roots.get(type_match.group(1)) if type_match else None
        if root is None:
            print(f"Error: cannot determine the OASIS dataset of '{oasis_id}'")
            continue

        full_path = os.path.join(root, folder_scan)
        if not os.path.exists(full_path):
            print(f"Error: folder not found: {full_path}")
            continue

        day_match = _SESSION_DAYS_REGEX.search(folder_scan)
        if not day_match:
            # Without the day counter no file name can be built for this session.
            folders_not_founded.append(full_path)
            continue

        found_path, reported_path = _find_scan_file(
            full_path, oasis_id, day_match.group(1), scan_type)
        if found_path is None:
            folders_not_founded.append(reported_path)
            continue

        data_dict.append({'image': found_path,
                          'label': dataset['label'].iloc[i],
                          'folder_scan': folder_scan,
                          # per-row value (the whole column used to be stored here)
                          'ternary_label': dataset['ternary_label'].iloc[i],
                          'CDR': dataset['CDRTOT'].iloc[i]})

    if verbose:
        print(f"Scans ({scan_type}) not included in the data: {folders_not_founded}")
        print('\nLenght of the input set of data:', len(dataset), '\nScans founded:', len(data_dict))
    return data_dict


def get_both_scans(dataset, verbose, path):
    """Build one record per session that has BOTH a T1w and a T2w volume under `path`.

    Sessions missing either modality are left out of the result (they used to
    raise IndexError or mislabel the T2w path as 'T1w').

    Args:
        dataset: DataFrame with columns 'OASISID', 'folder_scan', 'ternary_label',
            'label' and 'CDRTOT'. Rows are read by position.
        verbose: when truthy, print the scan files that were not found.
        path: root folder containing the session folders.

    Returns:
        List of dicts with keys 'OASISID', 'folder', 'ternary_label', 'label',
        'folder_scan', 'CDRTOT', 'T1w', 'T2w'.
    """
    folders_not_founded = []
    data_dict = []
    types = ['T1w', 'T2w']

    for i in range(len(dataset)):
        oasis_id = dataset['OASISID'].iloc[i]
        folder_scan = dataset['folder_scan'].iloc[i]

        full_path = os.path.join(path, folder_scan)
        if not os.path.exists(full_path):
            print(f"Error: folder not found: {full_path}")
            continue

        day_match = _SESSION_DAYS_REGEX.search(folder_scan)
        if not day_match:
            folders_not_founded.append(full_path)
            continue

        scan_paths = {}
        for scan_type in types:
            found_path, reported_path = _find_scan_file(
                full_path, oasis_id, day_match.group(1), scan_type)
            if found_path is None:
                folders_not_founded.append(reported_path)
            else:
                scan_paths[scan_type] = found_path

        if len(scan_paths) < len(types):
            continue

        data_dict.append({'OASISID': oasis_id,
                          'folder': folder_scan,
                          'ternary_label': dataset['ternary_label'].iloc[i],
                          'label': dataset['label'].iloc[i],
                          'folder_scan': folder_scan,
                          'CDRTOT': dataset['CDRTOT'].iloc[i],
                          'T1w': scan_paths['T1w'],
                          'T2w': scan_paths['T2w']})

    if verbose:
        print(f"Scans not found (session skipped): {folders_not_founded}")
    return data_dict

In [5]:
data = pd.read_csv('C:/Users/Vito/Desktop/Magistrale/dataset tesi/df_with_labels.csv')

data_t1w = get_scan_dictionary(data, 'T1w', verbose=False)
data_t2w = get_scan_dictionary(data, 'T2w', verbose=False)

# Each record already carries its session folder, so the key is read from it directly
# instead of being cut out of the image path with a Windows-only "\\" split.
t1w_keys = {entry['folder_scan'] for entry in data_t1w}
t2w_keys = {entry['folder_scan'] for entry in data_t2w}

# Sessions that have one modality but not the other.
scan_to_discart1 = t1w_keys - t2w_keys
scan_to_discart2 = t2w_keys - t1w_keys

print("Number of scans to discart (t1w_keys - t2w_keys):", len(scan_to_discart1))
print("Number of scans to discart (t2w_keys - t1w_keys):", len(scan_to_discart2))

Number of scans to discart (t1w_keys - t2w_keys): 8
Number of scans to discart (t2w_keys - t1w_keys): 0


In [6]:
# Drop the sessions listed in scan_to_discart1 (T1w found, T2w not found).
# Note: this rebinds `data`, so re-running the cell filters the already filtered frame.
data = data[~data['folder_scan'].isin(scan_to_discart1)]

In [7]:
# Label distribution of the sessions that remain after the filter above.
data['label'].value_counts()

CN       1901
AD        323
ncMCI     245
cMCI      108
Name: label, dtype: int64

Undersampling of the original dataset

In [8]:
# Columns carried through the undersampling step ('label' is re-attached by undersample_data).
features = ['OASISID','OASIS_session_label', 'folder_scan','CDRTOT', 'dataset_type']
# Reduce 'CN' to the size of the second largest class and print both distributions.
undersampled_df = undersample_data(data, features, verbose=True)

Original label distribution:
CN       1901
AD        323
ncMCI     245
cMCI      108
Name: label, dtype: int64

Resampled label distribution:
AD       323
CN       323
ncMCI    245
cMCI     108
Name: label, dtype: int64


In [9]:
def check_duplicates(first_set, second_set):
    """Tell whether any 'OASISID' value appears in both DataFrames.

    Args:
        first_set: DataFrame with an 'OASISID' column.
        second_set: DataFrame with an 'OASISID' column.

    Returns:
        True if at least one ID is shared between the two frames, else False.
    """
    first_ids = first_set['OASISID']
    second_ids = second_set['OASISID']

    shared_seen_from_first = first_ids.isin(second_ids).any()
    shared_seen_from_second = second_ids.isin(first_ids).any()

    return True if (shared_seen_from_first or shared_seen_from_second) else False

Take the test set (15% of the original data), stratifying by subjects

In [10]:
# Share of the data kept for training + validation; the rest sizes the test set.
train_eval_ratio = 0.85

# Number of SUBJECTS drawn per label for the test set:
# (1 - train_eval_ratio) of the largest class size, truncated by int().
max_label = undersampled_df['label'].value_counts().max()
test_labels_ratio = int(max_label * (1 - train_eval_ratio))  

random.seed(12)
# `subjects` is not read again in this cell, but random.shuffle() advances the seeded
# generator, so the random.sample() draws below depend on this call being here.
subjects = undersampled_df['OASISID'].unique().tolist()
random.shuffle(subjects)

test_pat=[]

# Draw `test_labels_ratio` distinct subjects from every label.
# random.sample raises ValueError when a label has fewer unique subjects than requested.
for label in undersampled_df['label'].unique():     
    class_samples = undersampled_df[undersampled_df['label'] == label]
    unique_class_subjects = class_samples['OASISID'].unique()
    selected_test_subjects = random.sample(list(unique_class_subjects), test_labels_ratio)
    test_pat.extend(selected_test_subjects)

# Subjects that were not drawn for the test set.
train_subjects = undersampled_df['OASISID'].unique()
train_subjects = train_subjects[~pd.Series(train_subjects).isin(test_pat)]

# Every row of a drawn subject goes to the test set, whatever its label, so the per-label
# row counts printed below need not equal test_labels_ratio.
X_test_set = undersampled_df[undersampled_df['OASISID'].isin(test_pat)]
X_train = undersampled_df[undersampled_df['OASISID'].isin(train_subjects) & 
                          (~undersampled_df['OASISID'].isin(X_test_set['OASISID']))]

X_train = X_train.reset_index(drop=True)
X_test_set = X_test_set.reset_index(drop=True)

print('Test set:\n', X_test_set['label'].value_counts())
print('\n Training set:\n', X_train['label'].value_counts())

# Sanity check: no subject may appear in both splits.
print('\n Is the same patient in both sets?:', check_duplicates(X_train, X_test_set))

Test set:
 ncMCI    66
CN       61
AD       56
cMCI     55
Name: label, dtype: int64

 Training set:
 AD       267
CN       262
ncMCI    179
cMCI      53
Name: label, dtype: int64

 Is the same patient in both sets?: False


In [12]:
# Tag every row with the split it belongs to.
X_train['set'] = 'train+val'
X_test_set['set'] = 'test'

# df updated in every phase, this will be the final CSV
final_df = pd.concat([X_train, X_test_set])
final_df.reset_index(drop=True, inplace=True)

# NOTE(review): get_scan_dictionary reads dataset['ternary_label'], but undersample_data only
# keeps the `features` columns plus 'label' — presumably the column is added by a cell not
# shown here (execution counts jump from 10 to 12); confirm before a Restart & Run All.
t1w_paths = get_scan_dictionary(final_df, 'T1w', verbose=False)
t2w_paths = get_scan_dictionary(final_df, 'T2w', verbose=False)

# Add the T1w and T2w scan paths to 'final_df'
# (lookup tables: session folder -> image path)
t1w_d = {d['folder_scan']: d['image'] for d in t1w_paths if 'folder_scan' in d}
t2w_d = {d['folder_scan']: d['image'] for d in t2w_paths if 'folder_scan' in d}

# Sessions without an entry in the lookup table get NaN from .map().
final_df['T1w'] = final_df['folder_scan'].map(t1w_d)
final_df['T2w'] = final_df['folder_scan'].map(t2w_d)
# 'aug' = 0 marks these rows as original scans (oversample_scans writes 1 for its records).
final_df['aug'] = 0

Oversampling

In [None]:
# function used to decide the name of the folder for the oversampled images
def get_aug_folder_name(string, output_dir):
    """Pick the first unused "<session>_<n>" folder path under `output_dir`, n starting at 2.

    Args:
        string: text containing a session name such as "OAS30001_MR_d0123".
        output_dir: directory in which the augmented folders are created.

    Returns:
        (path_name, counter_folder): the free folder path and the counter it ends with.
        When `string` holds no session name the folder prefix is the text "None".
    """
    session_match = re.search(r'(OAS([34])(\d{4})_MR_d(\d{4}))', string)
    session_name = None if not session_match else session_match.group(0)

    suffix = 2
    while True:
        candidate_path = os.path.join(output_dir, f"{session_name}_{suffix}")
        if not os.path.exists(candidate_path):
            return candidate_path, suffix
        suffix += 1

def get_aug_file_name(string, scan_type,i):
    """Build the file name of an augmented scan: "OAS<type><id>_d<days>_<i>_<scan_type>.nii.gz".

    Args:
        string: text containing a session name such as "OAS30001_MR_d0123".
        scan_type: modality suffix, e.g. 'T1w' or 'T2w'.
        i: augmentation counter inserted before the modality suffix.

    Returns:
        The file name; the dataset type, id and days parts are empty when
        `string` holds no session name.
    """
    session_match = re.search(r'OAS([34])(\d{4})_MR_d(\d{4})', string)
    cohort, subject, days = session_match.groups() if session_match else ('', '', '')
    return f"OAS{cohort}{subject}_d{days}_{i}_{scan_type}.nii.gz"

# Oversampling function with transformations

def _augment_and_save_pair(sample, transformation_function, output_dir):
    """Augment the T1w/T2w pair of `sample`, write both volumes and return its record.

    NOTE(review): the two modalities go through separate calls of
    `transformation_function`, so any random component is drawn independently for
    each of them — confirm this is intended for paired scans.
    """
    t1w_volume = nib.load(sample['T1w']).get_fdata()
    t1w_augmented = transformation_function(MetaTensor(t1w_volume)).numpy()

    t2w_volume = nib.load(sample['T2w']).get_fdata()
    t2w_augmented = transformation_function(MetaTensor(t2w_volume)).numpy()

    # Both files of a pair share the folder and counter derived from the T1w path.
    folder_path, i = get_aug_folder_name(sample['T1w'], output_dir)
    t1w_file_name = get_aug_file_name(sample['T1w'], 'T1w', i)
    t2w_file_name = get_aug_file_name(sample['T1w'], 'T2w', i)

    os.makedirs(folder_path, exist_ok=True)
    t1w_os_path = os.path.join(folder_path, t1w_file_name)
    t2w_os_path = os.path.join(folder_path, t2w_file_name)

    # The volumes are written with an identity affine.
    nib.save(nib.Nifti1Image(t1w_augmented, np.eye(4)), t1w_os_path)
    nib.save(nib.Nifti1Image(t2w_augmented, np.eye(4)), t2w_os_path)

    return {
        'OASISID': sample['OASISID'],
        'OASIS_session_label': f"{sample['folder_scan']}_{i}",
        'folder_scan': f"{sample['folder_scan']}_{i}",
        'CDRTOT': sample['CDRTOT'],
        'dataset_type': 'aug',
        'ternary_label': sample['ternary_label'],
        'label': sample['label'],
        'set': 'train+val',
        'T1w': t1w_os_path,
        'T2w': t2w_os_path,
        'aug': 1
    }


def oversample_scans(
        df,
        transformation_function,
        output_dir,
        class_ratios):
    """Create augmented T1w/T2w pairs until every class reaches its desired size.

    For each class, a first pass augments one scan per patient that has not been
    used yet (patient usage is tracked across classes). If the class is still
    short, scans are drawn at random with probability inversely proportional to
    how often their patient has already been used.

    Args:
        df: DataFrame with columns 'OASISID', 'label', 'folder_scan', 'CDRTOT',
            'ternary_label', 'T1w' and 'T2w'.
        transformation_function: callable applied to a MetaTensor of each volume.
        output_dir: directory in which the augmented folders are written.
        class_ratios: mapping label -> desired number of samples for that label.

    Returns:
        List of record dicts, one per augmented pair written to disk.
    """
    oversampled_data = []
    os.makedirs(output_dir, exist_ok=True)

    # dictionary to track how many times each patient has been used
    patient_selection_count = {pid: 0 for pid in df['OASISID'].unique()}

    for class_label, desired_count in class_ratios.items():
        # original scans of the current class
        class_samples = df[df['label'] == class_label].to_dict(orient='records')
        current_count = len(class_samples)

        if not class_samples:
            # Nothing to augment from; the random draw below would fail on an empty list.
            if desired_count > 0:
                print(f"No samples with label {class_label}: class skipped")
            continue

        # First pass: one augmentation per patient that has not been selected yet.
        for sample in class_samples:
            if current_count >= desired_count:
                break
            print(f"Current label: {class_label} {current_count}/{desired_count}")

            if patient_selection_count[sample['OASISID']] == 0:
                set_determinism(seed=np.random.randint(0, 10000))
                oversampled_data.append(
                    _augment_and_save_pair(sample, transformation_function, output_dir))
                patient_selection_count[sample['OASISID']] += 1
                current_count += 1
                clear_output(wait=False)

        # Second pass: perform additional sampling to reach desired_count.
        while current_count < desired_count:
            print(f"Current label: {class_label} {current_count}/{desired_count}")

            # Patients used less often so far are more likely to be drawn.
            probabilities = [1 / (patient_selection_count[sample['OASISID']] + 1)
                             for sample in class_samples]
            total_prob = sum(probabilities)
            probabilities = [prob / total_prob for prob in probabilities]

            sample_index = np.random.choice(range(len(class_samples)), p=probabilities)
            sample = class_samples[sample_index]

            oversampled_data.append(
                _augment_and_save_pair(sample, transformation_function, output_dir))
            patient_selection_count[sample['OASISID']] += 1
            current_count += 1
            clear_output(wait=False)

    return oversampled_data

In [None]:
def get_file_name(string, scan_type):
    """Build the file name of a processed scan: "OAS<type><id>_d<days>_<scan_type>.nii.gz".

    Args:
        string: text containing a session name such as "OAS30001_MR_d0123".
        scan_type: modality suffix, e.g. 'T1w' or 'T2w'.

    Returns:
        The file name; the dataset type, id and days parts are empty when
        `string` holds no session name.
    """
    session_match = re.search(r'OAS([34])(\d{4})_MR_d(\d{4})', string)
    cohort, subject, days = session_match.groups() if session_match else ('', '', '')
    return f"OAS{cohort}{subject}_d{days}_{scan_type}.nii.gz"

# Transformations for the 'not augmented' files
def train_eval_scans_transformation(
        df,
        transformation_function,
        output_dir,
        set):
    """Transform the T1w/T2w pair of every row of `df` and save it under `output_dir`.

    Args:
        df: DataFrame with columns 'OASISID', 'folder_scan', 'CDRTOT',
            'ternary_label', 'label', 'T1w' and 'T2w'.
        transformation_function: callable applied to a MetaTensor of each volume.
        output_dir: directory receiving one sub-folder per 'folder_scan'.
        set: value written to the 'set' field of every record
            (the name shadows the builtin inside this function).

    Returns:
        List of record dicts, one per row, pointing at the saved files.
        NOTE(review): 'dataset_type' is written as 'aug' although 'aug' is 0 — confirm.
    """
    os.makedirs(output_dir, exist_ok=True)

    records = []
    n_rows = len(df)
    n_done = 0
    n_already_there = 0

    for _, row in df.iterrows():
        set_determinism(seed=np.random.randint(0, 10000))

        t1w_array = transformation_function(MetaTensor(nib.load(row['T1w']).get_fdata())).numpy()
        t2w_array = transformation_function(MetaTensor(nib.load(row['T2w']).get_fdata())).numpy()

        target_dir = os.path.join(output_dir, row['folder_scan'])
        try:
            os.makedirs(target_dir)
        except FileExistsError:
            # The files are still (re)written into the existing folder below.
            print(f"Directory '{target_dir}' already exists.")
            n_already_there += 1
        else:
            print(f"Directory '{target_dir}' created: {n_done}/{n_rows}")

        # Both file names are derived from the T1w path of the row.
        t1w_target = os.path.join(target_dir, get_file_name(row['T1w'], 'T1w'))
        t2w_target = os.path.join(target_dir, get_file_name(row['T1w'], 'T2w'))

        # The volumes are written with an identity affine.
        nib.save(nib.Nifti1Image(t1w_array, np.eye(4)), t1w_target)
        nib.save(nib.Nifti1Image(t2w_array, np.eye(4)), t2w_target)

        records.append({
            'OASISID': row['OASISID'],
            'OASIS_session_label': row['folder_scan'],
            'folder_scan': row['folder_scan'],
            'CDRTOT': row['CDRTOT'],
            'dataset_type': 'aug',
            'ternary_label': row['ternary_label'],
            'label': row['label'],
            'set': set,
            'T1w': t1w_target,
            'T2w': t2w_target,
            'aug': 0,
        })
        n_done += 1
        clear_output(wait=False)

    print(f"Operation completed! {n_done} folders created, {n_already_there} discarted")
    return records

In [None]:
def _spatial_preprocessing(size):
    """Return fresh instances of the deterministic steps shared by all three pipelines."""
    return [
        EnsureChannelFirst(channel_dim='no_channel'),
        Orientation(axcodes='RAS'),
        Spacing((1.0, 1.0, 1.0), mode='bilinear', align_corners=True, scale_extent=True),
        ScaleIntensity(channel_wise=True),
        CropForeground(select_fn=(lambda x: x > 0.3), allow_smaller=True),
        Resize(spatial_size=size, size_mode='longest', mode='bilinear', align_corners=True),
        SpatialPad(spatial_size=size, mode='minimum'),
    ]


def _intensity_steps(randomize):
    """Return the normalisation step, followed by random scale/shift when `randomize`."""
    steps = [NormalizeIntensity(nonzero=True, channel_wise=True)]
    if randomize:
        steps.append(RandScaleIntensity(factors=0.1, prob=1.0))
        steps.append(RandShiftIntensity(offsets=0.1, prob=1.0))
    return steps


def get_transforms(size, deterministic_eval=False):
    """Build the oversampling, training and evaluation pipelines.

    Args:
        size: spatial size passed to Resize and SpatialPad.
        deterministic_eval: when True, the evaluation pipeline omits the random
            intensity scale/shift. The default (False) keeps them, as before.
            NOTE(review): randomly perturbing evaluation data looks unintended —
            consider switching callers to True.

    Returns:
        (oversampling_transform, train_transform, eval_transform)
    """
    oversampling_transform = Compose(
        _spatial_preprocessing(size)
        + [
            # exactly one of the two is applied on every call (each has prob=1.0)
            OneOf([
                RandRotate(
                    prob=1.0,
                    range_x=0.4, # [-23, 23] degrees
                    range_y=0.4,
                    range_z=0.4,
                    padding_mode='zeros',
                    align_corners=True),
                # mirroring
                RandAxisFlip(prob=1.0)], log_stats=True),
        ]
        + _intensity_steps(randomize=True))

    train_transform = Compose(
        _spatial_preprocessing(size)
        + [
            # rotation and mirroring are each applied independently with p=0.2
            RandRotate(
                prob=0.2,
                range_x=0.4, # [-23, 23] degrees
                range_y=0.4,
                range_z=0.4,
                padding_mode='zeros',
                align_corners=True),
            # mirroring
            RandAxisFlip(prob=0.2),
        ]
        + _intensity_steps(randomize=True))

    eval_transform = Compose(
        _spatial_preprocessing(size)
        + _intensity_steps(randomize=not deterministic_eval))

    return oversampling_transform, train_transform, eval_transform

In [23]:
# Build the three pipelines; 128 is the spatial size handed to Resize / SpatialPad.
oversample_func, train_transform, eval_transform = get_transforms(128)

In [None]:
# Oversampling

output_dir_path = 'C:/Users/Vito/Desktop/Final Dataset/train_eval'
train_df = final_df[final_df['set'] == 'train+val']
class_counts = train_df['label'].value_counts().to_dict()
max_desired_labels = max(class_counts.values())
# Every class is brought up to the size of the largest training class.
class_ratio = {class_label: max_desired_labels for class_label in class_counts}

# The oversampling writes augmented volumes to disk and is slow, so it is off by default.
# Set to True to regenerate the augmented scans and the CSV describing them.
RUN_OVERSAMPLING = False

if RUN_OVERSAMPLING:
    oversampled_dict = oversample_scans(
            df=train_df,
            transformation_function=oversample_func,
            output_dir=output_dir_path,
            class_ratios=class_ratio)

    oversampled_df = pd.DataFrame(oversampled_dict)
    path_os = 'C:/Users/Vito/Desktop/Final Dataset/oversampled_df.csv'
    oversampled_df.to_csv(path_os, index=False)

"\noversampled_dict = oversample_scans(\n        df=train_df,\n        transformation_function=oversample_func,\n        output_dir=output_dir_path,\n        class_ratios=class_ratio)\n\noversampled_df = pd.DataFrame(oversampled_dict)\npath_os = 'C:/Users/Vito/Desktop/Final Dataset/oversampled_df.csv'\noversampled_df.to_csv(path_os, index=False)\n"

In [24]:
# Load the records of the augmented scans written by a previous oversampling run.
oversampled_df = pd.read_csv('C:/Users/Vito/Desktop/Final Dataset/oversampled_df.csv')
# Original train+val rows followed by the augmented ones (row indices are not reset here).
train_eval_set_df = pd.concat([train_df, oversampled_df])
train_eval_set_df['label'].value_counts()

AD       267
CN       267
cMCI     267
ncMCI    267
Name: label, dtype: int64

In [None]:
# Number of distinct patients (OASISID) per label in the train+val set,
# original and augmented rows together.
train_eval_set_df.groupby('label')['OASISID'].nunique()

label
AD       262
CN       240
cMCI      51
ncMCI    148
Name: OASISID, dtype: int64

In [None]:
# Split the training and validation sets
def split_train_val(df, verbose):

    val_labels_ratio = 40 # how many samples for each label the test set must contain (267*0.15)
    random.seed(32)
    subjects = df['OASISID'].unique().tolist()
    random.shuffle(subjects)
    eval_patients=[]

    #cMCI is a particular case in which the patients are few since it has been oversampled. So less patients have been taken.
    for label in df['label'].unique():     
        if label != 'cMCI':
            class_samples = df[df['label'] == label]
            unique_class_subjects = class_samples['OASISID'].unique()
            selected_val_subjects = random.sample(list(unique_class_subjects), val_labels_ratio)
            eval_patients.extend(selected_val_subjects)
        elif label == 'cMCI':    
            class_samples = df[df['label'] == label]
            unique_class_subjects = class_samples['OASISID'].unique()
            selected_val_subjects = random.sample(list(unique_class_subjects), 10)
            eval_patients.extend(selected_val_subjects)

    train_subj = df['OASISID'].unique()
    train_subj = train_subj[~pd.Series(train_subj).isin(eval_patients)]

    X_eval = df[df['OASISID'].isin(eval_patients)]
    X_training = df[df['OASISID'].isin(train_subj) & 
                            (~df['OASISID'].isin(X_eval['OASISID']))]

    X_eval = X_eval.reset_index(drop=True)
    X_training = X_training.reset_index(drop=True)
    if verbose==True:
        print('Val set:\n', X_eval['label'].value_counts())
        print('\n Training set:\n', X_training['label'].value_counts())
        print('\n Is the same patient in both sets?:', check_duplicates(X_training, X_eval))

    return X_training, X_eval

# Patient-level split of the oversampled train+val frame; prints both label distributions.
train_set_bal, val_set_bal = split_train_val(
    df=train_eval_set_df,
    verbose=True,
)

Val set:
 ncMCI    77
cMCI     63
CN       49
AD       46
Name: label, dtype: int64

 Training set:
 AD       221
CN       218
cMCI     204
ncMCI    190
Name: label, dtype: int64

 Is the same patient in both sets?: False


In [31]:
# Mark every row of the training frame, then equalise the classes by undersampling.
# Despite its "_os" suffix, train_set_bal_os holds the undersampled result.
train_set_bal['set'] = 'train'
features_tr = [
    'OASISID',
    'OASIS_session_label',
    'folder_scan',
    'CDRTOT',
    'dataset_type',
    'ternary_label',
    'set',
    'T1w',
    'T2w',
    'aug',
]
train_set_bal_os = undersample_final_dataset(
    dataset=train_set_bal,
    features=features_tr,
    verbose=True,
)

Original label distribution:
AD       221
CN       218
cMCI     204
ncMCI    190
Name: label, dtype: int64

Resampled label distribution:
AD       190
CN       190
cMCI     190
ncMCI    190
Name: label, dtype: int64


In [34]:
# Mark the validation rows and pick the held-out test rows from final_df.
val_set_bal['set'] = 'val'
test_set_final = final_df.loc[final_df['set'].eq('test')]

# Stack the undersampled training rows, the validation rows and the test rows,
# then persist the combined table.
final_set_df = pd.concat(
    [train_set_bal_os, val_set_bal, test_set_final],
)
final_set_df.to_csv(
    'C:/Users/Vito/Desktop/Final Dataset/train_eval_test_set.csv',
    index=False,
)

In [35]:
# Keep only the rows whose dataset_type is 'o3' or 'o4' and renumber the index.
# NOTE(review): this filters train_set_bal, not the undersampled train_set_bal_os
# written to the final CSV — confirm that is intended.
train_set_filtered = (
    train_set_bal
    .loc[train_set_bal['dataset_type'].isin(['o3', 'o4'])]
    .reset_index(drop=True)
)

Functions used to move all the MRI scan files into a folder

In [127]:
# Disabled cell: the code is wrapped in a bare string literal, so none of it executes.
# When enabled it assigns train_eval_dir and test_dir, calls
# train_eval_scans_transformation with train_set_filtered, train_transform,
# train_eval_dir and the tag 'train', and stores the returned value in train_set_dict.
# NOTE(review): the disabled validation cell further down reads train_eval_dir, so this
# cell would have to be enabled and run before that one.
"""
train_eval_dir = 'C:/Users/Vito/Desktop/Final Dataset/train_eval'
test_dir = 'C:/Users/Vito/Desktop/Final Dataset/test_set'

train_set_dict = train_eval_scans_transformation(
        train_set_filtered,
        train_transform,
        train_eval_dir,
        'train')
"""

Operation completed! 606 folders created, 0 discarted


In [None]:
# Disabled cell (bare string literal, nothing executes). When enabled it converts
# train_set_dict and val_set_dict to DataFrames, concatenates them row-wise and
# writes the result to dict_train_eval.csv without the index.
# NOTE(review): val_set_dict is only assigned in a cell placed after this one, so with
# the cells enabled a top-to-bottom run would presumably fail here — confirm cell order.
"""
dict1=pd.DataFrame(train_set_dict)
dict2=pd.DataFrame(val_set_dict)
df_concat = pd.concat([dict1, dict2], axis=0)
df_concat.to_csv('C:/Users/Vito/Desktop/Final Dataset/dict_train_eval.csv', index=False)
"""

In [134]:
# Disabled cell (bare string literal, nothing executes). When enabled it keeps the
# val_set_bal rows whose dataset_type is 'o3' or 'o4', resets the index, and calls
# train_eval_scans_transformation with eval_transform, train_eval_dir and the tag 'val',
# storing the returned value in val_set_dict.
# NOTE(review): train_eval_dir is not assigned here; it comes from the disabled
# training-set cell above.
"""
val_set_filtered  = val_set_bal[val_set_bal['dataset_type'].isin(['o3', 'o4'])]
val_set_filtered = val_set_filtered.reset_index(drop=True)

val_set_dict = train_eval_scans_transformation(
        val_set_filtered,
        eval_transform,
        train_eval_dir,
        'val')
"""

Operation completed! 155 folders created, 0 discarted


In [None]:
# Disabled cell (bare string literal, nothing executes). When enabled it assigns
# test_dir, resets the index of test_set_final, and calls
# train_eval_scans_transformation with eval_transform, test_dir and the tag 'test',
# storing the returned value in test_set_dict.
"""
test_dir = 'C:/Users/Vito/Desktop/Final Dataset/test_set'
test_set_final = test_set_final.reset_index(drop=True)

test_set_dict = train_eval_scans_transformation(
        test_set_final,
        eval_transform,
        test_dir,
        'test')
"""

Operation completed! 238 folders created, 0 discarted


In [None]:
from pathlib import Path
import pandas as pd
import re
import os
# Metadata table whose T1w/T2w columns are replaced by rebuilt paths further down.
new_data = pd.read_csv(
    'c:/Users/Vito/Desktop/Final Dataset/metadata.csv'
)
# 'd' + exactly four digits + '_' + a one- or two-digit suffix, e.g. 'd0129_1'
pattern1 = r'd(\d{4})_(\d{1,2})'
# 'd' + one or more digits, used when no numeric suffix is present
pattern2 = r'd(\d+)'

def modify_path(row, path_type):
    """Build the Kaggle path of one scan file from a metadata row.

    The ``d<digits>`` token (and the optional ``_<n>`` suffix that follows it)
    is extracted from ``row['folder_scan']`` and combined with
    ``row['OASISID']`` and ``path_type`` to form the ``.nii`` file name.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'folder_scan'``, ``'set'`` and ``'OASISID'``
        (a DataFrame row or a plain dict both work).
    path_type : str
        Modality tag inserted before the ``.nii`` extension (``'T1w'`` / ``'T2w'``
        in this notebook).

    Returns
    -------
    str or None
        The full path for rows whose ``'set'`` is ``'train'``, ``'val'`` or
        ``'test'``; ``None`` for any other ``'set'`` value.

    Raises
    ------
    ValueError
        If ``row['folder_scan']`` contains no ``d<digits>`` token.
    """
    base_train_eval = '/kaggle/input/oasis-final/Final Dataset/Final Dataset/train_val_set'
    base_test = '/kaggle/input/oasis-final/Final Dataset/Final Dataset/test_set'

    folder = row['folder_scan']
    duplicates = None

    suffixed_matches = re.findall(pattern1, folder)
    if suffixed_matches:
        # When several tokens match, the last one wins.
        d, duplicates = suffixed_matches[-1]
    else:
        plain_match = re.search(pattern2, folder)
        if plain_match is None:
            # Without this guard, .group(1) on None raised an opaque AttributeError.
            raise ValueError(
                f"folder_scan {folder!r} does not contain a 'd<digits>' token"
            )
        d = plain_match.group(1)

    if row['set'] in ['train', 'val']:
        if duplicates is not None:
            file_name = f"{row['OASISID']}_d{d}_{duplicates}_{path_type}.nii"
        else:
            file_name = f"{row['OASISID']}_d{d}_{path_type}.nii"
        return f"{base_train_eval}/{folder}/{file_name}"

    if row['set'] == 'test':
        # Test files never carry the numeric suffix in their name.
        file_name = f"{row['OASISID']}_d{d}_{path_type}.nii"
        return f"{base_test}/{folder}/{file_name}"

    # Unknown split: no path can be built.
    return None

# Rebuild one path column per modality; apply() forwards the modality to
# modify_path as its second positional argument.
for modality in ('T1w', 'T2w'):
    new_data[f'{modality}_path'] = new_data.apply(modify_path, axis=1, args=(modality,))

# The original path columns are superseded by the *_path columns; drop them and save.
new_data.drop(columns=['T1w', 'T2w'], inplace=True)
new_data.to_csv(
    'c:/Users/Vito/Desktop/Final Dataset/metadata_updated.csv',
    index=False,
)