In [2]:
import pandas as pd
from collections import Counter
import os
import re 

# CSV 
file_path_o3 = "C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS3/oasis_3/meta/OASIS3_data_files/UDSb4/csv/OASIS3_UDSb4_cdr.csv"
file_path_o4 = "C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS4/oasis_4/meta/OASIS4_data_files/CDR/csv/OASIS4_data_CDR.csv"

# MRI and PET
mri_file_path_o3 = 'C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS3/oasis_3/mri'
pup_file_path_o3 = 'C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS3/oasis_3/pup'
mri_file_path_o4 = 'C:/Users/Vito/Desktop/Magistrale/dataset tesi/OASIS4/oasis_4/data'

# Datasets loading
data_o3 = pd.read_csv(file_path_o3)
data_o4 = pd.read_csv(file_path_o4)

In [3]:
# Label assignment of CN, cMCI, ncMCI and AD
def assign_label_converter(dataset, dataset_type):

    high_values = [1.0, 2.0, 3.0]
    low_values = [0.0, 0.5]

    scan_data = dataset.copy()
    indices_to_remove = []    # Last and second-last value are removed in the end, since not informative about the progression of the disease
    scan_data['new_label'] = None

    # if using OASIS3, the column having the cdr value is called 'CDRTOT', if OASIS4 is 'cdr'
    if dataset_type=='o3':
        cdr_label = 'CDRTOT'
        id_label = 'OASISID'
    elif dataset_type=='o4':
        cdr_label = 'cdr'
        id_label = 'oasis_id'

    for i in range(0, len(dataset)-2):

        if scan_data[cdr_label][i] == 0.0:
            scan_data.loc[i, 'new_label'] = 'CN'

        elif scan_data[cdr_label][i] >= 1.0:
            scan_data.loc[i, 'new_label'] = 'AD'

        elif scan_data[cdr_label][i] == 0.5:

            #Check if the patient ID is the same, create two lists
            previous_scans = [
                scan_data[cdr_label][i-2] if (i-2) >= len(scan_data) and scan_data[id_label][i-2] == scan_data[id_label][i] else None,
                scan_data[cdr_label][i-1] if (i-1) >= len(scan_data) and scan_data[id_label][i-1] == scan_data[id_label][i] else None]

            next_scans = [
                scan_data[cdr_label][i+1] if (i+1) <= len(scan_data) and scan_data[id_label][i+1] == scan_data[id_label][i] else None,
                scan_data[cdr_label][i+2] if (i+2) <= len(scan_data) and scan_data[id_label][i+2] == scan_data[id_label][i] else None]
            
            previous_scans = [val for val in previous_scans if val is not None]
            next_scans = [val for val in next_scans if val is not None]

            # [2, 1, 0.5, 0.5, 0.5]
            if any(val in high_values for val in previous_scans) and any(val in low_values for val in next_scans):
                scan_data.loc[i, 'new_label'] = 'ncMCI'

            # [0.5, 0.5, 0.5, 1, 2]
            elif any(val in low_values for val in previous_scans) and any(val in high_values for val in next_scans):
                scan_data.loc[i, 'new_label'] = 'cMCI'
            
            # If at least one element of both sides is in [1.0, 2.0, 3.0], assign MCI
            elif any(val in high_values for val in previous_scans) and any(val in high_values for val in next_scans):
                scan_data.loc[i, 'new_label'] = 'cMCI'

            # If at least one element of both sides is in [0.0, 0.5], assign CN
            elif any(val in low_values for val in previous_scans) and any(val in low_values for val in next_scans):
                scan_data.loc[i, 'new_label'] = 'ncMCI'

            # scan in the first position, but It has more than one successive examples, consider only the successive ones
            elif len(previous_scans) < 1 and len(next_scans) >= 1:
                if any(val in low_values for val in next_scans):
                    scan_data.loc[i, 'new_label'] = 'ncMCI'
                if any(val in high_values for val in next_scans):
                    scan_data.loc[i, 'new_label'] = 'cMCI'

            # scan on the last position
            elif len(previous_scans) >= 1 and len(next_scans) < 1:
                if any(val in low_values for val in previous_scans):
                    scan_data.loc[i, 'new_label'] = 'ncMCI'
                if any(val in high_values for val in previous_scans):
                    scan_data.loc[i, 'new_label'] = 'cMCI'
            else:
                scan_data.loc[i, 'new_label'] = 'to_revise'

            
            # if the patient has only one scan, remove it
            if len(next_scans) < 1 and len(previous_scans) < 1:
                indices_to_remove.append(i)
            # if the scan is the last or the second-last, remove it
            elif len(next_scans) <= 1:
                indices_to_remove.append(i)

    scan_data.drop(indices_to_remove, inplace=True)
    scan_data.reset_index(drop=True, inplace=True)
    return scan_data

In [4]:
# Match the MRI scans with the closer metadata within the .csv file

def associate_scans(new_dataset, file_path, scan_type, days, dataset_type):
    folder_scans = [d for d in os.listdir(file_path) if os.path.isdir(os.path.join(file_path, d))]

    if scan_type == 'AV45':
        pattern = r"OAS3(\d{4})_AV45"
    if scan_type == 'PIB':
        pattern = r"OAS3(\d{4})_PIB"

    days_regex = r"d(\d{4})"
    
    if dataset_type=='o3':
        cdr_label = 'CDRTOT'
        id_label = 'OASISID'
        session_label = 'OASIS_session_label'
        patient_name = r"(OAS3\d{4})"
    elif dataset_type=='o4':
        cdr_label = 'cdr'
        id_label = 'oasis_id'
        session_label = 'cdr_id'
        patient_name = r"(OAS4\d{4})"


    if scan_type == 'AV45' or scan_type == 'PIB':
        considered_scans = [s for s in folder_scans if re.search(pattern, s)]
    else:
        considered_scans = folder_scans

    patient_data = []
    cdrtot_dict = {
        row[session_label]: {
            'CDRTOT': row[cdr_label],
            'label': row['new_label']
        }
        for index, row in new_dataset.iterrows()
    }

    for csv_patient_scan in new_dataset[session_label]:
        for folder_scan in considered_scans:

            # Match patient names in both the folder and the CSV scan
            csv_patient_match = re.search(patient_name, csv_patient_scan)
            folder_patient_match = re.search(patient_name, folder_scan)

            if csv_patient_match and folder_patient_match and csv_patient_match.group() == folder_patient_match.group():

                csv_days_match = re.search(days_regex, csv_patient_scan)
                folder_days_match = re.search(days_regex, folder_scan)

                if csv_days_match and folder_days_match:
                    csv_days = int(csv_days_match.group(1))
                    folder_days = int(folder_days_match.group(1))

                    if abs(folder_days - csv_days) <= days:

                        session_data = cdrtot_dict[csv_patient_scan] 
                        cdr = session_data['CDRTOT']
                        label = session_data['label']

                        if not any(scan['folder_scan'] == folder_scan for scan in patient_data):

                            patient_data.append({
                                'OASISID': csv_patient_match.group(1),
                                'OASIS_session_label': csv_patient_scan,
                                'folder_scan': folder_scan,
                                'label': label,
                                'CDRTOT': cdr,
                                'dataset_type': dataset_type
                            })

    return pd.DataFrame(patient_data)

In [5]:
new_d_o3 = assign_label_converter(data_o3, 'o3')

print(Counter(new_d_o3['CDRTOT']) )
print (Counter(new_d_o3['new_label']))

Counter({0.0: 6479, 0.5: 1008, 1.0: 528, 2.0: 155, 3.0: 19, nan: 1})
Counter({'CN': 6479, 'ncMCI': 706, 'AD': 701, 'cMCI': 301, None: 3})


In [6]:
#av45_df_o3 = associate_scans(new_d_o3, pup_file_path_o3, 'AV45', 365)
#pib_df_o3 = associate_scans(new_d_o3, pup_file_path_o3, 'PIB', 365)
mr_df_o3 = associate_scans(new_d_o3, mri_file_path_o3, 'MR', 365, 'o3')

In [28]:
mr_df_o3['label'].value_counts()

CN       1828
ncMCI     203
AD        124
cMCI       86
Name: label, dtype: int64

OASIS4

In [7]:
new_d_o4 = assign_label_converter(data_o4, 'o4')
mr_o4 = associate_scans(new_d_o4, mri_file_path_o4, 'MR', 365, 'o4')
mr_o4['label'].value_counts()

AD       205
CN        73
ncMCI     43
cMCI      23
Name: label, dtype: int64

In [8]:
mr_o4

Unnamed: 0,OASISID,OASIS_session_label,folder_scan,label,CDRTOT,dataset_type
0,OAS42003,OAS42003_CDR_d3000,OAS42003_MR_d3042,AD,1.0,o4
1,OAS42004,OAS42004_CDR_d3000,OAS42004_MR_d3015,AD,2.0,o4
2,OAS42007,OAS42007_CDR_d3476,OAS42007_MR_d3492,cMCI,0.5,o4
3,OAS42009,OAS42009_CDR_d3000,OAS42009_MR_d2904,AD,1.0,o4
4,OAS42010,OAS42010_CDR_d3000,OAS42010_MR_d3091,cMCI,0.5,o4
...,...,...,...,...,...,...
348,OAS42718,OAS42718_CDR_d3000,OAS42718_MR_d3025,AD,1.0,o4
349,OAS42719,OAS42719_CDR_d3007,OAS42719_MR_d3020,AD,2.0,o4
350,OAS42721,OAS42721_CDR_d3000,OAS42721_MR_d3035,cMCI,0.5,o4
351,OAS42726,OAS42726_CDR_d3000,OAS42726_MR_d3014,,0.5,o4


Save the datasets obtained

In [15]:
final_df = pd.concat([mr_df_o3, mr_o4])
final_df = final_df.reindex(columns=['OASISID', 'OASIS_session_label', 'folder_scan', 'CDRTOT', 'dataset_type', 'label'])

In [16]:
final_df

Unnamed: 0,OASISID,OASIS_session_label,folder_scan,CDRTOT,dataset_type,label
0,OAS30001,OAS30001_UDSb4_d0000,OAS30001_MR_d0129,0.0,o3,CN
1,OAS30001,OAS30001_UDSb4_d0722,OAS30001_MR_d0757,0.0,o3,CN
2,OAS30001,OAS30001_UDSb4_d2181,OAS30001_MR_d2430,0.0,o3,CN
3,OAS30001,OAS30001_UDSb4_d3025,OAS30001_MR_d3132,0.0,o3,CN
4,OAS30001,OAS30001_UDSb4_d3675,OAS30001_MR_d3746,0.0,o3,CN
...,...,...,...,...,...,...
348,OAS42718,OAS42718_CDR_d3000,OAS42718_MR_d3025,1.0,o4,AD
349,OAS42719,OAS42719_CDR_d3007,OAS42719_MR_d3020,2.0,o4,AD
350,OAS42721,OAS42721_CDR_d3000,OAS42721_MR_d3035,0.5,o4,cMCI
351,OAS42726,OAS42726_CDR_d3000,OAS42726_MR_d3014,0.5,o4,


In [74]:
path="C:/Users/Vito/Desktop/Magistrale/dataset tesi/df_with_labels.csv"

# delete nan labels
final_df = final_df[final_df['label'].notna()]

#save the df
final_df.to_csv(path, index=False)