### In this notebook we want to check the compatibility of the numerical and image data from ADNI

In [1]:
# Load the required packages 
import pandas as pd 
import numpy as np 
import sys
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
sys.path.append('../')


In [2]:
# Load functions to get the dataset and process it 
from preprocessing.getdata import get_csvdata, get_tadpole, drop_tadpole, col_tadpole, get_csvdata_ADNI

In [3]:
def get_csvdata_ADNI(drop_MCI = True):
    '''
    Loads the .csv dataset and returns a preprocessed dataframe.
        
        Parametes: drop_young (if true, removes entries with age < 60)
        
        Processing steps:
            Sort by Subject ID
            Rename column "Subject" to "ID"
            adds a column "label" 
        
        Returns: the processed Dataframe
    '''
    df = pd.read_csv("../data/FreeSurfer_8_23_2022.csv").sort_values(["Subject","Description"])
    df.rename(columns={"Subject":"ID"}, inplace=True)
    df= df[(df["Description"] != "FreeSurfer Cross-Sectional Processing aparc+aseg") & (df["Description"] != "FreeSurfer Longitudinal Processing aparc+aseg")]
    image_IDs = []
    for i in df["ID"].unique():
        image_IDs.append(df[df["ID"]==i]["Image Data ID"].iloc[0])
    df= df.loc[df["Image Data ID"].isin(image_IDs)]
    #logger.info("ADNI-csv loaded")
    if drop_MCI:
        df= df[(df["Group"] == "AD") | (df["Group"] == "CN")]
        df["label"] = df["Group"] == "AD"
    df["label"] = ((df["Group"] == "AD") | (df["Group"] == "MCI")).astype(int)
    df["dataset"] = "ADNI"
    return df

In [4]:
# Load the Freesurfer dataframe 
df_img=get_csvdata_ADNI(drop_MCI=False)

In [5]:
df_img.head(2)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded,label,dataset
0,I167590,002_S_0295,CN,M,85,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,4/18/2006,MGH,8/23/2022,0,ADNI
2,I252117,002_S_0413,CN,F,76,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/02/2006,MGH,8/23/2022,0,ADNI


In [6]:
df_img.shape

(831, 14)

In [7]:
#Load the dataframe from the Tadpole challenge 
df_num = get_tadpole(drop_MCI=False)

  df_num = get_tadpole(drop_MCI=False)


In [8]:
df_num.head(2)

Unnamed: 0,RID,ID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label
0,2,011_S_0002,bl,11,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,...,,1.36665,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,2005-09-12,AD,81.3,Male,...,22.83,1.08355,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,1


In [9]:
df_num.shape

(819, 114)

In [10]:
# Merge the two dataframes on the ID column and find out how many entries overlap 

combined = pd.merge(df_img, df_num, how='inner', on='ID')

In [11]:
combined.head(10)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label_y
0,I167590,002_S_0295,CN,M,85,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,4/18/2006,...,34.73,,,,0.0,0.0,0,0,2019-02-14 23:58:29.0,0
1,I252117,002_S_0413,CN,F,76,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/02/2006,...,10.57,,,,0.0,0.0,0,0,2019-02-14 23:58:30.0,0
2,I252119,002_S_0559,CN,M,79,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/23/2006,...,15.82,,,,0.0,0.0,0,0,2019-02-14 23:58:31.0,0
3,I173018,002_S_0619,AD,M,78,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,6/01/2006,...,18.56,,,,0.0,0.0,0,0,2019-02-14 23:58:31.0,1
4,I172953,002_S_0685,CN,F,90,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,7/06/2006,...,22.82,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
5,I177712,002_S_0729,MCI,F,65,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,7/17/2006,...,31.69,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
6,I177728,002_S_0782,MCI,M,82,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,8/14/2006,...,,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
7,I177744,002_S_0816,AD,M,71,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,8/30/2006,...,27.24,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,1
8,I178763,002_S_0938,AD,F,82,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,10/05/2006,...,,,,,0.0,0.0,0,0,2019-02-14 23:58:33.0,1
9,I210021,002_S_0954,MCI,F,69,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,10/10/2006,...,34.65,,,,0.0,0.0,0,0,2019-02-14 23:58:33.0,0


In [12]:
combined.shape

(808, 127)

In [13]:
combined.columns

Index(['Image Data ID', 'ID', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date',
       ...
       'PTAU_bl', 'FDG_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl',
       'Month', 'M', 'update_stamp', 'label_y'],
      dtype='object', length=127)

In [14]:
combined = combined[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal']]

In [15]:
combined.head()

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0


In [16]:
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       8
Hippocampus    147
WholeBrain       5
Entorhinal     147
dtype: int64

In [17]:
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       8
Hippocampus    147
WholeBrain       5
Entorhinal     147
dtype: int64

In [18]:
combined['Imputed']=(combined['Hippocampus'].isna() | combined['Entorhinal'].isna())
combined

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0,False
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0,False
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0,False
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0,False
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0,False
...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,CN,54143.0,5743.0,1052020.0,2398.0,False
804,941_S_1203,M,83,CN,30573.0,7142.0,903945.0,3059.0,False
805,941_S_1295,M,77,MCI,66213.0,,1001320.0,,True
806,941_S_1311,M,69,MCI,101096.0,6899.0,970229.0,3628.0,False


In [19]:
combined.to_csv('../modelling/df_ADNI_multiclass_modelling.csv')

In [20]:
combined["DX"].unique()

array(['CN', 'Dementia', 'MCI'], dtype=object)

In [21]:
#From the DX column, create a new column called label 
combined["label"] = ((combined["DX"] == "Dementia") | (combined["DX"] == "MCI") | (combined["DX"] == "AD")).astype(int)

#drop the DX column 
combined.drop(columns=['DX'], axis=1, inplace=True)


In [22]:
#Perform train test split 
X=combined.drop('label', axis=1)
y=combined['label']

X_train,X_test, y_train, y_test=train_test_split(X,y, random_state=42, stratify=y)

In [23]:
X_train

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed
37,005_S_0814,F,71,68924.0,,876079.0,,True
423,053_S_0919,M,63,31060.0,8425.0,1159000.0,4013.0,False
436,057_S_1265,F,82,18372.0,6135.0,856639.0,2695.0,False
437,057_S_1269,M,74,43176.0,6714.0,1028650.0,3860.0,False
410,052_S_0671,F,61,11480.0,6753.0,957807.0,3046.0,False
...,...,...,...,...,...,...,...,...
537,099_S_0040,M,73,43577.0,7595.0,978138.0,4157.0,False
210,022_S_1394,M,77,59120.0,7398.0,1079730.0,2971.0,False
741,136_S_0186,F,81,48795.0,7774.0,1023240.0,3990.0,False
29,005_S_0223,F,78,34337.0,5565.0,964368.0,1870.0,False


In [39]:
com_train, comb_test = train_test_split(combined, random_state=42, stratify= combined["label"])

In [40]:
com_train

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,ID,Sex,label,Train
597,77.0,59120.0,7398.0,1079730.0,2971.0,0.0,022_S_1394,M,1,True
549,85.0,89593.0,7714.0,1055280.0,5731.0,0.0,052_S_0951,M,1,True
90,77.0,63883.0,5922.0,809713.0,2944.0,0.0,023_S_0093,F,1,False
276,79.0,39315.0,6284.0,978282.0,2449.0,0.0,130_S_0783,F,1,True
518,78.0,48615.0,5159.0,994442.0,2946.0,0.0,099_S_1144,M,1,True
...,...,...,...,...,...,...,...,...,...,...
195,71.0,38374.0,6479.6,964788.0,3064.4,1.0,007_S_0041,F,1,True
173,74.0,11759.0,7189.0,924970.0,3711.0,0.0,128_S_0229,F,0,True
117,78.0,71910.0,4870.0,883921.0,2657.0,0.0,018_S_0406,M,1,False
47,78.0,19606.0,7064.0,1112520.0,3516.0,0.0,137_S_0283,M,0,False


In [42]:
X_train

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,ID,Sex,label,Train
0,71.0,68924.0,5164.8,876079.0,2840.2,1.0,005_S_0814,F,1,True
1,63.0,31060.0,8425.0,1159000.0,4013.0,0.0,053_S_0919,M,1,True
2,82.0,18372.0,6135.0,856639.0,2695.0,0.0,057_S_1265,F,1,True
3,74.0,43176.0,6714.0,1028650.0,3860.0,0.0,057_S_1269,M,1,True
4,61.0,11480.0,6753.0,957807.0,3046.0,0.0,052_S_0671,F,1,True
...,...,...,...,...,...,...,...,...,...,...
596,73.0,43577.0,7595.0,978138.0,4157.0,0.0,099_S_0040,M,1,True
597,77.0,59120.0,7398.0,1079730.0,2971.0,0.0,022_S_1394,M,1,True
598,81.0,48795.0,7774.0,1023240.0,3990.0,0.0,136_S_0186,F,1,True
599,78.0,34337.0,5565.0,964368.0,1870.0,0.0,005_S_0223,F,1,True


In [25]:
#drop the empty rows in Ventricles and WholeBrain 
X_train.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)
X_test.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)


In [26]:
#Create a new column which specifies if part is in X_train or X_test 
combined['Train']=combined.ID.isin(X_train.ID)
combined

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,label,Train
0,002_S_0295,M,85,39130.0,7068.0,1076350.0,3619.0,False,0,True
1,002_S_0413,F,76,29098.0,6905.0,1052840.0,4133.0,False,0,False
2,002_S_0559,M,79,33656.0,7651.0,1096860.0,3974.0,False,0,True
3,002_S_0619,M,78,109900.0,5812.0,1075140.0,2773.0,False,1,True
4,002_S_0685,F,90,39103.0,7194.0,989449.0,4201.0,False,0,True
...,...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,54143.0,5743.0,1052020.0,2398.0,False,0,True
804,941_S_1203,M,83,30573.0,7142.0,903945.0,3059.0,False,0,False
805,941_S_1295,M,77,66213.0,,1001320.0,,True,1,True
806,941_S_1311,M,69,101096.0,6899.0,970229.0,3628.0,False,1,False


In [27]:
X_test.shape

(199, 8)

In [28]:
X_train.shape

(601, 8)

In [29]:
#Use the KNN Imputer to predict the missing values 

col=['Age', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Imputed']
#define imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# fit on the dataset
imputer.fit(X_train.drop(columns=['ID', 'Sex'], axis=1))


# transform X_train und X_test
X_train_imp = pd.DataFrame(imputer.transform(X_train.drop(columns=['ID', 'Sex'], axis=1)), columns=col)
X_test_imp = pd.DataFrame(imputer.transform(X_test.drop(columns=['ID', 'Sex'], axis=1)), columns=col)


In [30]:
X_train_imp["ID"] = X_train.reset_index(drop=True)["ID"]
X_test_imp["ID"] = X_test.reset_index(drop=True)["ID"]

X_train_imp["Sex"] = X_train.reset_index(drop=True)["Sex"]
X_test_imp["Sex"] = X_test.reset_index(drop=True)["Sex"]

In [31]:
X_test = X_test_imp
X_train = X_train_imp

In [32]:
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
X_train.isnull().sum()
X_test.isnull().sum()

Age            0
Ventricles     0
Hippocampus    0
WholeBrain     0
Entorhinal     0
Imputed        0
ID             0
Sex            0
dtype: int64

In [33]:
X_train["label"] = y_train.reset_index(drop=True)
X_train["Train"] = True
X_test["label"] = y_test.reset_index(drop=True)
X_test["Train"] = False

In [34]:
X_test

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,ID,Sex,label,Train
0,72.0,25660.0,8308.0,1156120.0,4310.0,0.0,072_S_0315,M,0,False
1,81.0,56627.0,5753.8,905090.0,2973.0,1.0,021_S_0424,M,1,False
2,71.0,44972.0,6997.0,1132390.0,3642.0,1.0,094_S_1090,M,1,False
3,78.0,38478.0,7144.0,925721.0,3871.0,0.0,109_S_1013,M,0,False
4,76.0,75829.0,5831.8,952156.0,3435.0,1.0,130_S_1201,F,1,False
...,...,...,...,...,...,...,...,...,...,...
194,74.0,49959.0,5832.0,1084990.0,3400.0,0.0,037_S_0150,M,1,False
195,78.0,23532.0,5067.0,1021830.0,2770.0,0.0,036_S_0673,M,1,False
196,83.0,68179.0,6171.0,1084750.0,3820.0,0.0,029_S_0836,M,0,False
197,74.0,34372.0,4810.0,983066.0,2456.0,0.0,023_S_0887,F,1,False


In [35]:
combined= pd.concat([X_train,X_test], axis=0)

In [36]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 0 to 198
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          800 non-null    float64
 1   Ventricles   800 non-null    float64
 2   Hippocampus  800 non-null    float64
 3   WholeBrain   800 non-null    float64
 4   Entorhinal   800 non-null    float64
 5   Imputed      800 non-null    float64
 6   ID           800 non-null    object 
 7   Sex          800 non-null    object 
 8   label        800 non-null    int64  
 9   Train        800 non-null    bool   
dtypes: bool(1), float64(6), int64(1), object(2)
memory usage: 63.3+ KB


In [37]:
#Save the different dataframes 
combined.to_csv('../modelling/tables/df_ADNI_modelling.csv', index=False)

In [38]:
combined.sample(10)

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,ID,Sex,label,Train
196,72.0,15289.0,7859.0,1121590.0,3289.0,0.0,027_S_0120,M,1,True
529,72.0,35897.0,7100.0,1139320.0,2931.0,0.0,123_S_0162,M,1,True
78,80.0,28087.0,5561.0,795508.0,2634.0,0.0,057_S_0839,F,1,True
305,85.0,49271.0,6169.0,983626.0,4187.0,0.0,941_S_1194,M,1,True
504,72.0,25444.0,7698.0,1139480.0,3885.0,0.0,128_S_0545,M,0,True
162,80.0,98380.0,4393.0,916457.0,1467.0,0.0,114_S_0228,F,1,True
276,79.0,39315.0,6284.0,978282.0,2449.0,0.0,130_S_0783,F,1,True
365,79.0,49899.0,5081.0,927736.0,2670.0,0.0,132_S_0987,F,1,True
5,78.0,28895.0,5037.2,816578.0,2670.2,1.0,109_S_1114,F,1,False
211,74.0,57735.0,5521.6,971227.0,2668.2,1.0,094_S_0531,F,0,True
