### In this notebook we want to check the compatibility of the numerical and image data from ADNI

In [112]:
# Load the required packages 
import pandas as pd 
import numpy as np 
import sys
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
sys.path.append('../')


In [113]:
# Load functions to get the dataset and process it 
from preprocessing.getdata import get_csvdata, get_tadpole, drop_tadpole, col_tadpole, get_csvdata_ADNI

In [115]:
# Load the Freesurfer dataframe 
df_img=get_csvdata_ADNI(drop_MCI=False)

In [116]:
df_a_train, df_a_test = train_test_split(df_img, stratify=df_img["label"], random_state=42)

In [117]:
df_img.head(2)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded,label,dataset
0,I167590,002_S_0295,CN,M,85,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,4/18/2006,MGH,8/23/2022,0,ADNI
2,I252117,002_S_0413,CN,F,76,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/02/2006,MGH,8/23/2022,0,ADNI


In [118]:
df_img.shape

(831, 14)

In [119]:
#Load the dataframe from the Tadpole challenge 
df_num = get_tadpole(drop_MCI=False)

  df_num = get_tadpole(drop_MCI=False)


In [120]:
df_num.head(2)

Unnamed: 0,RID,ID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label
0,2,011_S_0002,bl,11,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,...,,1.36665,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,2005-09-12,AD,81.3,Male,...,22.83,1.08355,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,1


In [121]:
df_num.shape

(819, 114)

In [122]:
# Merge the two dataframes on the ID column and find out how many entries overlap 

combined = pd.merge(df_img, df_num, how='inner', on='ID')
com_train = pd.merge(df_a_train, df_num, how='inner', on='ID')
com_test =pd.merge(df_a_test, df_num, how='inner', on='ID')

In [123]:
combined.shape

(808, 127)

In [124]:
combined.columns

Index(['Image Data ID', 'ID', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date',
       ...
       'PTAU_bl', 'FDG_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl',
       'Month', 'M', 'update_stamp', 'label_y'],
      dtype='object', length=127)

In [125]:
combined = combined[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]
com_train = com_train[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]
com_test = com_test[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]

In [126]:
combined.head()

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0,28.0
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0,29.0
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0,30.0
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0,22.0
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0,30.0


In [127]:
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       8
Hippocampus    147
WholeBrain       5
Entorhinal     147
MMSE             0
dtype: int64

In [128]:
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       8
Hippocampus    147
WholeBrain       5
Entorhinal     147
MMSE             0
dtype: int64

In [129]:
combined['Imputed']=(combined['Hippocampus'].isna() | combined['Entorhinal'].isna())
com_train['Imputed']=(com_train['Hippocampus'].isna() | com_train['Entorhinal'].isna())
com_test['Imputed']=(com_test['Hippocampus'].isna() | com_test['Entorhinal'].isna())
combined

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0,28.0,False
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0,29.0,False
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0,30.0,False
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0,22.0,False
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0,30.0,False
...,...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,CN,54143.0,5743.0,1052020.0,2398.0,28.0,False
804,941_S_1203,M,83,CN,30573.0,7142.0,903945.0,3059.0,30.0,False
805,941_S_1295,M,77,MCI,66213.0,,1001320.0,,28.0,True
806,941_S_1311,M,69,MCI,101096.0,6899.0,970229.0,3628.0,29.0,False


In [130]:
combined.to_csv('../modelling/df_ADNI_multiclass_modelling.csv')

In [131]:
combined["DX"].unique()

array(['CN', 'Dementia', 'MCI'], dtype=object)

In [132]:
#From the DX column, create a new column called label 
combined["label"] = ((combined["DX"] == "Dementia") | (combined["DX"] == "MCI") | (combined["DX"] == "AD")).astype(int)
com_train["label"] = ((com_train["DX"] == "Dementia") | (com_train["DX"] == "MCI") | (com_train["DX"] == "AD")).astype(int)
com_test["label"] = ((com_test["DX"] == "Dementia") | (com_test["DX"] == "MCI") | (com_test["DX"] == "AD")).astype(int)

#drop the DX column 
combined.drop(columns=['DX'], axis=1, inplace=True)
com_train.drop(columns=['DX'], axis=1, inplace=True)
com_test.drop(columns=['DX'], axis=1, inplace=True)


In [133]:
#Perform train test split 
X_train =com_train.drop('label', axis=1)
X_test =com_test.drop('label', axis=1)
y_train=com_train['label']
y_test=com_test['label']


In [134]:
X_train

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed
0,130_S_0956,F,64,12361.0,,1087730.0,,23.0,True
1,018_S_0043,M,76,33775.0,7590.0,1069420.0,4381.0,29.0,False
2,068_S_0476,F,76,30916.0,6140.0,870528.0,3671.0,26.0,False
3,127_S_1419,M,78,39896.0,7498.0,1181030.0,3979.0,28.0,False
4,023_S_1046,M,72,100947.0,6354.0,1200500.0,2756.0,25.0,False
...,...,...,...,...,...,...,...,...,...
597,136_S_0874,F,66,17187.0,6224.0,1063190.0,2623.0,25.0,False
598,131_S_0691,M,64,79862.0,,962038.0,,24.0,True
599,010_S_0786,M,75,38770.0,5465.0,932074.0,2933.0,26.0,False
600,131_S_0497,M,76,91840.0,6441.0,1087580.0,3757.0,23.0,False


In [135]:
#drop the empty rows in Ventricles and WholeBrain 
X_train.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)
X_test.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)


In [136]:
#Create a new column which specifies if part is in X_train or X_test 
combined['Train']=combined.ID.isin(X_train.ID)
combined

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,label,Train
0,002_S_0295,M,85,39130.0,7068.0,1076350.0,3619.0,28.0,False,0,True
1,002_S_0413,F,76,29098.0,6905.0,1052840.0,4133.0,29.0,False,0,False
2,002_S_0559,M,79,33656.0,7651.0,1096860.0,3974.0,30.0,False,0,True
3,002_S_0619,M,78,109900.0,5812.0,1075140.0,2773.0,22.0,False,1,True
4,002_S_0685,F,90,39103.0,7194.0,989449.0,4201.0,30.0,False,0,True
...,...,...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,54143.0,5743.0,1052020.0,2398.0,28.0,False,0,True
804,941_S_1203,M,83,30573.0,7142.0,903945.0,3059.0,30.0,False,0,False
805,941_S_1295,M,77,66213.0,,1001320.0,,28.0,True,1,False
806,941_S_1311,M,69,101096.0,6899.0,970229.0,3628.0,29.0,False,1,False


In [137]:
X_test.shape

(204, 9)

In [138]:
X_train.shape

(596, 9)

In [142]:
X_train.columns

Index(['ID', 'Sex', 'Age', 'Ventricles', 'Hippocampus', 'WholeBrain',
       'Entorhinal', 'MMSE', 'Imputed'],
      dtype='object')

In [143]:
#Use the KNN Imputer to predict the missing values 

col=['Age', 'Ventricles', 'Hippocampus', 'WholeBrain',
       'Entorhinal', 'MMSE', 'Imputed']
#define imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# fit on the dataset
imputer.fit(X_train.drop(columns=['ID', 'Sex'], axis=1))


# transform X_train und X_test
X_train_imp = pd.DataFrame(imputer.transform(X_train.drop(columns=['ID', 'Sex'], axis=1)), columns=col)
X_test_imp = pd.DataFrame(imputer.transform(X_test.drop(columns=['ID', 'Sex'], axis=1)), columns=col)


In [144]:
X_train_imp["ID"] = X_train.reset_index(drop=True)["ID"]
X_test_imp["ID"] = X_test.reset_index(drop=True)["ID"]

X_train_imp["Sex"] = X_train.reset_index(drop=True)["Sex"]
X_test_imp["Sex"] = X_test.reset_index(drop=True)["Sex"]

In [145]:
X_test = X_test_imp
X_train = X_train_imp

In [146]:
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
X_train.isnull().sum()
X_test.isnull().sum()

Age            0
Ventricles     0
Hippocampus    0
WholeBrain     0
Entorhinal     0
MMSE           0
Imputed        0
ID             0
Sex            0
dtype: int64

In [147]:
X_train["label"] = y_train.reset_index(drop=True)
X_train["Train"] = True
X_test["label"] = y_test.reset_index(drop=True)
X_test["Train"] = False

In [148]:
X_test

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,ID,Sex,label,Train
0,73.0,39367.0,6146.0,850330.0,3707.0,29.0,0.0,114_S_0416,F,0,False
1,73.0,40800.0,5702.0,915118.0,3339.0,24.0,0.0,023_S_1247,F,1,False
2,88.0,79554.0,5338.0,958537.0,3317.0,24.0,0.0,128_S_1088,M,1,False
3,76.0,23794.0,6918.6,924107.0,3651.6,23.0,1.0,126_S_0784,F,1,False
4,87.0,59530.0,6326.2,945561.0,3577.4,26.0,1.0,141_S_0697,M,1,False
...,...,...,...,...,...,...,...,...,...,...,...
199,77.0,33503.0,5471.0,788112.0,3256.0,29.0,0.0,099_S_0352,F,0,False
200,79.0,29915.0,6813.0,928173.0,4048.0,30.0,0.0,027_S_0116,M,0,False
201,72.0,44550.0,5683.0,975848.0,2535.0,22.0,0.0,128_S_0740,M,0,False
202,83.0,29096.0,5946.0,959442.0,3688.0,25.0,0.0,021_S_0626,M,1,False


In [149]:
combined= pd.concat([X_train,X_test], axis=0)

In [150]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 0 to 203
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          800 non-null    float64
 1   Ventricles   800 non-null    float64
 2   Hippocampus  800 non-null    float64
 3   WholeBrain   800 non-null    float64
 4   Entorhinal   800 non-null    float64
 5   MMSE         800 non-null    float64
 6   Imputed      800 non-null    float64
 7   ID           800 non-null    object 
 8   Sex          800 non-null    object 
 9   label        800 non-null    int64  
 10  Train        800 non-null    bool   
dtypes: bool(1), float64(7), int64(1), object(2)
memory usage: 69.5+ KB


In [151]:
#Save the different dataframes 
combined.to_csv('../modelling/tables/df_ADNI_modelling.csv', index=False)

In [152]:
combined.sample(10)

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,ID,Sex,label,Train
325,72.0,27286.0,6851.4,1012940.0,3557.6,26.0,1.0,029_S_0878,M,0,True
319,77.0,72975.0,8609.0,1180660.0,3342.0,28.0,0.0,035_S_0555,M,1,True
447,76.0,123032.0,6177.8,1088450.0,3257.2,26.0,1.0,128_S_0167,M,1,True
498,70.0,45989.0,4682.0,870569.0,2226.0,22.0,0.0,027_S_1082,F,1,True
123,72.0,37297.0,4622.0,927419.0,2181.0,29.0,0.0,007_S_0249,F,1,False
234,68.0,18547.0,6841.0,905488.0,3210.0,26.0,0.0,031_S_0321,M,0,True
522,81.0,27225.0,6384.6,913315.0,3710.6,29.0,1.0,052_S_1168,F,1,True
177,88.0,24514.0,5628.0,968563.0,2612.8,20.0,1.0,098_S_0149,M,0,False
19,82.0,99064.0,6307.8,990215.0,2990.6,18.0,1.0,013_S_0699,M,1,True
544,77.0,22877.0,6815.0,1051430.0,3460.0,30.0,0.0,100_S_0035,M,1,True
