### In this notebook we check the compatibility of the numerical (TADPOLE) and image (FreeSurfer MRI) data from ADNI, and build the train/test tables used for modelling

In [1]:
# Load the required packages 
import pandas as pd 
import numpy as np 
import sys
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
sys.path.append('../')


In [2]:
# Load functions to get the dataset and process it 
from preprocessing.getdata import get_csvdata, get_tadpole, drop_tadpole, col_tadpole, get_csvdata_ADNI

In [3]:
# Load the ADNI image metadata (FreeSurfer-processed MRI scans, per the Description column in the preview).
# drop_MCI=False presumably keeps the MCI subjects in the frame — verify in preprocessing.getdata.
df_img=get_csvdata_ADNI(drop_MCI=False)

In [4]:
# Show the first two rows to check which image-metadata columns were loaded.
df_img.head(2)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded,label,dataset
0,I167590,002_S_0295,CN,M,85,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,4/18/2006,MGH,8/23/2022,0,ADNI
2,I252117,002_S_0413,CN,F,76,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/02/2006,MGH,8/23/2022,0,ADNI


In [5]:
# (rows, columns) of the image-metadata frame.
df_img.shape

(831, 14)

In [6]:
# Load the dataframe from the TADPOLE challenge.
# drop_MCI=False presumably keeps the MCI subjects in the frame — verify in preprocessing.getdata.
df_num = get_tadpole(drop_MCI=False)

  df_num = get_tadpole(drop_MCI=False)


In [7]:
# Show the first two rows of the TADPOLE frame.
df_num.head(2)

Unnamed: 0,RID,ID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label
0,2,011_S_0002,bl,11,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,...,,1.36665,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,2005-09-12,AD,81.3,Male,...,22.83,1.08355,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,1


In [8]:
# (rows, columns) of the TADPOLE frame, for comparison with df_img.
df_num.shape

(819, 114)

In [9]:
# Inner-join the image metadata with the TADPOLE table on the shared ID column.
# Only IDs present in both frames survive, so the resulting row count shows the overlap.
combined = df_img.merge(df_num, on='ID', how='inner')

In [10]:
# Inspect the first ten merged rows.
combined.head(10)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label_y
0,I167590,002_S_0295,CN,M,85,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,4/18/2006,...,34.73,,,,0.0,0.0,0,0,2019-02-14 23:58:29.0,0
1,I252117,002_S_0413,CN,F,76,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/02/2006,...,10.57,,,,0.0,0.0,0,0,2019-02-14 23:58:30.0,0
2,I252119,002_S_0559,CN,M,79,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,5/23/2006,...,15.82,,,,0.0,0.0,0,0,2019-02-14 23:58:31.0,0
3,I173018,002_S_0619,AD,M,78,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,6/01/2006,...,18.56,,,,0.0,0.0,0,0,2019-02-14 23:58:31.0,1
4,I172953,002_S_0685,CN,F,90,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,7/06/2006,...,22.82,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
5,I177712,002_S_0729,MCI,F,65,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,7/17/2006,...,31.69,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
6,I177728,002_S_0782,MCI,M,82,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,8/14/2006,...,,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,0
7,I177744,002_S_0816,AD,M,71,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,8/30/2006,...,27.24,,,,0.0,0.0,0,0,2019-02-14 23:58:32.0,1
8,I178763,002_S_0938,AD,F,82,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,10/05/2006,...,,,,,0.0,0.0,0,0,2019-02-14 23:58:33.0,1
9,I210021,002_S_0954,MCI,F,69,1,MRI,FreeSurfer Cross-Sectional Processing brainmask,Post-processed,10/10/2006,...,34.65,,,,0.0,0.0,0,0,2019-02-14 23:58:33.0,0


In [11]:
# (rows, columns) after the inner join; the row count is the number of ID matches between the two frames.
combined.shape

(808, 127)

In [12]:
# List the merged column names (pandas abbreviates the middle of a long Index).
combined.columns

Index(['Image Data ID', 'ID', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date',
       ...
       'PTAU_bl', 'FDG_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl',
       'Month', 'M', 'update_stamp', 'label_y'],
      dtype='object', length=127)

In [13]:
# Keep only the identifier, sex, age, diagnosis and the four numeric measures used downstream.
# .copy() makes `combined` an independent frame: the following cells modify it in place, and doing
# that on a slice of the merged frame raises SettingWithCopyWarning.
combined = combined[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal']].copy()

In [14]:
# Confirm the reduced frame holds only the selected columns.
combined.head()

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0


In [15]:
# Count missing values per column to decide which rows to drop and which to impute.
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       8
Hippocampus    147
WholeBrain       5
Entorhinal     147
dtype: int64

In [16]:
# Drop the rows that have no Ventricles or no WholeBrain value.
# The result is re-assigned instead of using inplace=True, so `combined` is never
# mutated in place while it may still be a slice of the merged frame.
combined = combined.dropna(subset=['Ventricles', 'WholeBrain'])
combined

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0
...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,CN,54143.0,5743.0,1052020.0,2398.0
804,941_S_1203,M,83,CN,30573.0,7142.0,903945.0,3059.0
805,941_S_1295,M,77,MCI,66213.0,,1001320.0,
806,941_S_1311,M,69,MCI,101096.0,6899.0,970229.0,3628.0


In [17]:
# Re-count missing values after the drop: only Hippocampus and Entorhinal should still have gaps.
combined.isnull().sum()

ID               0
Sex              0
Age              0
DX               0
Ventricles       0
Hippocampus    142
WholeBrain       0
Entorhinal     142
dtype: int64

In [18]:
# Flag every row in which at least one of Hippocampus / Entorhinal is missing.
combined['Imputed'] = combined[['Hippocampus', 'Entorhinal']].isna().any(axis=1)
combined

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed
0,002_S_0295,M,85,CN,39130.0,7068.0,1076350.0,3619.0,False
1,002_S_0413,F,76,CN,29098.0,6905.0,1052840.0,4133.0,False
2,002_S_0559,M,79,CN,33656.0,7651.0,1096860.0,3974.0,False
3,002_S_0619,M,78,Dementia,109900.0,5812.0,1075140.0,2773.0,False
4,002_S_0685,F,90,CN,39103.0,7194.0,989449.0,4201.0,False
...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,CN,54143.0,5743.0,1052020.0,2398.0,False
804,941_S_1203,M,83,CN,30573.0,7142.0,903945.0,3059.0,False
805,941_S_1295,M,77,MCI,66213.0,,1001320.0,,True
806,941_S_1311,M,69,MCI,101096.0,6899.0,970229.0,3628.0,False


In [19]:
# Derive the binary target from DX: 1 for "Dementia" or "MCI", 0 for everything else,
# then remove DX. A single assign/drop chain returns a fresh frame instead of mutating in place.
# NOTE(review): the loader's own label column appears to leave MCI rows at 0 in the merged
# preview — confirm which label definition the modelling step expects.
combined = combined.assign(
    label=combined["DX"].isin(["Dementia", "MCI"]).astype(int)
).drop(columns="DX")


In [20]:
# Separate the target from the features, then split into train and test sets.
y = combined['label']
X = combined.drop(columns='label')

# test_size is left at the library default; stratify=y keeps the label ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [21]:
# Inspect the training features (ID and Sex are still present; nothing is imputed yet).
X_train

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed
372,037_S_0539,M,74,48100.0,5694.0,933795.0,3038.0,False
485,072_S_1380,M,85,65072.0,6219.0,867814.0,3042.0,False
62,007_S_1248,F,80,59256.0,,877444.0,,True
104,012_S_0720,F,78,50845.0,4023.0,865121.0,1949.0,False
450,062_S_1299,M,72,62355.0,6648.0,1142220.0,3768.0,False
...,...,...,...,...,...,...,...,...
719,131_S_0691,M,64,79862.0,,962038.0,,True
727,133_S_0525,F,70,44058.0,6443.0,870255.0,3651.0,False
624,123_S_0390,M,81,75047.0,,996416.0,,True
101,012_S_0637,F,76,25525.0,6126.0,955639.0,3116.0,False


In [22]:
# Inspect the test features (ID and Sex are still present; nothing is imputed yet).
X_test

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed
667,128_S_0245,F,74,60763.0,8005.0,1039760.0,3022.0,False
687,128_S_1242,F,71,16777.0,7246.0,896521.0,5317.0,False
169,018_S_0450,M,69,35281.0,,1095870.0,,True
724,133_S_0433,F,86,20593.0,7203.0,856263.0,3237.0,False
392,041_S_1010,M,74,54880.0,5166.0,1108880.0,2709.0,False
...,...,...,...,...,...,...,...,...
520,094_S_1188,F,81,33863.0,,963964.0,,True
381,041_S_0262,M,86,25868.0,7975.0,965385.0,3588.0,False
707,130_S_1200,M,85,60313.0,,1125500.0,,True
108,012_S_1009,M,76,49937.0,7202.0,1181100.0,3965.0,False


In [23]:
# Record the split membership in `combined`, matched by ID:
# True when the row's ID is in X_train, False otherwise (i.e. it went to X_test).
combined['Train'] = combined['ID'].isin(X_train['ID'])
combined

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,Imputed,label,Train
0,002_S_0295,M,85,39130.0,7068.0,1076350.0,3619.0,False,0,True
1,002_S_0413,F,76,29098.0,6905.0,1052840.0,4133.0,False,0,False
2,002_S_0559,M,79,33656.0,7651.0,1096860.0,3974.0,False,0,True
3,002_S_0619,M,78,109900.0,5812.0,1075140.0,2773.0,False,1,True
4,002_S_0685,F,90,39103.0,7194.0,989449.0,4201.0,False,0,True
...,...,...,...,...,...,...,...,...,...,...
803,941_S_1202,M,78,54143.0,5743.0,1052020.0,2398.0,False,0,True
804,941_S_1203,M,83,30573.0,7142.0,903945.0,3059.0,False,0,False
805,941_S_1295,M,77,66213.0,,1001320.0,,True,1,True
806,941_S_1311,M,69,101096.0,6899.0,970229.0,3628.0,False,1,False


In [24]:
# (rows, columns) of the test split.
X_test.shape

(200, 8)

In [25]:
# Use the KNN imputer to fill in the missing Hippocampus / Entorhinal values.

# ID and Sex are not numeric features, so they are left out of the imputation.
# errors='ignore' keeps this cell re-runnable after X_train / X_test have been overwritten below.
train_features = X_train.drop(columns=['ID', 'Sex'], errors='ignore')
test_features = X_test.drop(columns=['ID', 'Sex'], errors='ignore')

# Take the column names from the frame that is actually imputed, so the labels
# cannot drift out of sync with the data.
col = list(train_features.columns)

# NOTE(review): the features are not scaled, so the nan-euclidean distance is presumably
# dominated by the largest-magnitude columns — confirm this is intended.
# The boolean 'Imputed' flag also passes through the imputer and comes back as a float column.
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# Fit on the training split only, so the test split does not influence the imputation.
imputer.fit(train_features)

# Transform both splits. The original row index is kept so that the imputed rows
# can still be matched to y_train / y_test and to `combined`.
X_train = pd.DataFrame(imputer.transform(train_features), columns=col, index=train_features.index)
X_test = pd.DataFrame(imputer.transform(test_features), columns=col, index=test_features.index)

In [26]:
# X_train / X_test are already DataFrames after the imputation cell, so they are not re-wrapped here.
# Only the training split is displayed, so guard the test split with an assertion.
assert not X_test.isnull().values.any(), "X_test still contains missing values"
X_train.isnull().sum()

Age            0
Ventricles     0
Hippocampus    0
WholeBrain     0
Entorhinal     0
Imputed        0
dtype: int64

In [28]:
# Re-apply the feature names to both splits. The imputation cell already passes the
# same names when it rebuilds the frames, so this only restates them.
col = ['Age', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', 'Imputed']
X_train.columns = X_test.columns = col


In [30]:
# Write the full table and both imputed feature splits to the modelling folder.
# NOTE(review): y_train / y_test are not written here — presumably the labels are
# recovered from the 'label' and 'Train' columns of the combined file; confirm.
for frame, path in (
    (combined, '../modelling/df_ADNI_modelling.csv'),
    (X_train, '../modelling/X_train_ADNI_modelling.csv'),
    (X_test, '../modelling/X_test_ADNI_modelling.csv'),
):
    frame.to_csv(path)