### In this notebook we want to check the compatibility of the numerical and image data from ADNI

For this we combine the Freesurfer data for the images with the numerical data from the Tadpole challenge which contains data from brain measurements. 

In [1]:
# Load the required packages 
import pandas as pd 
import numpy as np 
import sys
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
sys.path.append('../')

# Load functions to get the dataset and process it 
from getdata import get_csvdata_OASIS, get_tadpole, drop_tadpole, col_tadpole, get_csvdata_ADNI


In [2]:
# Load the Freesurfer dataframe 
df_img=get_csvdata_ADNI(drop_MCI=False)

In [3]:
#Perform a train-test-split
df_a_train, df_a_test = train_test_split(df_img, stratify=df_img["label"], random_state=42)

In [4]:
df_img.head(2)

Unnamed: 0,Image Data ID,ID,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded,label,dataset
806,I193933,003_S_1057,MCI,F,62,2,PET,"Coreg, warp, norm",Post-processed,2/16/2007,DCM,,1,ADNI
805,I193938,003_S_1059,AD,F,85,2,PET,"Coreg, warp, norm",Post-processed,1/10/2007,DCM,,1,ADNI


In [5]:
df_img.shape

(404, 14)

In [6]:
#Load the dataframe from the Tadpole challenge 
df_num = get_tadpole(drop_MCI=False)

  df_num = get_tadpole(drop_MCI=False)


In [7]:
df_num.head(2)

Unnamed: 0,RID,ID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,...,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp,label
0,2,011_S_0002,bl,11,ADNI1,ADNI1,2005-09-08,CN,74.3,Male,...,,1.36665,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,2005-09-12,AD,81.3,Male,...,22.83,1.08355,,,0.0,0.0,0,0,2019-02-14 23:58:27.0,1


In [8]:
df_num.shape

(819, 114)

In [9]:
# Merge the two dataframes on the ID column and find out how many entries overlap 

combined = pd.merge(df_img, df_num, how='inner', on='ID')
com_train = pd.merge(df_a_train, df_num, how='inner', on='ID')
com_test =pd.merge(df_a_test, df_num, how='inner', on='ID')

In [10]:
combined.shape

(402, 127)

In [11]:
combined.columns

Index(['Image Data ID', 'ID', 'Group', 'Sex', 'Age', 'Visit', 'Modality',
       'Description', 'Type', 'Acq Date',
       ...
       'PTAU_bl', 'FDG_bl', 'PIB_bl', 'AV45_bl', 'Years_bl', 'Month_bl',
       'Month', 'M', 'update_stamp', 'label_y'],
      dtype='object', length=127)

In [12]:
#For the combined dataframe defined two dataframes for the train and test set 
combined = combined[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]
com_train = com_train[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]
com_test = com_test[['ID', 'Sex', 'Age', 'DX', 'Ventricles', 'Hippocampus', 'WholeBrain', 'Entorhinal', "MMSE"]]

In [13]:
combined.head()

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE
0,003_S_1057,F,62,MCI,15196.0,6664.0,958549.0,3527.0,26.0
1,003_S_1059,F,85,Dementia,68842.0,4981.0,824316.0,1789.0,25.0
2,003_S_1074,F,85,MCI,23765.0,5427.0,970745.0,2137.0,28.0
3,003_S_1122,F,77,MCI,18937.0,7695.0,764118.0,3255.0,28.0
4,003_S_1257,M,85,Dementia,132663.0,6372.0,1088690.0,3080.0,20.0


In [14]:
combined.isnull().sum()

ID              0
Sex             0
Age             0
DX              0
Ventricles      6
Hippocampus    70
WholeBrain      4
Entorhinal     70
MMSE            0
dtype: int64

In [16]:
#Create a new column which contains information whether values are imputed or not 
combined['Imputed']=(combined['Hippocampus'].isna() | combined['Entorhinal'].isna())
com_train['Imputed']=(com_train['Hippocampus'].isna() | com_train['Entorhinal'].isna())
com_test['Imputed']=(com_test['Hippocampus'].isna() | com_test['Entorhinal'].isna())
combined

Unnamed: 0,ID,Sex,Age,DX,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed
0,003_S_1057,F,62,MCI,15196.0,6664.0,958549.0,3527.0,26.0,False
1,003_S_1059,F,85,Dementia,68842.0,4981.0,824316.0,1789.0,25.0,False
2,003_S_1074,F,85,MCI,23765.0,5427.0,970745.0,2137.0,28.0,False
3,003_S_1122,F,77,MCI,18937.0,7695.0,764118.0,3255.0,28.0,False
4,003_S_1257,M,85,Dementia,132663.0,6372.0,1088690.0,3080.0,20.0,False
...,...,...,...,...,...,...,...,...,...,...
397,941_S_1197,F,83,CN,31364.0,6410.0,887612.0,3903.0,30.0,False
398,941_S_1202,M,78,CN,54143.0,5743.0,1052020.0,2398.0,28.0,False
399,941_S_1203,M,84,CN,30573.0,7142.0,903945.0,3059.0,30.0,False
400,941_S_1295,M,77,MCI,66213.0,,1001320.0,,28.0,True


In [18]:
combined["DX"].unique()

array(['MCI', 'Dementia', 'CN'], dtype=object)

In [19]:
#From the DX column, create a new column called label 
#Here the MCI and Dementia are combined 
combined["label"] = ((combined["DX"] == "Dementia") | (combined["DX"] == "MCI") | (combined["DX"] == "AD")).astype(int)
com_train["label"] = ((com_train["DX"] == "Dementia") | (com_train["DX"] == "MCI") | (com_train["DX"] == "AD")).astype(int)
com_test["label"] = ((com_test["DX"] == "Dementia") | (com_test["DX"] == "MCI") | (com_test["DX"] == "AD")).astype(int)

#drop the DX column 
combined.drop(columns=['DX'], axis=1, inplace=True)
com_train.drop(columns=['DX'], axis=1, inplace=True)
com_test.drop(columns=['DX'], axis=1, inplace=True)


In [20]:
#Perform train test split 
X_train =com_train.drop('label', axis=1)
X_test =com_test.drop('label', axis=1)
y_train=com_train['label']
y_test=com_test['label']


In [21]:
#Let us have a look on X_train 
X_train

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed
0,137_S_0973,M,77,53920.0,7244.0,1053590.0,4282.0,28.0,False
1,036_S_0945,M,72,39452.0,,1152890.0,,25.0,True
2,041_S_0721,M,66,26273.0,6264.0,916950.0,3180.0,29.0,False
3,137_S_0438,M,82,66806.0,5479.0,797415.0,3200.0,25.0,False
4,011_S_0003,M,81,84599.0,5319.0,1129830.0,1791.0,20.0,False
...,...,...,...,...,...,...,...,...,...
297,035_S_0555,M,77,72975.0,8609.0,1180660.0,3342.0,28.0,False
298,094_S_1188,F,81,33863.0,,963964.0,,28.0,True
299,009_S_0842,M,74,36501.0,7723.0,1112400.0,3442.0,28.0,False
300,011_S_0008,F,85,18757.0,6080.0,948684.0,4190.0,28.0,False


In [22]:
#drop the empty rows in Ventricles and WholeBrain 
X_train.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)
X_test.dropna(subset=['Ventricles', 'WholeBrain'],inplace=True, axis=0)


In [23]:
#Create a new column which specifies if part is in X_train or X_test 
combined['Train']=combined.ID.isin(X_train.ID)
combined

Unnamed: 0,ID,Sex,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,label,Train
0,003_S_1057,F,62,15196.0,6664.0,958549.0,3527.0,26.0,False,1,True
1,003_S_1059,F,85,68842.0,4981.0,824316.0,1789.0,25.0,False,1,True
2,003_S_1074,F,85,23765.0,5427.0,970745.0,2137.0,28.0,False,1,True
3,003_S_1122,F,77,18937.0,7695.0,764118.0,3255.0,28.0,False,1,True
4,003_S_1257,M,85,132663.0,6372.0,1088690.0,3080.0,20.0,False,1,False
...,...,...,...,...,...,...,...,...,...,...,...
397,941_S_1197,F,83,31364.0,6410.0,887612.0,3903.0,30.0,False,0,True
398,941_S_1202,M,78,54143.0,5743.0,1052020.0,2398.0,28.0,False,0,True
399,941_S_1203,M,84,30573.0,7142.0,903945.0,3059.0,30.0,False,0,True
400,941_S_1295,M,77,66213.0,,1001320.0,,28.0,True,1,True


In [24]:
X_test.shape

(99, 9)

In [25]:
X_train.shape

(297, 9)

In [26]:
X_train.columns

Index(['ID', 'Sex', 'Age', 'Ventricles', 'Hippocampus', 'WholeBrain',
       'Entorhinal', 'MMSE', 'Imputed'],
      dtype='object')

In [27]:
#Use the KNN Imputer to predict the missing values 

col=['Age', 'Ventricles', 'Hippocampus', 'WholeBrain',
       'Entorhinal', 'MMSE', 'Imputed']
#define imputer
imputer = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')

# fit on the dataset
imputer.fit(X_train.drop(columns=['ID', 'Sex'], axis=1))


# transform X_train und X_test
X_train_imp = pd.DataFrame(imputer.transform(X_train.drop(columns=['ID', 'Sex'], axis=1)), columns=col)
X_test_imp = pd.DataFrame(imputer.transform(X_test.drop(columns=['ID', 'Sex'], axis=1)), columns=col)


In [28]:
#Create a new column for ID and reset the index
X_train_imp["ID"] = X_train.reset_index(drop=True)["ID"]
X_test_imp["ID"] = X_test.reset_index(drop=True)["ID"]

X_train_imp["Sex"] = X_train.reset_index(drop=True)["Sex"]
X_test_imp["Sex"] = X_test.reset_index(drop=True)["Sex"]

In [29]:
X_test = X_test_imp
X_train = X_train_imp

In [30]:
#Convert X_train and X_test into a dataframe 
X_train=pd.DataFrame(X_train)
X_test=pd.DataFrame(X_test)
X_train.isnull().sum()
X_test.isnull().sum()

Age            0
Ventricles     0
Hippocampus    0
WholeBrain     0
Entorhinal     0
MMSE           0
Imputed        0
ID             0
Sex            0
dtype: int64

In [31]:
#Define the train and test labels 
X_train["label"] = y_train.reset_index(drop=True)
X_train["Train"] = True
X_test["label"] = y_test.reset_index(drop=True)
X_test["Train"] = False

In [32]:
X_test

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,ID,Sex,label,Train
0,86.0,53980.0,6326.2,946436.0,3577.4,27.0,1.0,029_S_1218,F,1,False
1,83.0,29096.0,5946.0,959442.0,3688.0,25.0,0.0,021_S_0626,M,1,False
2,74.0,16676.0,7513.0,858650.0,3469.0,29.0,0.0,094_S_0489,F,0,False
3,82.0,29415.0,6637.0,790001.0,2817.0,30.0,0.0,016_S_0359,F,0,False
4,75.0,14658.0,7665.0,1021180.0,4248.0,29.0,0.0,031_S_0618,M,0,False
...,...,...,...,...,...,...,...,...,...,...,...
94,77.0,62890.0,4951.0,828140.0,2275.0,25.0,0.0,012_S_0712,M,1,False
95,73.0,39367.0,6146.0,850330.0,3707.0,29.0,0.0,114_S_0416,F,1,False
96,74.0,93791.0,8523.0,1277240.0,4084.0,27.0,0.0,126_S_0865,M,0,False
97,78.0,23532.0,5067.0,1021830.0,2770.0,28.0,0.0,036_S_0673,M,1,False


In [33]:
#Combine X_train and X_test back into one dataframe 
combined= pd.concat([X_train,X_test], axis=0)

In [34]:
combined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 396 entries, 0 to 98
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          396 non-null    float64
 1   Ventricles   396 non-null    float64
 2   Hippocampus  396 non-null    float64
 3   WholeBrain   396 non-null    float64
 4   Entorhinal   396 non-null    float64
 5   MMSE         396 non-null    float64
 6   Imputed      396 non-null    float64
 7   ID           396 non-null    object 
 8   Sex          396 non-null    object 
 9   label        396 non-null    int64  
 10  Train        396 non-null    bool   
dtypes: bool(1), float64(7), int64(1), object(2)
memory usage: 34.4+ KB


In [35]:
#Save the combined dataframe 
combined.to_csv('../data/df_ADNI_modelling.csv', index=False)

In [36]:
#Have a look at the final dataframe 
combined.sample(10)

Unnamed: 0,Age,Ventricles,Hippocampus,WholeBrain,Entorhinal,MMSE,Imputed,ID,Sex,label,Train
291,74.0,104036.0,4692.0,950932.0,1558.0,28.0,0.0,033_S_0567,M,1,True
44,72.0,21897.0,8310.0,1040560.0,3623.0,26.0,0.0,011_S_0023,M,0,True
234,77.0,33775.0,7590.0,1069420.0,4381.0,29.0,0.0,018_S_0043,M,1,True
63,80.0,27138.0,5702.0,1050900.0,2423.0,28.0,0.0,033_S_0741,F,1,True
128,70.0,8910.0,7639.0,1001020.0,3991.0,30.0,0.0,131_S_0319,F,1,True
237,84.0,30573.0,7142.0,903945.0,3059.0,30.0,0.0,941_S_1203,M,0,True
55,85.0,42957.0,4300.0,810592.0,2052.0,21.0,0.0,109_S_1157,F,1,True
203,78.0,36709.0,6748.0,940689.0,2835.0,29.0,0.0,024_S_1063,F,0,True
85,70.0,26902.0,7431.0,983400.0,4292.0,29.0,0.0,099_S_0090,M,0,True
224,69.0,101096.0,6899.0,970229.0,3628.0,29.0,0.0,941_S_1311,M,0,True
