# Demographic Data
## Features
- MCI     = Is this a Mild cognitive impairment case?
- AD      = Is this an Alzheimer's Disease case?
- Gender  = 1 Female, 0 Male
- Age     = in years
- Exclude = If 1, this case has issues in brain imaging (do not use)
- Height  = in cm
- Weight  = in kg
## Summary
### ADNI
- 4459 records, 8 features
- Gender, height and weight are entirely missing, so these columns were dropped from the dataset
- No patients to exclude from dataset
- Dropped exclude column
- Patients with AD: 1019, Patients without AD: 3440
### Sheffield
- 104 records, 8 features
- 3 patients to exclude, dropped from dataframe
- Dropped exclude column
- Patients with AD: 32, Patients without AD: 69

In [22]:
import numpy as np
import pandas as pd

# ADNI DATA

In [23]:
"""
Check for NAN values
"""
# Read the raw ADNI demographics export, show its dimensions, and list the
# number of missing entries per column (fewest first).
df = pd.read_csv('../datasets/adni_data/ADNI_Demo.csv')
print(df.shape)
missing_per_column = df.isna().sum()
missing_per_column.sort_values()

(4459, 8)


ID            0
MCI           0
AD            0
Age           0
Exclude       0
Gender     4459
Height     4459
Weight     4459
dtype: int64

In [24]:
"""
Gender, Height and Weight columns removed from data frame
"""
# Gender, Height and Weight hold no values in the ADNI export (every entry
# is null), so the three columns are removed before further processing.
df = df.drop(columns=['Gender', 'Height', 'Weight'])
print(df)

           ID  MCI  AD        Age  Exclude
0       183bl    0   1  72.400000        0
1      183m12    0   1  73.396578        0
2      183m24    0   1  74.415058        0
3       241bl    1   0  81.800000        0
4      241m06    0   1  82.301027        0
...       ...  ...  ..        ...      ...
4454   5109bl    0   0  78.400000        0
4455   4959bl    0   1  77.500000        0
4456  2376m12    1   0  82.902053        0
4457   2376bl    1   0  81.900000        0
4458   5265bl    0   0  75.100000        0

[4459 rows x 5 columns]


In [25]:
"""
Drop every row flagged for exclusion (Exclude == 1)

There are no patient records to exclude
"""
# Build the exclusion mask once, report how many records it selects, then
# remove those records by index label.
is_excluded = df['Exclude'] == 1
exclude_rows = df[is_excluded]
print(f"Patients excluded: {exclude_rows.shape[0]}")

df = df.drop(index=exclude_rows.index)
df

Patients excluded: 0


Unnamed: 0,ID,MCI,AD,Age,Exclude
0,183bl,0,1,72.400000,0
1,183m12,0,1,73.396578,0
2,183m24,0,1,74.415058,0
3,241bl,1,0,81.800000,0
4,241m06,0,1,82.301027,0
...,...,...,...,...,...
4454,5109bl,0,0,78.400000,0
4455,4959bl,0,1,77.500000,0
4456,2376m12,1,0,82.902053,0
4457,2376bl,1,0,81.900000,0


In [26]:
# Count records with and without an Alzheimer's Disease diagnosis.
# Summing the boolean mask counts matching rows directly. This avoids
# `.count()[0]`, which looks up position 0 in a label-indexed Series (a
# pattern pandas has deprecated) and counts non-null IDs rather than rows.
# NOTE(review): ADNI IDs such as 183bl / 183m12 / 183m24 look like repeated
# visits of one subject, so these may be record counts rather than unique
# patients — confirm.
have_ad = int((df["AD"] == 1).sum())
print(f"Patients with AD: {have_ad}")
no_ad = int((df["AD"] == 0).sum())
print(f"Patients without AD: {no_ad}")
print(f"Total patients: {no_ad+have_ad}")

Patients with AD: 1019
Patients without AD: 3440
Total patients: 4459


In [27]:
"""
Drop the exclude column
"""
# The Exclude flag is no longer needed in the cleaned frame; remove the
# column and display the result.
df = df.drop(columns=['Exclude'])
df

Unnamed: 0,ID,MCI,AD,Age
0,183bl,0,1,72.400000
1,183m12,0,1,73.396578
2,183m24,0,1,74.415058
3,241bl,1,0,81.800000
4,241m06,0,1,82.301027
...,...,...,...,...
4454,5109bl,0,0,78.400000
4455,4959bl,0,1,77.500000
4456,2376m12,1,0,82.902053
4457,2376bl,1,0,81.900000


In [32]:
# Write the cleaned ADNI demographics to the preprocessed-data folder.
#
# `df` is reused for the Sheffield data further down the notebook, so running
# this cell out of order would silently save the wrong dataset under the ADNI
# file name. Check that the frame has the cleaned ADNI columns before writing.
expected_adni_columns = ['ID', 'MCI', 'AD', 'Age']
assert list(df.columns) == expected_adni_columns, (
    f"Refusing to write ADNI file: unexpected columns {list(df.columns)}"
)
df.to_csv('../preprocessed_data/ADNI_DEMO.csv', encoding='utf-8', index=False)

# SHEFFIELD DATA

In [28]:
"""
Check for NAN values
"""
# Read the raw Sheffield demographics export, show its dimensions, and list
# the number of missing entries per column (fewest first).
df = pd.read_csv('../datasets/sheffield_data/SHEF_Demo.csv')
print(df.shape)
missing_per_column = df.isna().sum()
missing_per_column.sort_values()

(104, 8)


ID         0
MCI        0
AD         0
Gender     0
Age        0
Exclude    0
Height     2
Weight     2
dtype: int64

In [29]:
"""
Drop every row flagged for exclusion (Exclude == 1)

Three patient records are flagged and removed
"""
# Build the exclusion mask once, report how many records it selects, then
# remove those records by index label.
is_excluded = df['Exclude'] == 1
exclude_rows = df[is_excluded]
print(f"Patients excluded: {exclude_rows.shape[0]}")

df = df.drop(index=exclude_rows.index)
df

Patients excluded: 3


Unnamed: 0,ID,MCI,AD,Gender,Age,Exclude,Height,Weight
0,SH_DARE_G1_001,0,1,1,66,0,164.0,69.0
1,SH_DARE_G1_002,0,1,1,57,0,171.0,72.0
2,SH_DARE_G1_003,0,1,0,55,0,172.0,80.0
3,SH_DARE_G1_004,0,1,0,55,0,172.0,85.0
4,SH_DARE_G1_005,0,1,0,62,0,170.0,82.0
...,...,...,...,...,...,...,...,...
99,SH_DARE_G3_024,1,0,0,64,0,172.5,85.0
100,SH_DARE_G3_025,1,0,0,51,0,177.0,97.2
101,SH_DARE_G3_026,1,0,1,77,0,161.0,61.4
102,SH_DARE_G3_027,1,0,0,62,0,165.0,63.0


In [30]:
"""
Drop the exclude column
"""
# The Exclude flag is no longer needed in the cleaned frame; remove the
# column and display the result.
df = df.drop(columns=['Exclude'])
df

Unnamed: 0,ID,MCI,AD,Gender,Age,Height,Weight
0,SH_DARE_G1_001,0,1,1,66,164.0,69.0
1,SH_DARE_G1_002,0,1,1,57,171.0,72.0
2,SH_DARE_G1_003,0,1,0,55,172.0,80.0
3,SH_DARE_G1_004,0,1,0,55,172.0,85.0
4,SH_DARE_G1_005,0,1,0,62,170.0,82.0
...,...,...,...,...,...,...,...
99,SH_DARE_G3_024,1,0,0,64,172.5,85.0
100,SH_DARE_G3_025,1,0,0,51,177.0,97.2
101,SH_DARE_G3_026,1,0,1,77,161.0,61.4
102,SH_DARE_G3_027,1,0,0,62,165.0,63.0


In [31]:
# Count records with and without an Alzheimer's Disease diagnosis.
# Summing the boolean mask counts matching rows directly. This avoids
# `.count()[0]`, which looks up position 0 in a label-indexed Series (a
# pattern pandas has deprecated) and counts non-null IDs rather than rows.
have_ad = int((df["AD"] == 1).sum())
print(f"Patients with AD: {have_ad}")
no_ad = int((df["AD"] == 0).sum())
print(f"Patients without AD: {no_ad}")
print(f"Total patients: {no_ad+have_ad}")

Patients with AD: 32
Patients without AD: 69
Total patients: 101
