# PyCon UK: Alzheimer's Disease Challenge Hackathon
### LB1: TADPOLE Standard training set.

This training dataset contains medical data including:
* MRI scans
* PET scans
* DTI scans
* Cognitive assessment data
* Demographic data
* Genetic data
* CSF data

For more details on this dataset, including a full data dictionary, please see https://github.com/swhustla/pycon2017-alzheimers-hack/tree/master/docs

### LB2: TADPOLE Standard prediction set.

`LB1` subjects and `LB2` subjects together form the training dataset; however, the `LB2` subjects are the ones for which predictions should be made in the final submission.

See the [GitHub README](https://github.com/swhustla/pycon2017-alzheimers-hack/blob/master/README.md) for more information and explanations of the data sources.

In [None]:
from dateutil import rrule
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
from sklearn import model_selection, preprocessing, pipeline

# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline
# None removes the cap on how many columns pandas prints, so wide frames
# are displayed without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Read the combined LB1/LB2 TADPOLE table from the data directory that sits
# next to this notebook's folder.
DATA_DIR = Path('../data')
# low_memory=False makes pandas infer each column's dtype from the whole
# file instead of chunk by chunk, which avoids mixed-type columns.
tadpole = pd.read_csv(
    DATA_DIR.joinpath('TADPOLE_LB1_LB2.csv'),
    low_memory=False,
)
# Preview the first rows.
tadpole.head()

In [None]:
# There are >1900 features...
# Display the full column index of the table.
tadpole.columns

In [None]:
# Peek at just the first ten column names.
tadpole.columns[:10]

In [None]:
# Named groups of TADPOLE columns, used to select targets and features.

# Targets to predict.
outcomes = ["ADAS13", "DX", "Ventricles"]
# Cognitive test scores.
cog_tests_attributes = ["CDRSB", "ADAS11", "MMSE", "RAVLT_immediate"]
# MRI-derived measures. "FDG" and "AV45" used to be listed here as well,
# but they are PET measures and already live in `pet_measures`; keeping them
# in both groups produced duplicate column names whenever the groups were
# concatenated to select features.
mri_measures = ['Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp']
# PET measures.
pet_measures = ["FDG", "AV45"]
# CSF biomarker columns.
csf_measures = [
    "ABETA_UPENNBIOMK9_04_19_17",
    "TAU_UPENNBIOMK9_04_19_17",
    "PTAU_UPENNBIOMK9_04_19_17",
]
# Risk factors.
risk_factors = ["APOE4", "AGE"]

In [None]:
# Add age at exam
# AGE_AT_EXAM = the subject's smallest recorded AGE plus the time elapsed, in
# years, between the subject's earliest EXAMDATE and this row's EXAMDATE.
tadpole["EXAMDATE"] = pd.to_datetime(tadpole["EXAMDATE"])

# Keep each subject's visits in chronological order; later cells rely on row
# order to find a subject's most recent values.
tadpole.sort_values(by=["RID", "EXAMDATE"], inplace=True)

# transform() returns per-subject minima aligned on tadpole's own index, so
# every row receives its own age regardless of how rows were ordered in the
# CSV. The previous groupby().apply() + sort_index() + `.values` version
# matched values to rows purely by position: the values were ordered by
# (RID, original row index) while the frame was ordered by (RID, EXAMDATE),
# so ages landed on the wrong rows whenever a subject's rows were not
# already in date order in the file.
_visits_by_rid = tadpole.groupby("RID")
_years_since_first_exam = (
    tadpole["EXAMDATE"] - _visits_by_rid["EXAMDATE"].transform("min")
).dt.days / 365.25

# Name kept from the original cell; it is now a Series indexed like
# `tadpole` rather than by (RID, row).
tadpole_grouped = _years_since_first_exam + _visits_by_rid["AGE"].transform("min")
tadpole["AGE_AT_EXAM"] = tadpole_grouped

# Whole years (truncated), used to bucket rows by age in the plots below.
# int() raises on NaN, so this assumes every row has an AGE and an EXAMDATE.
tadpole['AGE_INT'] = tadpole['AGE_AT_EXAM'].apply(int)

In [None]:
# Number of recorded (non-missing) ADAS13 scores at each integer age.
(
    tadpole.loc[tadpole['ADAS13'].notna()]
    .groupby('AGE_INT')['ADAS13']
    .count()
    .plot()
)

In [None]:
# Average ADAS13 score at each integer age, over rows that have a score.
(
    tadpole.loc[tadpole['ADAS13'].notna()]
    .groupby('AGE_INT')['ADAS13']
    .mean()
    .plot()
)

In [None]:
# ADAS13 against age at exam for one example subject (RID 259).
tadpole.loc[tadpole["RID"] == 259].plot.scatter(x="AGE_AT_EXAM", y="ADAS13")
plt.show()

# Attempt #1: Jitter the CSV!

In [None]:
# Load the dummy submission file; the cells below use it as the table to
# fill in and save.
submission = pd.read_csv(DATA_DIR.joinpath('TADPOLE_Submission_Pycon_Dummy.csv'))

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Every column after the first three.
# NOTE(review): presumably the first three columns identify the row (subject /
# forecast date) and the rest hold predictions — confirm against the file.
pred_cols = submission.columns[3:]

In [None]:
# Write the submission without the pandas index column.
# NOTE(review): no cell between loading the dummy file and this save modifies
# `submission`, so as written this saves an unchanged copy of the dummy —
# the "jitter" step itself is not implemented here.
submission.to_csv(DATA_DIR / 'TADPOLE_Submission_Pycon_NAME.csv', index=False)

In [None]:
# Display the submission's column names.
submission.columns

# Attempt #2: Last seen non-missing value

In [None]:
# Collapse the DX labels into the three classes CN / MCI / AD. Transition
# labels of the form "A to B" are assigned the class of B.
_dx_labels_by_class = {
    'CN': ('NL', 'MCI to NL'),
    'MCI': ('MCI', 'NL to MCI', 'Dementia to MCI'),
    'AD': ('Dementia', 'MCI to Dementia', 'NL to Dementia'),
}
dx_map = {
    dx_label: dx_class
    for dx_class, dx_labels in _dx_labels_by_class.items()
    for dx_label in dx_labels
}
# DX values that are missing or not listed in dx_map become NaN.
tadpole['diagnosis'] = tadpole['DX'].map(dx_map)

In [None]:
# Distinct subject IDs that appear in the submission table.
rids = submission['RID'].unique()

In [None]:
# Forecast = each subject's most recent observed value, carried forward.
#
# GroupBy.last() returns, for every column, the last non-missing entry within
# each group. That replaces the per-subject loop, which rescanned `tadpole`
# once and rebuilt the `submission` row mask nine times for every subject.
# "Last" means last in row order, so this relies on `tadpole` already being
# sorted by RID and EXAMDATE (done when AGE_AT_EXAM was added).
last_seen = (
    tadpole[tadpole['RID'].isin(rids)]
    .groupby('RID')[['diagnosis', 'ADAS13', 'Ventricles']]
    .last()
    .reindex(rids)  # one row per submission subject, all-NaN if never observed
)

# The loop used to die with an opaque "indexer is out-of-bounds" IndexError on
# the first subject lacking a value; report all such subjects instead.
rids_without_history = last_seen.index[last_seen.isna().any(axis=1)]
if len(rids_without_history) > 0:
    raise ValueError(
        "No non-missing diagnosis/ADAS13/Ventricles value for RID(s): "
        + ", ".join(str(r) for r in rids_without_history)
    )

# Broadcast the per-subject values onto every submission row of that subject.
last_diagnosis = submission['RID'].map(last_seen['diagnosis'])
last_adas13 = submission['RID'].map(last_seen['ADAS13'])
last_ventricles = submission['RID'].map(last_seen['Ventricles'])

# All probability mass goes to the last diagnosed class (1 for it, 0 otherwise).
submission['CN relative probability'] = (last_diagnosis == 'CN').astype(int)
submission['MCI relative probability'] = (last_diagnosis == 'MCI').astype(int)
submission['AD relative probability'] = (last_diagnosis == 'AD').astype(int)

# Point forecast with a fixed +/-10 interval.
submission['ADAS13'] = last_adas13
submission['ADAS13 50% CI lower'] = last_adas13 - 10
submission['ADAS13 50% CI upper'] = last_adas13 + 10

# Point forecast with a fixed +/-0.5 interval.
# NOTE(review): the column is called Ventricles_ICV but receives the raw
# `Ventricles` value — confirm whether it should be divided by intracranial
# volume first, and whether +/-0.5 is on the intended scale.
submission['Ventricles_ICV'] = last_ventricles
submission['Ventricles_ICV 50% CI lower'] = last_ventricles - .5
submission['Ventricles_ICV 50% CI upper'] = last_ventricles + .5

In [None]:
# Inspect the first rows of the submission after filling in the predictions.
submission.head()

In [None]:
# Save the last-seen-value predictions without the pandas index column.
# This is the same path the Attempt #1 cell writes to, so that file is
# overwritten.
submission.to_csv(DATA_DIR / 'TADPOLE_Submission_Pycon_NAME.csv', index=False)

# Create X, y datasets

In [None]:
# Feature columns, grouped by data source. Each name must appear only once:
# selecting a DataFrame with a repeated label returns that column repeatedly,
# and "FDG" / "AV45" used to be listed under both the MRI and the PET groups,
# which duplicated both features in X.
X_cols = [
    # cog_tests_attributes
    "CDRSB", "ADAS11", "MMSE", "RAVLT_immediate",
    # mri_measures
    'Hippocampus', 'WholeBrain', 'Entorhinal', 'MidTemp',
    # pet_measures
    "FDG", "AV45",
    # csf_measures
    "ABETA_UPENNBIOMK9_04_19_17", "TAU_UPENNBIOMK9_04_19_17", "PTAU_UPENNBIOMK9_04_19_17",
    # risk_factors
    "APOE4", "AGE",
    # age
    'AGE_AT_EXAM',
]

# Feature matrix plus one target per prediction task; all four share
# tadpole's row index.
X = tadpole[X_cols]
y_diag = tadpole['diagnosis']   # class label derived from DX (CN / MCI / AD)
y_adas = tadpole['ADAS13']
y_vent = tadpole['Ventricles']