In [2]:
# Benchmark_SVM_Leaderboard.py
#
# Leaderboard submission using linear support vector machine on ADAS13 and VentVol (+ APOE as covariate).
#
# Adapted by Esther Bron, based on script by Vikram Venkatraghavan
# ============
# Date:
# 12 Nov 2017
import os
import pandas as pd
import numpy as np
import sklearn.svm as svm
from pathlib import Path

print("Load data and select features")
# str_exp='C:/Users/Esther/Documents/TADPOLE/scripts/tadpole-bigr/'

# Read in TADPOLE File

tadpoleD1D2File = Path("../data/TADPOLE_D1_D2.csv")
tadpoleLB1LB2File = Path("../data/TADPOLE_LB1_LB2.csv")  # generated from makeLeaderboardDataset.py
intermediate_features_path = Path("IntermediateData/LeaderboardBenchmarkSVMFeaturesTADPOLE.csv")
prediction_output_path = Path("TADPOLE_Submission_Leaderboard_BenchmarkSVM.csv")


def get_patient_list(lb1lb2_df):
    idx_lb2 = lb1lb2_df["LB2"] == 1
    LB2_RID = lb1lb2_df.loc[idx_lb2, "RID"]
    return pd.Series(np.unique(LB2_RID.values))

# mutates d1d2_df
def predict(d1d2_df, lb1lb2_df):
    prepare_features(d1d2_df, lb1lb2_df)


def prepare_features(d1d2_df, lb1lb2_df):
    # d1d2_df = pd.read_csv(tadpoleD1D2File)
    #import ipdb; ipdb.set_trace()
    # Create Diagnosis variable based on DXCHANGE
#     idx_mci = d1d2_df["DXCHANGE"] == 4
#     d1d2_df.loc[idx_mci, "DXCHANGE"] = 2
#     idx_ad = d1d2_df["DXCHANGE"] == 5
#     d1d2_df.loc[idx_ad, "DXCHANGE"] = 3
#     idx_ad = d1d2_df["DXCHANGE"] == 6
#     d1d2_df.loc[idx_ad, "DXCHANGE"] = 3
#     idx_cn = d1d2_df["DXCHANGE"] == 7
#     d1d2_df.loc[idx_cn, "DXCHANGE"] = 1
#     idx_mci = d1d2_df["DXCHANGE"] == 8
#     d1d2_df.loc[idx_mci, "DXCHANGE"] = 2
#     idx_cn = d1d2_df["DXCHANGE"] == 9
#     d1d2_df.loc[idx_cn, "DXCHANGE"] = 1
    d1d2_df.replace({'DXCHANGE': {4: 2, 5: 3, 6: 3, 7: 1, 8: 2, 9: 1}})['DXCHANGE']
    d1d2_df = d1d2_df.rename(columns={"DXCHANGE": "Diagnosis"})
    #h = list(d1d2_df)

    # Select Leaderboard subjects
    # lb1lb2_df = pd.read_csv(tadpoleLB1LB2File)
    #LB = lb1lb2_df["LB1"] + lb1lb2_df["LB2"]
    #idx_lb = LB.values >= 1  # [True, False, True]
    #d1d2_df = d1d2_df[idx_lb]
    
    # above replaced by:
    d1d2_df = d1d2_df[(lb1lb2_df["LB1"] == 1) | (lb1lb2_df["LB2"] == 1)]

    # Select features
    d1d2_df = d1d2_df[
        ["RID", "Diagnosis", "AGE", "ADAS13", "Ventricles", "ICV_bl"]
    ].copy()

    # Force values to numeric
    h = list(d1d2_df)
    for i in range(5, len(h)):
        # print([i])
        if d1d2_df[h[i]].dtype != "float64":
            d1d2_df[h[i]] = pd.to_numeric(d1d2_df[h[i]], errors="coerce")

    # Sort the dataframe based on age for each subject
    # is this necessary? why not:
    d1d2_df = d1d2_df.sort_values(by=['RID']) #[['RID', 'AGE']]
    # urid = np.unique(d1d2_df["RID"].values)
    #Dtadpole_sorted = pd.DataFrame(columns=h)
    #for i in range(len(urid)):
        # print([i])
    #    agei = d1d2_df.loc[d1d2_df["RID"] == urid[i], "AGE"]
    #    idx_sortedi = np.argsort(agei)
    #    D1 = d1d2_df.loc[idx_sortedi.index[idx_sortedi]]
    #    ld = [Dtadpole_sorted, D1]
    #    Dtadpole_sorted = pd.concat(ld)
    #Dtadpole_sorted = Dtadpole_sorted.drop(["AGE"], axis=1)

    # Save dataset
    # Dtadpole_sorted.to_csv(
    #     intermediate_features_path, index=False
    # )

    return d1d2_df

    # Save LB2 RIDs
    # idx_lb2 = lb1lb2_df["LB2"] == 1
    # LB2_RID = lb1lb2_df.loc[idx_lb2, "RID"]
    # SLB2 = pd.Series(np.unique(LB2_RID.values))
    # SLB2.to_csv(Path("IntermediateData/ToPredict.csv"), index=False)


def train():
    # SVM for TADPOLE
    print("Train SVM for Diagnosis and SVR for ADAS and Ventricles")
    # Read Data
    str_in = intermediate_features_path

    D = pd.read_csv(str_in)

    # Correct ventricle volume for ICV
    D["Ventricles_ICV"] = D["Ventricles"].values / D["ICV_bl"].values

    # Get Future Measurements for training prediction
    Y_FutureADAS13_temp = D["ADAS13"].copy()
    Y_FutureADAS13_temp[:] = np.nan
    Y_FutureVentricles_ICV_temp = D["Ventricles_ICV"].copy()
    Y_FutureVentricles_ICV_temp[:] = np.nan
    Y_FutureDiagnosis_temp = D["Diagnosis"].copy()
    Y_FutureDiagnosis_temp[:] = np.nan
    RID = D["RID"].copy()
    uRIDs = np.unique(RID)
    for i in range(len(uRIDs)):
        idx = RID == uRIDs[i]
        idx_copy = np.copy(idx)
        idx_copy[np.where(idx)[-1][-1]] = False
        Y_FutureADAS13_temp[idx_copy] = D.loc[idx, "ADAS13"].values[1:]
        Y_FutureVentricles_ICV_temp[idx_copy] = D.loc[idx, "Ventricles_ICV"].values[1:]
        Y_FutureDiagnosis_temp[idx_copy] = D.loc[idx, "Diagnosis"].values[1:]
    Dtrain = D.drop(["RID", "Diagnosis"], axis=1).copy()

    # Fill nans in feature matrix
    Dtrainmat = Dtrain.as_matrix()
    h = list(Dtrain)
    m = []
    s = []
    for i in range(Dtrainmat.shape[1]):
        m.append(np.nanmean(Dtrainmat[:, i]))
        s.append(np.nanstd(Dtrainmat[:, i]))
        Dtrainmat[np.isnan(Dtrainmat[:, i]), i] = m[i]
        Dtrainmat[:, i] = (Dtrainmat[:, i] - m[i]) / s[i]

    # Remove NaNs in Diagnosis
    idx_last_Diagnosis = np.isnan(Y_FutureDiagnosis_temp)
    RID_Diagnosis = RID.copy()
    Dtrainmat_Diagnosis = Dtrainmat.copy()
    Dtrainmat_Diagnosis = Dtrainmat_Diagnosis[np.logical_not(idx_last_Diagnosis), :]
    RID_Diagnosis = RID_Diagnosis[np.logical_not(idx_last_Diagnosis)]
    Y_FutureDiagnosis = Y_FutureDiagnosis_temp[np.logical_not(idx_last_Diagnosis)].copy()

    # Remove NaNs in ADAS
    idx_last_ADAS13 = np.isnan(Y_FutureADAS13_temp)
    RID_ADAS13 = RID.copy()
    Dtrainmat_ADAS13 = Dtrainmat.copy()
    Dtrainmat_ADAS13 = Dtrainmat_ADAS13[np.logical_not(idx_last_ADAS13), :]
    RID_ADAS13 = RID_ADAS13[np.logical_not(idx_last_ADAS13)]
    Y_FutureADAS13 = Y_FutureADAS13_temp[np.logical_not(idx_last_ADAS13)].copy()

    # Normalise ADAS
    m_FutureADAS13 = np.nanmean(Y_FutureADAS13)
    s_FutureADAS13 = np.nanstd(Y_FutureADAS13)
    Y_FutureADAS13_norm = (Y_FutureADAS13 - m_FutureADAS13) / s_FutureADAS13

    # Remove NaNs in Ventricles
    idx_last_Ventricles_ICV = np.isnan(Y_FutureVentricles_ICV_temp)
    RID_Ventricles_ICV = RID.copy()
    Dtrainmat_Ventricles_ICV = Dtrainmat.copy()
    Dtrainmat_Ventricles_ICV = Dtrainmat_Ventricles_ICV[
                               np.logical_not(idx_last_Ventricles_ICV), :
                               ]
    RID_Ventricles_ICV = RID_Ventricles_ICV[np.logical_not(idx_last_Ventricles_ICV)]
    Y_FutureVentricles_ICV = Y_FutureVentricles_ICV_temp[
        np.logical_not(idx_last_Ventricles_ICV)
    ].copy()

    # Normalise Ventricle values
    m_FutureVentricles_ICV = np.nanmean(Y_FutureVentricles_ICV)
    s_FutureVentricles_ICV = np.nanstd(Y_FutureVentricles_ICV)
    Y_FutureVentricles_ICV_norm = (
                                          Y_FutureVentricles_ICV - m_FutureVentricles_ICV
                                  ) / s_FutureVentricles_ICV

    # Train SVM for diagnosis
    clf = svm.SVC(kernel="linear", probability=True)
    clf.fit(Dtrainmat_Diagnosis, Y_FutureDiagnosis)

    # Train SVR for ADAS
    reg_ADAS13 = svm.SVR(kernel="linear")
    reg_ADAS13.fit(Dtrainmat_ADAS13, Y_FutureADAS13_norm)

    # Train SVR for Ventricles
    reg_Ventricles_ICV = svm.SVR(kernel="linear")
    reg_Ventricles_ICV.fit(Dtrainmat_Ventricles_ICV, Y_FutureVentricles_ICV_norm)

    print("Create test set and do predictions")
    ## Create TestSet
    S = pd.read_csv(Path("IntermediateData/ToPredict.csv"), header=None)
    S = S.values

    Dtestmat = np.zeros((len(S), Dtrainmat.shape[1]))
    for i in range(len(S)):
        idx_S = RID.values == S[i]
        Dtestmat[i, :] = Dtrainmat[np.where(idx_S)[0][-1], :]

    # Test SVM for Diagnosis
    p = clf.predict_proba(Dtestmat)

    # Some defaults for confidence intervals
    CI50_Ventricles_ICV = 0.05
    CI50_ADAS13 = 1

    # Test SVR for ADAS
    y_ADAS13_norm = reg_ADAS13.predict(Dtestmat)
    y_ADAS13 = y_ADAS13_norm * s_FutureADAS13 + m_FutureADAS13
    y_ADAS13_lower = y_ADAS13 - CI50_ADAS13
    y_ADAS13_lower[y_ADAS13_lower < 0] = 0
    y_ADAS13_upper = y_ADAS13 + CI50_ADAS13

    # Test SVR for Ventricles
    y_Ventricles_ICV_norm = reg_Ventricles_ICV.predict(Dtestmat)
    y_Ventricles_ICV = (
            y_Ventricles_ICV_norm * s_FutureVentricles_ICV + m_FutureVentricles_ICV
    )
    y_Ventricles_ICV_lower = y_Ventricles_ICV - CI50_Ventricles_ICV
    y_Ventricles_ICV_lower[y_Ventricles_ICV_lower < 0] = 0
    y_Ventricles_ICV_upper = y_Ventricles_ICV + CI50_Ventricles_ICV

    # Write ouput format to files
    o = np.column_stack(
        (
            S,
            S,
            S,
            p,
            y_ADAS13,
            y_ADAS13_lower,
            y_ADAS13_upper,
            y_Ventricles_ICV,
            y_Ventricles_ICV_lower,
            y_Ventricles_ICV_upper,
        )
    )
    count = 0
    years = [str(a) for a in range(2010, 2018)]
    months = [str(a).zfill(2) for a in range(1, 13)]
    ym = [y + "-" + mo for y in years for mo in months]
    ym = ym[4:-8]
    nr_pred = len(ym)
    o1 = np.zeros((o.shape[0] * nr_pred, o.shape[1]))
    ym1 = [a for b in range(0, len(S)) for a in ym]
    for i in range(len(o)):
        o1[count: count + nr_pred] = o[i]
        o1[count: count + nr_pred, 1] = range(1, nr_pred + 1)
        count = count + nr_pred

    output = pd.DataFrame(
        o1,
        columns=[
            "RID",
            "Forecast Month",
            "Forecast Date",
            "CN relative probability",
            "MCI relative probability",
            "AD relative probability",
            "ADAS13",
            "ADAS13 50% CI lower",
            "ADAS13 50% CI upper",
            "Ventricles_ICV",
            "Ventricles_ICV 50% CI lower",
            "Ventricles_ICV 50% CI upper",
        ],
    )
    output["Forecast Month"] = output["Forecast Month"].astype(int)
    output["Forecast Date"] = ym1

    output.to_csv(
        prediction_output_path, header=True, index=False
    )

Load data and select features


In [7]:
df = prepare_features()

  if (yield from self.run_code(code, result)):


Unnamed: 0,RID,Diagnosis,ADAS13,Ventricles,ICV_bl
22,8,1.0,7.0,18757.0,1396070.0
23,8,1.0,10.67,18261.0,1396070.0
5741,8,1.0,5.0,,1396070.0
5742,8,1.0,9.33,,1396070.0
5743,8,1.0,17.0,,1396070.0


In [10]:
lb1lb2_df = pd.read_csv(Path("../data/TADPOLE_LB1_LB2.csv"))
d1d1_df = pd.read_csv(Path("../data/TADPOLE_D1_D2.csv"))
df = prepare_features(d1d1_df, lb1lb2_df)

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
#df.fillna('')

Unnamed: 0,RID,Diagnosis,AGE,ADAS13,Ventricles,ICV_bl
0,2,1,74.3,18.67,118233,1.98466e+06
11959,2,1,74.3,19,,1.98466e+06
5723,2,1,74.3,19.67,,1.98466e+06
5724,2,1,74.3,20,,1.98466e+06
5725,2,1,74.3,23,,1.98466e+06
5726,2,,74.3,,,1.98466e+06
5727,2,1,74.3,21,,1.98466e+06
5728,2,,74.3,,,1.98466e+06
5729,2,4,74.3,14,,1.98466e+06
5730,2,,74.3,,,1.98466e+06


In [None]:
d1d2_df = pd.read_csv(Path("../data/TADPOLE_D1_D2.csv"))

In [91]:


#d1d2_df.replace({'DXCHANGE': {4: 2, 5: 3, 6: 3, 7: 1, 8: 2, 9: 1}})['DXCHANGE']
d1d2_df['DXCHANGE']
#set(list(zip(d1d2_df['DX'], d1d2_df['DXCHANGE'])))

0        1.0
1        3.0
2        3.0
3        3.0
4        3.0
5        2.0
6        2.0
7        2.0
8        2.0
9        2.0
10       1.0
11       1.0
12       1.0
13       1.0
14       1.0
15       2.0
16       2.0
17       2.0
18       2.0
19       2.0
20       3.0
21       3.0
22       1.0
23       1.0
24       3.0
25       3.0
26       3.0
27       3.0
28       1.0
29       1.0
        ... 
12711    5.0
12712    2.0
12713    2.0
12714    NaN
12715    NaN
12716    2.0
12717    2.0
12718    NaN
12719    5.0
12720    NaN
12721    NaN
12722    1.0
12723    2.0
12724    2.0
12725    8.0
12726    1.0
12727    4.0
12728    5.0
12729    NaN
12730    2.0
12731    1.0
12732    NaN
12733    2.0
12734    1.0
12735    NaN
12736    NaN
12737    NaN
12738    NaN
12739    NaN
12740    NaN
Name: DXCHANGE, Length: 12741, dtype: float64

AttributeError: 'dict' object has no attribute 'reset_index'

In [36]:
d1d2_df.sort_values(by=['AGE', 'RID'])[['RID', 'AGE']]


Unnamed: 0,RID,AGE
1062,445,54.4
3557,4114,55.0
3558,4114,55.0
3559,4114,55.0
5089,4114,55.0
9577,4114,55.0
9578,4114,55.0
11312,4114,55.0
12071,4114,55.0
12643,4114,55.0


In [38]:
idx_mci = d1d2_df["DXCHANGE"] == 4
d1d2_df.loc[idx_mci, "DXCHANGE"] = 2
idx_ad = d1d2_df["DXCHANGE"] == 5
d1d2_df.loc[idx_ad, "DXCHANGE"] = 3
idx_ad = d1d2_df["DXCHANGE"] == 6
d1d2_df.loc[idx_ad, "DXCHANGE"] = 3
idx_cn = d1d2_df["DXCHANGE"] == 7
d1d2_df.loc[idx_cn, "DXCHANGE"] = 1
idx_mci = d1d2_df["DXCHANGE"] == 8
d1d2_df.loc[idx_mci, "DXCHANGE"] = 2
idx_cn = d1d2_df["DXCHANGE"] == 9
d1d2_df.loc[idx_cn, "DXCHANGE"] = 1
d1d2_df = d1d2_df.rename(columns={"DXCHANGE": "Diagnosis"})
h = list(d1d2_df)

# Select Leaderboard subjects
# lb1lb2_df = pd.read_csv(tadpoleLB1LB2File)
LB = lb1lb2_df["LB1"] + lb1lb2_df["LB2"]
idx_lb = LB.values >= 1
d1d2_df = d1d2_df[idx_lb]

# Select features
d1d2_df = d1d2_df[
    ["RID", "Diagnosis", "AGE", "ADAS13", "Ventricles", "ICV_bl"]
].copy()
h = list(d1d2_df)
for i in range(5, len(h)):
    # print([i])
    if d1d2_df[h[i]].dtype != "float64":
        d1d2_df[h[i]] = pd.to_numeric(d1d2_df[h[i]], errors="coerce")

# Sort the dataframe based on age for each subject
urid = np.unique(d1d2_df["RID"].values)
Dtadpole_sorted = pd.DataFrame(columns=h)
for i in range(len(urid)):
    # print([i])
    agei = d1d2_df.loc[d1d2_df["RID"] == urid[i], "AGE"]
    idx_sortedi = np.argsort(agei)
    D1 = d1d2_df.loc[idx_sortedi.index[idx_sortedi]]
    ld = [Dtadpole_sorted, D1]
    Dtadpole_sorted = pd.concat(ld)
Dtadpole_sorted = Dtadpole_sorted.drop(["AGE"], axis=1)

In [39]:
Dtadpole_sorted

Unnamed: 0,RID,Diagnosis,ADAS13,Ventricles,ICV_bl
0,2,1.0,18.67,118233.0,1984660.0
5723,2,1.0,19.67,,1984660.0
5724,2,1.0,20.00,,1984660.0
5725,2,1.0,23.00,,1984660.0
5726,2,,,,1984660.0
5727,2,1.0,21.00,,1984660.0
5728,2,,,,1984660.0
5729,2,2.0,14.00,,1984660.0
5730,2,,,,1984660.0
5731,2,1.0,18.00,,1984660.0


In [40]:
np

<module 'numpy' from '/home/tom/miniconda3/lib/python3.7/site-packages/numpy/__init__.py'>

In [106]:
a = np.array([1,3,2])

In [109]:
a[[True, False, True]]

array([1, 2])