# Predict MACE_dur with numerical features from outcome.csv 

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import sklearn
import pandas as pd
import utils

import matplotlib.pyplot as plt
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## load data

In [74]:
ahf2017_df = pd.read_excel('./AHF2017_outcome_anonymous.xls', skiprows=1)

In [111]:
valid_labels = ['BUNV2', 'CrV2', 'eGFR2', 'SodiumV2', 'potassiumV2',
                                'HgbV2', 'Alb', 'UA', 'Tbil', 'proBNP', 'PBNP', 'TNT', 'adiponectin',
                                'Troponin_T', 'nST2', 'pGal3', 'hsCRP', 'Aldosterone', 'nPRA',
                                'nAdiponectin', 'mmp9', 'TIMP1', 'LVOTmm', 'AoRmm', 'LAmm',
                                'IVSdmm', 'LVIDdmm', 'PWdmm', 'LVIDsmm', 'MVEcms', 'MVAcms',
                                'MVEdtmsec', 'EAdurmsec', 'RR2msec', 'IVRTmsec', 'PVS2cms',
                                'PVDcms', 'PVDdtmsec', 'PVAcms', 'AoVTIcm', 'v_4chEDVml',
                                'v_4chESVml', 'bpEDVml', 'bpESVml', 'RSacms', 'REacms', 'RAacms',
                                'LSSacms', 'LSEacms', 'LSAacms', 'LLSacms', 'LLEacms', 'LLAacms', 
                                'OuterDdmm', 'InnerDdmm', 'InnerDsmm', 'IMTmm', 'LVM', 'BSAV2',
                                'LVMI', 'med_Eea', 'mean_Eea', 'EA_ratio', 'RWT', 'EEa', 'age']

X_df = ahf2017_df[valid_labels]
X_df = X_df.apply(pd.to_numeric, errors='coerce') # convert strings which contain #NULL to numerics
X_df = X_df.fillna(X_df.mean()) # fill NaNs with feature's mean

y_df = ahf2017_df[['MACE', 'MACE_dur']]
y_df = y_df.apply(pd.to_numeric, errors='coerce')

# drop row if y contains nan
y_nan_indices = np.where(y_df.MACE_dur.isnull())

X_df = X_df.drop(X_df.index[y_nan_indices])
y_df = y_df.drop(y_df.index[y_nan_indices])

# convert y_df to label
for n_year in range(4):
    # y_df['label_' + str(n_year+1)] = ((1. - y_df.MACE) + ((y_df.MACE_dur - 365.*n_year)/365.).clip(0., 1.)).clip(0., 1.)
    
    y_df['label_' + str(n_year+1)] = ((1. - y_df.MACE) + (y_df.MACE_dur >= 365.*(n_year+1))).clip(0., 1.)

In [112]:
y_df[y_df.MACE == 0]

Unnamed: 0,MACE,MACE_dur,label_1,label_2,label_3,label_4
4,0,2839.0,1.0,1.0,1.0,1.0
6,0,1637.0,1.0,1.0,1.0,1.0
10,0,2099.0,1.0,1.0,1.0,1.0
12,0,2773.0,1.0,1.0,1.0,1.0
17,0,2716.0,1.0,1.0,1.0,1.0
18,0,2719.0,1.0,1.0,1.0,1.0
22,0,2562.0,1.0,1.0,1.0,1.0
25,0,1964.0,1.0,1.0,1.0,1.0
38,0,1480.0,1.0,1.0,1.0,1.0
39,0,27.0,1.0,1.0,1.0,1.0


## data split

In [113]:
from sklearn.model_selection import train_test_split

y_train, y_test = list(), list()
for n_year in range(4):
    X_train, X_test, y_train_tmp, y_test_tmp = train_test_split(X_df, y_df['label_'+str(n_year+1)], random_state=42)
    
    y_train.append(y_train_tmp)
    y_test.append(y_test_tmp)

## Train and Evaluate

In [114]:
def train_and_evaluate(models, eval_func, model_name=None):
    train_losses, test_losses = list(), list()
    
    for n_year, m in zip(range(4), models):
        m = m.fit(X_train, y_train[n_year])
        
        train_losses.append(eval_func(y_train[n_year], m.predict(X_train)))
        test_losses.append(eval_func(y_test[n_year], m.predict(X_test)))
        
    if model_name: print(model_name, 'result:')

    print('\ttraining loss:')
    for n_year in range(4):
        print('\t[{} - {}) year: {:.4f}'.format(n_year, n_year+1, train_losses[n_year]))
        
    print('\ttesting loss:')
    for n_year in range(4):
        print('\t[{} - {}) year: {:.4f}'.format(n_year, n_year+1, test_losses[n_year]))
    return train_losses, test_losses

In [115]:
from sklearn import tree, ensemble, linear_model, svm
from sklearn.metrics import mean_absolute_error
import copy

models = [
    ['dtr', tree.DecisionTreeClassifier()],
    ['rf', ensemble.RandomForestClassifier()],
    ['adab', ensemble.AdaBoostClassifier()],
    ['lgr', linear_model.LogisticRegression()],
    ['svm_lc', svm.LinearSVC()]
]

In [116]:
losses = list()

for name, m in models:
    tmp_models = [copy.deepcopy(m) for _ in range(4)]
    train_and_evaluate(tmp_models, mean_absolute_error, name)

dtr result:
	training loss:
	[0 - 1) year: 0.0000
	[1 - 2) year: 0.0000
	[2 - 3) year: 0.0000
	[3 - 4) year: 0.0000
	testing loss:
	[0 - 1) year: 0.5161
	[1 - 2) year: 0.4032
	[2 - 3) year: 0.5323
	[3 - 4) year: 0.5161
rf result:
	training loss:
	[0 - 1) year: 0.0108
	[1 - 2) year: 0.0000
	[2 - 3) year: 0.0216
	[3 - 4) year: 0.0432
	testing loss:
	[0 - 1) year: 0.4194
	[1 - 2) year: 0.3871
	[2 - 3) year: 0.3226
	[3 - 4) year: 0.3871




adab result:
	training loss:
	[0 - 1) year: 0.0000
	[1 - 2) year: 0.0000
	[2 - 3) year: 0.0000
	[3 - 4) year: 0.0000
	testing loss:
	[0 - 1) year: 0.4355
	[1 - 2) year: 0.5000
	[2 - 3) year: 0.5484
	[3 - 4) year: 0.5161
lgr result:
	training loss:
	[0 - 1) year: 0.1622
	[1 - 2) year: 0.1622
	[2 - 3) year: 0.1838
	[3 - 4) year: 0.1676
	testing loss:
	[0 - 1) year: 0.4355
	[1 - 2) year: 0.4355
	[2 - 3) year: 0.4032
	[3 - 4) year: 0.4355




svm_lc result:
	training loss:
	[0 - 1) year: 0.4162
	[1 - 2) year: 0.4108
	[2 - 3) year: 0.3676
	[3 - 4) year: 0.3351
	testing loss:
	[0 - 1) year: 0.4839
	[1 - 2) year: 0.4516
	[2 - 3) year: 0.4032
	[3 - 4) year: 0.3065


