# Predict MACE_dur with numerical features from outcome.csv 

In [3]:
%load_ext autoreload
%autoreload 2

import numpy as np
import sklearn
import pandas as pd
import utils

import matplotlib.pyplot as plt
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## load data

In [74]:
ahf2017_df = pd.read_excel('./AHF2017_outcome_anonymous.xls', skiprows=1)

In [140]:
valid_labels = ['BUNV2', 'CrV2', 'eGFR2', 'SodiumV2', 'potassiumV2',
                                'HgbV2', 'Alb', 'UA', 'Tbil', 'proBNP', 'PBNP', 'TNT', 'adiponectin',
                                'Troponin_T', 'nST2', 'pGal3', 'hsCRP', 'Aldosterone', 'nPRA',
                                'nAdiponectin', 'mmp9', 'TIMP1', 'LVOTmm', 'AoRmm', 'LAmm',
                                'IVSdmm', 'LVIDdmm', 'PWdmm', 'LVIDsmm', 'MVEcms', 'MVAcms',
                                'MVEdtmsec', 'EAdurmsec', 'RR2msec', 'IVRTmsec', 'PVS2cms',
                                'PVDcms', 'PVDdtmsec', 'PVAcms', 'AoVTIcm', 'v_4chEDVml',
                                'v_4chESVml', 'bpEDVml', 'bpESVml', 'RSacms', 'REacms', 'RAacms',
                                'LSSacms', 'LSEacms', 'LSAacms', 'LLSacms', 'LLEacms', 'LLAacms', 
                                'OuterDdmm', 'InnerDdmm', 'InnerDsmm', 'IMTmm', 'LVM', 'BSAV2',
                                'LVMI', 'med_Eea', 'mean_Eea', 'EA_ratio', 'RWT', 'EEa', 'age']

X_df = ahf2017_df[valid_labels]
X_df = X_df.apply(pd.to_numeric, errors='coerce') # convert strings which contain #NULL to numerics
X_df = X_df.fillna(X_df.mean()) # fill NaNs with feature's mean

y_df = ahf2017_df[['MACE', 'MACE_dur']]
y_df = y_df.apply(pd.to_numeric, errors='coerce')

# drop row if y contains nan
y_nan_indices = np.where(y_df.MACE_dur.isnull())

X_df = X_df.drop(X_df.index[y_nan_indices])
y_df = y_df.drop(y_df.index[y_nan_indices])

# convert y_df to label
for n_year in range(4):
    # y_df['label_' + str(n_year+1)] = ((1. - y_df.MACE) + ((y_df.MACE_dur - 365.*n_year)/365.).clip(0., 1.)).clip(0., 1.)
    
    y_df['label_' + str(n_year+1)] = ((1. - y_df.MACE) + (y_df.MACE_dur >= 365.*(n_year+1))).clip(0., 1.)

In [141]:
y_df[y_df.MACE == 0]

Unnamed: 0,MACE,MACE_dur,label_1,label_2,label_3,label_4
4,0,2839.0,1.0,1.0,1.0,1.0
6,0,1637.0,1.0,1.0,1.0,1.0
10,0,2099.0,1.0,1.0,1.0,1.0
12,0,2773.0,1.0,1.0,1.0,1.0
17,0,2716.0,1.0,1.0,1.0,1.0
18,0,2719.0,1.0,1.0,1.0,1.0
22,0,2562.0,1.0,1.0,1.0,1.0
25,0,1964.0,1.0,1.0,1.0,1.0
38,0,1480.0,1.0,1.0,1.0,1.0
39,0,27.0,1.0,1.0,1.0,1.0


## data split

In [142]:
from sklearn.model_selection import train_test_split

y_train, y_test = list(), list()
for n_year in range(4):
    X_train, X_test, y_train_tmp, y_test_tmp = train_test_split(X_df, y_df['label_'+str(n_year+1)], random_state=42)
    
    y_train.append(y_train_tmp)
    y_test.append(y_test_tmp)
    
# do normalize
normalizer = sklearn.preprocessing.Normalizer().fit(X_train)
X_train = normalizer.transform(X_train)
X_test = normalizer.transform(X_test)

## Train and Evaluate

In [143]:
def train_and_evaluate(models, eval_func, model_name=None):
    train_losses, test_losses = list(), list()
    
    for n_year, m in zip(range(4), models):
        m = m.fit(X_train, y_train[n_year])
        
        train_losses.append(eval_func(y_train[n_year], m.predict(X_train)))
        test_losses.append(eval_func(y_test[n_year], m.predict(X_test)))
        
    if model_name: print(model_name, 'result:')

    print('\ttraining loss:')
    for n_year in range(4):
        print('\t[{} - {}) year: {:.4f}'.format(n_year, n_year+1, train_losses[n_year]))
    print('\toverall: {:.4f}'.format(sum(train_losses)/4))
        
    print('\ttesting loss:')
    for n_year in range(4):
        print('\t[{} - {}) year: {:.4f}'.format(n_year, n_year+1, test_losses[n_year]))
    print('\toverall: {:.4f}'.format(sum(test_losses)/4))
    
    return train_losses, test_losses

In [144]:
from sklearn import tree, ensemble, linear_model, svm, naive_bayes, neighbors
from sklearn.metrics import mean_absolute_error
import copy

models = [
    ['dtr', tree.DecisionTreeRegressor()],
    ['rfr', ensemble.RandomForestRegressor()],
    ['adabr', ensemble.AdaBoostRegressor()],
    ['lgr', linear_model.LogisticRegression()],
    ['gnb', naive_bayes.GaussianNB()],
    ['knn', neighbors.KNeighborsRegressor()],
    ['svm_lr', svm.LinearSVR()]
]

In [145]:
losses = list()

for name, m in models:
    tmp_models = [copy.deepcopy(m) for _ in range(4)]
    train_and_evaluate(tmp_models, mean_absolute_error, name)

dtr result:
	training loss:
	[0 - 1) year: 0.0000
	[1 - 2) year: 0.0000
	[2 - 3) year: 0.0000
	[3 - 4) year: 0.0000
	overall: 0.0000
	testing loss:
	[0 - 1) year: 0.5645
	[1 - 2) year: 0.4516
	[2 - 3) year: 0.4677
	[3 - 4) year: 0.5161
	overall: 0.5000
rfr result:
	training loss:
	[0 - 1) year: 0.1730
	[1 - 2) year: 0.1681
	[2 - 3) year: 0.1665
	[3 - 4) year: 0.1541
	overall: 0.1654
	testing loss:
	[0 - 1) year: 0.5081
	[1 - 2) year: 0.4645
	[2 - 3) year: 0.4581
	[3 - 4) year: 0.4613
	overall: 0.4730




adabr result:
	training loss:
	[0 - 1) year: 0.2641
	[1 - 2) year: 0.2353
	[2 - 3) year: 0.2748
	[3 - 4) year: 0.2415
	overall: 0.2539
	testing loss:
	[0 - 1) year: 0.4866
	[1 - 2) year: 0.5112
	[2 - 3) year: 0.4939
	[3 - 4) year: 0.4742
	overall: 0.4915
lgr result:
	training loss:
	[0 - 1) year: 0.3730
	[1 - 2) year: 0.4162
	[2 - 3) year: 0.4162
	[3 - 4) year: 0.3784
	overall: 0.3959
	testing loss:
	[0 - 1) year: 0.4355
	[1 - 2) year: 0.3387
	[2 - 3) year: 0.3548
	[3 - 4) year: 0.3065
	overall: 0.3589
gnb result:
	training loss:
	[0 - 1) year: 0.4973
	[1 - 2) year: 0.4000
	[2 - 3) year: 0.4000
	[3 - 4) year: 0.3622
	overall: 0.4149
	testing loss:
	[0 - 1) year: 0.4839
	[1 - 2) year: 0.4032
	[2 - 3) year: 0.3710
	[3 - 4) year: 0.3226
	overall: 0.3952
knn result:
	training loss:
	[0 - 1) year: 0.3459
	[1 - 2) year: 0.3795
	[2 - 3) year: 0.3795
	[3 - 4) year: 0.3611
	overall: 0.3665
	testing loss:
	[0 - 1) year: 0.4871
	[1 - 2) year: 0.5000
	[2 - 3) year: 0.4742
	[3 - 4) year: 0.4355
	ov

