# Model6_final

The purpose of this notebook / model experiment is to explore additional approaches to improve model performance

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
# Load the table from CSV, using the PATIENT_ID column as the row index.
df_ts_agg = pd.read_csv("../proData/df_ts_agg.csv", index_col="PATIENT_ID")

df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

With `remove_multicollinearity` set to `True`, the performance of some linear kernel-based models is expected to improve.

In [3]:
# Configure the PyCaret classification experiment on `df_ts_agg` with `ihd`
# as the target column. Options that were tried and switched off are kept as
# comments so they can be re-enabled easily.
exp_physionet = setup(
    data=df_ts_agg,
    target="ihd",
    session_id=123,
    categorical_features=["GCS_min", "GCS_max"],
    # bin_numeric_features=["GCS_min", "GCS_max", "GCS_med"],
    normalize=True,
    transformation=True,
    # ignore_low_variance=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    fix_imbalance=True,
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,108
6,Categorical Features,7
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

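Models are compared by min(Recall, Precision), the score suggested by the original PhysioNet 2012 challenge. Note that `compare_models` is called without a `sort` argument, so the table below is ordered by Accuracy; min(Recall, Precision) has to be read off the Recall and Prec. columns for each model.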

In [None]:
# Cross-validate the candidate classifiers and show one comparison table.
# NOTE(review): turbo=False presumably also includes the slower estimators
# (e.g. the radial-kernel SVM seen in the output) — confirm against the
# installed PyCaret version.
compare_models(turbo = False)

IntProgress(value=0, description='Processing: ', max=206)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,SVM - Radial Kernel,0.8149,0.7767,0.4249,0.3609,0.3895,0.2816,0.2833,8.5899
1,Decision Tree Classifier,0.7964,0.6051,0.3404,0.2959,0.3146,0.1964,0.198,0.2323
2,Logistic Regression,0.7678,0.8024,0.7036,0.3379,0.4562,0.331,0.3674,0.3072
3,SVM - Linear Kernel,0.716,0.0,0.6598,0.2792,0.3919,0.2448,0.2836,0.1123
4,Naive Bayes,0.682,0.7322,0.6733,0.2648,0.3742,0.2197,0.2641,0.0102
5,K Neighbors Classifier,0.6277,0.7039,0.7349,0.234,0.3543,0.1824,0.2416,0.0755


## create, tune, evaluate, predict adaboost model

In [9]:
# Train the estimator with PyCaret ID "ada" (AdaBoost, per the section
# header); the per-fold scores are displayed as the cell output.
ada = create_model("ada")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8643,0.8318,0.6053,0.5,0.5476,0.4686,0.4716
1,0.825,0.7943,0.4615,0.3913,0.4235,0.3212,0.3227
2,0.8286,0.7701,0.4103,0.3902,0.4,0.3001,0.3002
3,0.8429,0.799,0.4615,0.439,0.45,0.3584,0.3586
4,0.8143,0.7766,0.4359,0.3617,0.3953,0.2868,0.2885
5,0.8286,0.7754,0.3333,0.3714,0.3514,0.2529,0.2534
6,0.8357,0.7199,0.4103,0.4103,0.4103,0.3148,0.3148
7,0.8393,0.7727,0.359,0.4118,0.3836,0.2917,0.2926
8,0.8143,0.7371,0.359,0.3415,0.35,0.2417,0.2418
9,0.853,0.8463,0.5526,0.4667,0.506,0.4204,0.4225


In [32]:
# Hyperparameter-tune `ada`, optimizing for Accuracy.
# NOTE(review): the notebook names min(Recall, Precision) as its comparison
# metric — confirm that Accuracy is the intended tuning objective.
# NOTE(review): choose_better=True presumably returns the base model when
# tuning does not improve on it — confirm against the PyCaret docs.
tuned_ada = tune_model(ada, optimize = "Accuracy", choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8571,0.8184,0.6053,0.4792,0.5349,0.4518,0.4561
1,0.8429,0.8026,0.5641,0.449,0.5,0.4082,0.4119
2,0.8036,0.7871,0.4615,0.3462,0.3956,0.2812,0.2853
3,0.8607,0.847,0.5385,0.5,0.5185,0.4372,0.4376
4,0.8143,0.79,0.4359,0.3617,0.3953,0.2868,0.2885
5,0.7964,0.804,0.359,0.3043,0.3294,0.2104,0.2114
6,0.8214,0.7786,0.3846,0.3659,0.375,0.2709,0.271
7,0.8429,0.793,0.4359,0.4359,0.4359,0.3446,0.3446
8,0.825,0.7697,0.359,0.3684,0.3636,0.2622,0.2622
9,0.8459,0.8322,0.4737,0.439,0.4557,0.3661,0.3664


In [17]:
# Open the interactive plot-type selector widget for the tuned model.
evaluate_model(tuned_ada)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [None]:
# Score the tuned model; no `data` argument is passed.
# NOTE(review): presumably this evaluates on the hold-out split created by
# setup() — confirm.
predict_model(tuned_ada)

## explore blend models, both hard and soft voting

In [None]:
# Build a blended (voting) model with default arguments.
# NOTE(review): the variable name and section header imply hard voting is the
# default `method` — confirm for the installed PyCaret version.
blend_hard = blend_models()

In [None]:
# Interactive evaluation plots for the blend stored in `blend_hard`.
evaluate_model(blend_hard)

In [None]:
# Score the blend stored in `blend_hard`; no `data` argument is passed.
# NOTE(review): presumably evaluated on the hold-out split — confirm.
predict_model(blend_hard)

In [None]:
# Soft-voting blend. It is stored under its own name so that the hard-voting
# model in `blend_hard` is not overwritten and the cells that evaluate and
# predict with `blend_soft` have a defined variable to work with.
blend_soft = blend_models(method = "soft")

In [None]:
# Interactive evaluation plots for the model bound to `blend_soft`.
evaluate_model(blend_soft)

In [None]:
# Score the model bound to `blend_soft`; no `data` argument is passed.
# NOTE(review): presumably evaluated on the hold-out split — confirm.
predict_model(blend_soft)