# Model5_improve

The purpose of this notebook / model experiment is to improve on the earlier fix_imbalance model from model2_fix_imbalance.ipynb by removing multicollinearity and standardizing the variables (normalization + transformation).

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
# Read the CSV and key it by PATIENT_ID in one chained expression.
df_ts_agg = pd.read_csv("../proData/df_ts_agg.csv").set_index("PATIENT_ID")

# Keep the frame as the cell's last expression so the notebook renders it.
df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

Enable all previously tested improvement parameters:
* fix_imbalance
* remove_multicollinearity
* normalize and transformation

In [3]:
# Configure the pycaret experiment on df_ts_agg with 'ihd' as the target.
# ignore_low_variance stays disabled (the original left it commented out).
exp_physionet = setup(
    data=df_ts_agg,
    target='ihd',
    session_id=123,
    # The two GCS columns are declared numeric explicitly.
    numeric_features=["GCS_min", "GCS_max"],
    normalize=True,
    transformation=True,
    remove_multicollinearity=True,
    multicollinearity_threshold=0.9,
    fix_imbalance=True,
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,110
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

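min(Recall, Precision) is the metric used to compare models, as suggested by the original PhysioNet 2012 challenge. Note that the table below is sorted by Accuracy, so min(Recall, Precision) has to be read off row by row.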

In [41]:
# Run pycaret's model comparison with turbo switched off.
# NOTE(review): the resulting table is ordered by Accuracy, while the notebook's
# stated metric is min(Recall, Precision) — confirm whether another sort is wanted.
compare_models(turbo=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.8682,0.8343,0.3223,0.5449,0.4027,0.3344,0.3501,18.9342
1,Extra Trees Classifier,0.8678,0.8241,0.2706,0.5542,0.3613,0.2973,0.3223,0.2817
2,Extreme Gradient Boosting,0.8667,0.8328,0.3094,0.5369,0.3894,0.3212,0.338,3.1824
3,Light Gradient Boosting Machine,0.8653,0.8371,0.3225,0.5248,0.3978,0.3271,0.3402,1.8273
4,Gradient Boosting Classifier,0.8496,0.8238,0.3713,0.4517,0.4048,0.3202,0.3236,5.8641
5,Random Forest Classifier,0.8421,0.7671,0.2785,0.4093,0.3299,0.244,0.2507,0.166
6,MLP Classifier,0.8357,0.7477,0.3737,0.4083,0.3866,0.2926,0.2949,7.0789
7,SVM - Radial Kernel,0.8271,0.7755,0.3736,0.3786,0.3755,0.2753,0.2756,7.1535
8,Ada Boost Classifier,0.8231,0.7803,0.4511,0.3815,0.4116,0.3088,0.3113,1.3208
9,Decision Tree Classifier,0.7735,0.5951,0.3482,0.261,0.2976,0.1661,0.1689,0.3177


<catboost.core.CatBoostClassifier at 0x2b3dc1385d30>

## create, tune, evaluate, predict adaboost model

In [4]:
# Create the "ada" (AdaBoost) model; the table below lists metrics for ten runs
# (presumably cross-validation folds — confirm).
ada = create_model("ada")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8393,0.8144,0.5263,0.4255,0.4706,0.3771,0.3801
1,0.8107,0.7805,0.4359,0.3542,0.3908,0.2802,0.2823
2,0.8393,0.8085,0.5897,0.4423,0.5055,0.4119,0.418
3,0.8143,0.7743,0.4359,0.3617,0.3953,0.2868,0.2885
4,0.8321,0.8092,0.5641,0.4231,0.4835,0.3857,0.3914
5,0.8214,0.7808,0.4615,0.383,0.4186,0.3142,0.3161
6,0.8036,0.7226,0.4103,0.3333,0.3678,0.253,0.2549
7,0.8357,0.7706,0.3333,0.3939,0.3611,0.2676,0.2688
8,0.8179,0.7426,0.3333,0.3421,0.3377,0.2321,0.2321
9,0.8172,0.7993,0.4211,0.3556,0.3855,0.2791,0.2804


In [10]:
# Tune the AdaBoost model, passing choose_better=True.
# NOTE(review): models are compared on min(Recall, Precision) in this notebook,
# yet tuning optimizes Accuracy — confirm this is the intended objective.
tuned_ada = tune_model(
    ada,
    optimize="Accuracy",
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8607,0.8406,0.4737,0.4865,0.48,0.3996,0.3997
1,0.8429,0.8223,0.3846,0.4286,0.4054,0.3152,0.3158
2,0.8321,0.7705,0.5128,0.4167,0.4598,0.3617,0.3644
3,0.85,0.8432,0.4615,0.4615,0.4615,0.3744,0.3744
4,0.8143,0.8159,0.4359,0.3617,0.3953,0.2868,0.2885
5,0.8107,0.8073,0.3077,0.3158,0.3117,0.202,0.202
6,0.8286,0.7551,0.3846,0.3846,0.3846,0.285,0.285
7,0.85,0.8146,0.4103,0.4571,0.4324,0.3463,0.347
8,0.8214,0.7654,0.3846,0.3659,0.375,0.2709,0.271
9,0.8495,0.8197,0.4474,0.4474,0.4474,0.3602,0.3602


In [11]:
# Show pycaret's interactive evaluation widget for the tuned AdaBoost model.
evaluate_model(tuned_ada)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [12]:
# Score the tuned AdaBoost model; the output reports summary metrics plus
# per-row Label and Score columns.
# presumably this runs on pycaret's hold-out split — TODO confirm.
predict_model(tuned_ada)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Ada Boost Classifier,0.8376,0.8097,0.3976,0.4099,0.4037,0.3097,0.3098


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label,Score
0,0.477499,-1.215611,0.925603,-1.152209,0.672376,-0.511354,0.753785,-0.510431,0.160269,-0.915253,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.4866
1,-0.556735,-0.031642,-0.694846,1.181913,0.672376,0.963869,1.427515,1.920192,1.464024,-0.820052,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.4939
2,1.769352,0.035305,-0.633653,0.669598,0.672376,0.434389,-1.360105,0.149794,-0.498416,-0.004823,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,1,0.5040
3,-1.189793,-0.031642,0.178626,1.341102,0.672376,0.963869,0.616506,0.103027,0.517847,0.721668,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.4805
4,-0.057864,-0.031642,-2.314832,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.4925
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,0.483924,-0.551305,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,1,0.5001
1197,-0.505915,-0.031642,-0.478512,1.181913,0.672376,0.963869,-0.827196,0.704247,-0.606748,-0.004823,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.4813
1198,0.406102,-0.031642,-0.819713,1.181913,0.672376,0.963869,2.408744,1.190597,2.268197,-0.632149,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.4988
1199,-2.067770,-0.031642,0.439906,-0.142452,-0.551305,-1.206565,1.025611,0.501095,1.106328,1.170347,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,0.4960


## explore blend models, both hard and soft voting

In [13]:
# Blend models using blend_models' defaults (the section heading treats this as hard voting).
# NOTE(review): the AUC column is 0.0 for every row of this cell's output —
# presumably because hard voting yields no probabilities; confirm.
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8536,0.0,0.6842,0.4727,0.5591,0.4748,0.4865
1,0.8214,0.0,0.5385,0.3962,0.4565,0.3526,0.3586
2,0.8107,0.0,0.4615,0.36,0.4045,0.294,0.2972
3,0.85,0.0,0.5641,0.4681,0.5116,0.4239,0.4265
4,0.8321,0.0,0.5897,0.4259,0.4946,0.3971,0.4047
5,0.8286,0.0,0.3333,0.3714,0.3514,0.2529,0.2534
6,0.8393,0.0,0.5897,0.4423,0.5055,0.4119,0.418
7,0.8357,0.0,0.3846,0.4054,0.3947,0.2998,0.2999
8,0.8464,0.0,0.4359,0.4474,0.4416,0.3525,0.3526
9,0.8136,0.0,0.3684,0.3333,0.35,0.2415,0.2419


In [19]:
# Show pycaret's interactive evaluation widget for the hard-voting blend.
evaluate_model(blend_hard)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [15]:
# Score the hard-voting blend; the output has a Label column but no Score column,
# and AUC is reported as 0.
predict_model(blend_hard)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8293,0,0.4398,0.3946,0.416,0.3163,0.317


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,Gender_not_available,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label
0,0.477499,-1.215611,0.925603,-1.152209,0.672376,-0.511354,0.753785,-0.510431,0.160269,-0.915253,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
1,-0.556735,-0.031642,-0.694846,1.181913,0.672376,0.963869,1.427515,1.920192,1.464024,-0.820052,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0
2,1.769352,0.035305,-0.633653,0.669598,0.672376,0.434389,-1.360105,0.149794,-0.498416,-0.004823,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0
3,-1.189793,-0.031642,0.178626,1.341102,0.672376,0.963869,0.616506,0.103027,0.517847,0.721668,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0
4,-0.057864,-0.031642,-2.314832,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,0.483924,-0.551305,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0
1197,-0.505915,-0.031642,-0.478512,1.181913,0.672376,0.963869,-0.827196,0.704247,-0.606748,-0.004823,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
1198,0.406102,-0.031642,-0.819713,1.181913,0.672376,0.963869,2.408744,1.190597,2.268197,-0.632149,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0
1199,-2.067770,-0.031642,0.439906,-0.142452,-0.551305,-1.206565,1.025611,0.501095,1.106328,1.170347,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0


In [16]:
# Blend models again, this time requesting soft voting.
blend_soft = blend_models(method="soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8571,0.8701,0.7105,0.4821,0.5745,0.4924,0.5058
1,0.8179,0.797,0.5385,0.3889,0.4516,0.3458,0.3524
2,0.825,0.8151,0.4359,0.3864,0.4096,0.3074,0.3081
3,0.8536,0.869,0.6923,0.4821,0.5684,0.4836,0.4951
4,0.8107,0.7946,0.5897,0.3833,0.4646,0.3559,0.3681
5,0.8464,0.8301,0.359,0.4375,0.3944,0.3074,0.3094
6,0.8214,0.812,0.5385,0.3962,0.4565,0.3526,0.3586
7,0.8286,0.7904,0.3846,0.3846,0.3846,0.285,0.285
8,0.8286,0.8066,0.4103,0.3902,0.4,0.3001,0.3002
9,0.8208,0.8518,0.4474,0.3696,0.4048,0.3004,0.3023


In [17]:
# Show pycaret's interactive evaluation widget for the soft-voting blend.
evaluate_model(blend_soft)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [18]:
# Score the soft-voting blend; the output reports summary metrics plus
# per-row Label and Score columns.
predict_model(blend_soft)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8318,0.8098,0.4699,0.4062,0.4358,0.3375,0.3388


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label,Score
0,0.477499,-1.215611,0.925603,-1.152209,0.672376,-0.511354,0.753785,-0.510431,0.160269,-0.915253,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.1323
1,-0.556735,-0.031642,-0.694846,1.181913,0.672376,0.963869,1.427515,1.920192,1.464024,-0.820052,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.3641
2,1.769352,0.035305,-0.633653,0.669598,0.672376,0.434389,-1.360105,0.149794,-0.498416,-0.004823,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0,0.4384
3,-1.189793,-0.031642,0.178626,1.341102,0.672376,0.963869,0.616506,0.103027,0.517847,0.721668,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.0470
4,-0.057864,-0.031642,-2.314832,0.288248,-1.528718,-0.862434,-1.360105,-1.641198,-1.738028,0.721668,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.3850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,1.009600,-1.509279,-0.725754,0.483924,-0.551305,-0.364897,0.055875,-0.295979,-0.320230,-0.447804,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,1,0.5178
1197,-0.505915,-0.031642,-0.478512,1.181913,0.672376,0.963869,-0.827196,0.704247,-0.606748,-0.004823,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.1164
1198,0.406102,-0.031642,-0.819713,1.181913,0.672376,0.963869,2.408744,1.190597,2.268197,-0.632149,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.3089
1199,-2.067770,-0.031642,0.439906,-0.142452,-0.551305,-1.206565,1.025611,0.501095,1.106328,1.170347,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,0.2914


## conclusion

Combining all improvements — fixing the label imbalance, removing multicollinearity, and normalizing / transforming the features — either a single model (e.g. AdaBoost) or a blend of models increases the train or test min(Recall, Precision) to about 0.41.