# Model3_multi_collinear

The purpose of this notebook / model experiment is to improve upon the previous fix_imbalance model achieved from model2_fix_imbalance.ipynb, by removing multicollinearity

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
df_ts_agg = pd.read_csv("../proData/df_ts_agg.csv")
df_ts_agg = df_ts_agg.set_index("PATIENT_ID")

df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

with "remove_multicollinearity" set as true, some linear kernal-based model performance is expected to be improved

In [3]:
exp_physionet = setup(data = df_ts_agg, target = 'ihd', session_id=123,
                      numeric_features = ["GCS_min", "GCS_max"],
                      #normalize = True, 
                      #transformation = True, 
                      #ignore_low_variance = True,
                      remove_multicollinearity = True, multicollinearity_threshold = 0.9,
                      fix_imbalance = True
                     ) 

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,110
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

min(Recall, Precision) is used for comparing, as suggested by the original physionet 2012 challenge

In [4]:
compare_models(turbo = False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Light Gradient Boosting Machine,0.866,0.8245,0.3067,0.528,0.3863,0.3175,0.3329,1.9403
1,CatBoost Classifier,0.866,0.8334,0.3146,0.5255,0.3906,0.3217,0.3361,16.4762
2,Extreme Gradient Boosting,0.8639,0.8224,0.3146,0.5142,0.3877,0.3169,0.33,3.6484
3,Extra Trees Classifier,0.8628,0.8181,0.2503,0.5091,0.3344,0.2682,0.2897,0.5912
4,SVM - Radial Kernel,0.8614,0.5031,0.0,0.0,0.0,0.0,0.0,13.3426
5,Gaussian Process Classifier,0.8614,0.5,0.0,0.0,0.0,0.0,0.0,14.2782
6,Gradient Boosting Classifier,0.8596,0.8232,0.3816,0.4951,0.4283,0.3502,0.3554,6.5811
7,Random Forest Classifier,0.84,0.7649,0.2917,0.3957,0.3318,0.2446,0.2498,0.2628
8,Ada Boost Classifier,0.8235,0.7711,0.3684,0.3702,0.3669,0.265,0.2661,1.5173
9,MLP Classifier,0.7867,0.7421,0.4565,0.3368,0.3689,0.2508,0.2639,6.1291


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=123, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

by removing multicollinearity, some linear kernel-based classifier is increased

## create, tune, evaluate, predict gradient boosting classifier

In [11]:
gbc = create_model("gbc")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8821,0.871,0.5,0.5758,0.5352,0.4681,0.4696
1,0.8571,0.8122,0.4103,0.4848,0.4444,0.3631,0.3648
2,0.85,0.8021,0.359,0.4516,0.4,0.3156,0.3183
3,0.8821,0.8573,0.4103,0.6154,0.4923,0.4286,0.4399
4,0.8429,0.8104,0.4615,0.439,0.45,0.3584,0.3586
5,0.8571,0.8485,0.3077,0.48,0.375,0.2987,0.3081
6,0.8429,0.8002,0.359,0.4242,0.3889,0.2994,0.3008
7,0.8607,0.8117,0.3846,0.5,0.4348,0.3569,0.3609
8,0.8607,0.7986,0.3077,0.5,0.381,0.3075,0.319
9,0.8602,0.8197,0.3158,0.48,0.381,0.3059,0.3145


In [17]:
tuned_gbc = tune_model(gbc, optimize = "F1", choose_better = True)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.875,0.8745,0.4737,0.5455,0.507,0.4359,0.4373
1,0.8464,0.8097,0.5128,0.4545,0.4819,0.3922,0.3931
2,0.8393,0.832,0.4615,0.4286,0.4444,0.3506,0.351
3,0.85,0.8157,0.3846,0.4545,0.4167,0.3313,0.3328
4,0.8179,0.7955,0.3077,0.3333,0.32,0.215,0.2153
5,0.8036,0.8218,0.2821,0.2895,0.2857,0.1719,0.1719
6,0.8286,0.8118,0.4103,0.3902,0.4,0.3001,0.3002
7,0.8286,0.7942,0.3333,0.3714,0.3514,0.2529,0.2534
8,0.8607,0.8066,0.359,0.5,0.4179,0.3412,0.3473
9,0.8638,0.858,0.3684,0.5,0.4242,0.349,0.3542


In [18]:
evaluate_model(tuned_gbc)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [19]:
predict_model(tuned_gbc)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Gradient Boosting Classifier,0.856,0.8217,0.3072,0.4679,0.3709,0.2935,0.3018


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label,Score
0,74.0,160.000000,103.00000,3.0,15.0,11.5,79.0,100.0,88.0,30.000000,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.0657
1,57.0,169.677895,68.00000,14.0,15.0,15.0,89.0,161.0,109.0,31.000000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.2504
2,90.0,170.200000,69.00000,11.0,15.0,14.0,50.0,113.0,78.5,40.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0,0.2276
3,43.0,169.677895,84.04358,15.0,15.0,15.0,77.0,112.0,93.5,49.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.0164
4,66.0,169.677895,45.00000,9.0,11.0,10.0,50.0,81.0,62.0,49.000000,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.0945
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,67.50000,10.0,14.0,12.0,69.0,104.0,81.0,35.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0,0.4819
1197,58.0,169.677895,71.60000,14.0,15.0,15.0,57.0,126.0,77.0,40.000000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.0299
1198,73.0,169.677895,66.00000,14.0,15.0,15.0,104.0,139.0,123.0,33.000000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.1429
1199,19.0,169.677895,90.00000,7.0,14.0,8.0,83.0,121.0,103.0,55.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,0.1690


## explore blend models, both hard and soft voting

In [4]:
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.875,0.0,0.5526,0.5385,0.5455,0.473,0.4731
1,0.8429,0.0,0.4615,0.439,0.45,0.3584,0.3586
2,0.8429,0.0,0.4359,0.4359,0.4359,0.3446,0.3446
3,0.8607,0.0,0.4103,0.5,0.4507,0.3718,0.3742
4,0.8286,0.0,0.4615,0.4,0.4286,0.3283,0.3295
5,0.8393,0.0,0.2821,0.3929,0.3284,0.2399,0.2441
6,0.8357,0.0,0.4359,0.4146,0.425,0.3292,0.3294
7,0.85,0.0,0.3846,0.4545,0.4167,0.3313,0.3328
8,0.8536,0.0,0.3846,0.4688,0.4225,0.3396,0.3418
9,0.853,0.0,0.3158,0.4444,0.3692,0.2888,0.2942


In [5]:
evaluate_model(blend_hard)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [6]:
predict_model(blend_hard)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8451,0,0.3193,0.4206,0.363,0.2767,0.2801


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,Gender_not_available,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label
0,74.0,160.000000,103.00000,3.0,15.0,11.5,79.0,100.0,88.0,30.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
1,57.0,169.677895,68.00000,14.0,15.0,15.0,89.0,161.0,109.0,31.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0
2,90.0,170.200000,69.00000,11.0,15.0,14.0,50.0,113.0,78.5,40.000000,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0
3,43.0,169.677895,84.04358,15.0,15.0,15.0,77.0,112.0,93.5,49.000000,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0
4,66.0,169.677895,45.00000,9.0,11.0,10.0,50.0,81.0,62.0,49.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,67.50000,10.0,14.0,12.0,69.0,104.0,81.0,35.000000,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,0
1197,58.0,169.677895,71.60000,14.0,15.0,15.0,57.0,126.0,77.0,40.000000,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0
1198,73.0,169.677895,66.00000,14.0,15.0,15.0,104.0,139.0,123.0,33.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0
1199,19.0,169.677895,90.00000,7.0,14.0,8.0,83.0,121.0,103.0,55.000000,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0


In [7]:
blend_soft = blend_models(method = "soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8607,0.8593,0.5789,0.4889,0.5301,0.449,0.4513
1,0.8536,0.8063,0.4872,0.475,0.481,0.3958,0.3958
2,0.8429,0.8154,0.359,0.4242,0.3889,0.2994,0.3008
3,0.8643,0.851,0.5128,0.5128,0.5128,0.434,0.434
4,0.8321,0.7947,0.4359,0.4048,0.4198,0.3218,0.3221
5,0.85,0.8227,0.3077,0.4444,0.3636,0.2818,0.2879
6,0.8286,0.8013,0.4359,0.3953,0.4146,0.3145,0.315
7,0.8393,0.8027,0.4359,0.425,0.4304,0.3368,0.3369
8,0.8393,0.8426,0.359,0.4118,0.3836,0.2917,0.2926
9,0.853,0.844,0.3684,0.4516,0.4058,0.3229,0.3251


In [8]:
evaluate_model(blend_soft)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [9]:
predict_model(blend_soft)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8443,0.8171,0.4217,0.4348,0.4281,0.338,0.3381


Unnamed: 0,Age,Height,Weight_max,GCS_min,GCS_max,GCS_med,HR_min,HR_max,HR_med,NIDiasABP_min,...,ICUType_Cardiac Surgery Recovery Unit,ICUType_Coronary Care Unit,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_1,MechVent_med_0.5,ihd,Label,Score
0,74.0,160.000000,103.00000,3.0,15.0,11.5,79.0,100.0,88.0,30.000000,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.0804
1,57.0,169.677895,68.00000,14.0,15.0,15.0,89.0,161.0,109.0,31.000000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0,0,0.3977
2,90.0,170.200000,69.00000,11.0,15.0,14.0,50.0,113.0,78.5,40.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,1,0.5006
3,43.0,169.677895,84.04358,15.0,15.0,15.0,77.0,112.0,93.5,49.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0,0,0.0926
4,66.0,169.677895,45.00000,9.0,11.0,10.0,50.0,81.0,62.0,49.000000,...,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0,0,0.3470
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,67.50000,10.0,14.0,12.0,69.0,104.0,81.0,35.000000,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,1,0.5070
1197,58.0,169.677895,71.60000,14.0,15.0,15.0,57.0,126.0,77.0,40.000000,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,0,0.1018
1198,73.0,169.677895,66.00000,14.0,15.0,15.0,104.0,139.0,123.0,33.000000,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1,0,0.2953
1199,19.0,169.677895,90.00000,7.0,14.0,8.0,83.0,121.0,103.0,55.000000,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0,0,0.1935


## conclusion

building upon the imbalance-fixed model, through additional removal of multicollinearity, either a single model, e.g. gradient boosting classifier, or blend models has increased train or test min(recall, precision) to 0.43 or 0.42