# Model1_base

The purpose of this notebook / model experiment is to set up a baseline with a minimal model configuration.

In [1]:
import pandas as pd
from pycaret.classification import *

## import data

In [2]:
# Load the aggregated table from CSV and index it by the PATIENT_ID column.
df_ts_agg = (
    pd.read_csv("../proData/df_ts_agg.csv")
    .set_index("PATIENT_ID")
)

# Bare last expression so the notebook renders the frame.
df_ts_agg

Unnamed: 0_level_0,ihd,Age,Gender,Height,ICUType,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,...,Lactate_med,Cholesterol_min,Cholesterol_max,Cholesterol_med,TroponinI_min,TroponinI_max,TroponinI_med,TroponinT_min,TroponinT_max,TroponinT_med
PATIENT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
132539,0,54,female,,Surgical ICU,,,,14.0,15.0,...,,,,,,,,,,
132540,0,76,male,175.3,Cardiac Surgery Recovery Unit,76.0,81.6,80.6,3.0,15.0,...,,,,,,,,,,
132541,0,44,female,,Medical ICU,56.7,56.7,56.7,5.0,8.0,...,1.3,,,,,,,,,
132543,0,68,male,180.3,Medical ICU,84.6,84.6,84.6,14.0,15.0,...,,,,,,,,,,
132545,0,88,female,,Medical ICU,,,,15.0,15.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142665,0,70,female,,Surgical ICU,87.0,87.0,87.0,3.0,15.0,...,2.3,,,,,,,,,
142667,0,25,male,,Medical ICU,166.4,166.4,166.4,15.0,15.0,...,,117.0,117.0,117.0,,,,,,
142670,0,44,male,,Medical ICU,109.0,109.0,109.0,3.0,8.0,...,,,,,,,,,,
142671,1,37,male,,Medical ICU,87.4,87.4,87.4,3.0,7.0,...,1.9,,,,,,,,,


## setup experiment with pycaret

For the baseline, only the minimum set of setup parameters is used.

In [3]:
# Baseline experiment: only the data, the target column and a fixed
# session_id are supplied, plus an explicit numeric declaration for
# GCS_min / GCS_max (presumably to override automatic type inference -- confirm).
#
# Preprocessing options deliberately left disabled for this baseline:
#   normalize, transformation, ignore_low_variance,
#   remove_multicollinearity (multicollinearity_threshold = 0.9),
#   fix_imbalance
exp_physionet = setup(
    data=df_ts_agg,
    target="ihd",
    session_id=123,
    numeric_features=["GCS_min", "GCS_max"],
)

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,123
1,Target Type,Binary
2,Label Encoded,"0: 0, 1: 1"
3,Original Data,"(4000, 116)"
4,Missing Values,True
5,Numeric Features,110
6,Categorical Features,5
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


## compare common classification models

min(Recall, Precision) is used to compare models, as suggested by the original PhysioNet 2012 challenge. Note that the table below is sorted by accuracy, so this score has to be read off each row.

In [4]:
# turbo=False also evaluates the slower estimators; the results below include
# e.g. "SVM - Radial Kernel" and "Gaussian Process Classifier".
compare_models(turbo=False)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.8771,0.8503,0.209,0.6997,0.3169,0.2711,0.333,11.6395
1,Gradient Boosting Classifier,0.8739,0.8391,0.2298,0.6266,0.3308,0.2786,0.3233,2.8791
2,Light Gradient Boosting Machine,0.8717,0.8319,0.2089,0.6318,0.3098,0.2582,0.3086,0.8473
3,Extra Trees Classifier,0.8685,0.8325,0.0776,0.7583,0.1382,0.1161,0.2077,0.2762
4,Ridge Classifier,0.8678,0.0,0.1288,0.6537,0.2115,0.1723,0.2436,0.0343
5,Random Forest Classifier,0.866,0.7706,0.1213,0.5638,0.1973,0.1582,0.2147,0.1153
6,Extreme Gradient Boosting,0.865,0.8391,0.2216,0.5415,0.3114,0.251,0.2834,1.6872
7,Logistic Regression,0.8614,0.7591,0.134,0.4955,0.2074,0.1596,0.202,0.0891
8,SVM - Radial Kernel,0.8614,0.4979,0.0,0.0,0.0,0.0,0.0,4.9327
9,Gaussian Process Classifier,0.8614,0.5,0.0,0.0,0.0,0.0,0.0,3.576


<catboost.core.CatBoostClassifier at 0x2b95c4a13c88>

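With this simple model setup, without even correcting the label imbalance, the Naive Bayes base model offers min(recall, precision) = 0.34. (Naive Bayes is not among the ten rows displayed above; its cross-validated scores are shown in the next section.)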

## create Naive Bayesian model

In [4]:
# Train the Naive Bayes estimator (PyCaret id "nb"); the per-fold
# cross-validation scores are rendered as the cell output.
nb = create_model("nb")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8286,0.7802,0.4737,0.3913,0.4286,0.3288,0.3309
1,0.8143,0.7837,0.359,0.3415,0.35,0.2417,0.2418
2,0.8286,0.7988,0.4615,0.4,0.4286,0.3283,0.3295
3,0.8393,0.7849,0.359,0.4118,0.3836,0.2917,0.2926
4,0.7643,0.7272,0.2821,0.2245,0.25,0.1123,0.1133
5,0.7964,0.7896,0.3846,0.3125,0.3448,0.2258,0.2276
6,0.7857,0.6991,0.1538,0.1818,0.1667,0.0447,0.0449
7,0.8429,0.7401,0.4615,0.439,0.45,0.3584,0.3586
8,0.8464,0.7731,0.4359,0.4474,0.4416,0.3525,0.3526
9,0.8029,0.7633,0.2632,0.2703,0.2667,0.1528,0.1528


## tune Naive Bayesian base model

In [14]:
# Tune the Naive Bayes model, optimising for AUC.
# choose_better=True: per the PyCaret docs, the untuned model is returned
# if tuning does not improve the score -- confirm for the installed version.
tuned_nb = tune_model(
    nb,
    optimize="AUC",
    choose_better=True,
)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8464,0.8088,0.4737,0.439,0.4557,0.3664,0.3668
1,0.8286,0.7716,0.4103,0.3902,0.4,0.3001,0.3002
2,0.8607,0.7944,0.4359,0.5,0.4658,0.3861,0.3873
3,0.8357,0.7846,0.3077,0.3871,0.3429,0.2504,0.2525
4,0.7679,0.7142,0.2564,0.2174,0.2353,0.0995,0.1
5,0.8143,0.7885,0.359,0.3415,0.35,0.2417,0.2418
6,0.8,0.6819,0.1538,0.2069,0.1765,0.0654,0.0664
7,0.85,0.7419,0.4359,0.4595,0.4474,0.3607,0.3608
8,0.85,0.7634,0.3846,0.4545,0.4167,0.3313,0.3328
9,0.8065,0.7672,0.2368,0.2647,0.25,0.1393,0.1396


The tuned Naive Bayes base model offers a slightly higher min(recall, precision) = 0.35.

## evaluate the tuned Naive Bayesian base model

In [6]:
# Show PyCaret's interactive evaluation widget (plot-type toggle buttons)
# for the tuned Naive Bayes model.
evaluate_model(tuned_nb)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

## predict label on unseen data, using tuned Naive Bayesian base model

In [8]:
# No `data` argument is passed; per the PyCaret docs the model is then scored
# on the hold-out split created by setup() -- confirm for the installed version.
# The returned frame carries the predictions in the Label and Score columns.
predict_model(tuned_nb)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.8185,0.7834,0.3795,0.3539,0.3663,0.2605,0.2607


Unnamed: 0,Age,Height,Weight_min,Weight_max,Weight_med,GCS_min,GCS_max,GCS_med,HR_min,HR_max,...,ICUType_Medical ICU,ICUType_Surgical ICU,MechVent_min_0,MechVent_max_0,MechVent_max_1,MechVent_med_0.0,MechVent_med_0.5,ihd,Label,Score
0,74.0,160.000000,89.300000,103.00000,102.600000,3.0,15.0,11.5,79.0,100.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.0000
1,57.0,169.677895,68.000000,68.00000,68.000000,14.0,15.0,15.0,89.0,161.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0,1,0.9992
2,90.0,170.200000,66.900000,69.00000,66.900000,11.0,15.0,14.0,50.0,113.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,1,1.0000
3,43.0,169.677895,81.083572,84.04358,83.243402,15.0,15.0,15.0,77.0,112.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0000
4,66.0,169.677895,45.000000,45.00000,45.000000,9.0,11.0,10.0,50.0,81.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0,0,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1196,81.0,157.500000,66.800000,67.50000,67.500000,10.0,14.0,12.0,69.0,104.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1,0,0.0000
1197,58.0,169.677895,71.100000,71.60000,71.600000,14.0,15.0,15.0,57.0,126.0,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0,0,0.0000
1198,73.0,169.677895,66.000000,66.00000,66.000000,14.0,15.0,15.0,104.0,139.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1,0,0.0000
1199,19.0,169.677895,90.000000,90.00000,90.000000,7.0,14.0,8.0,83.0,121.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0,0,0.0000


On the unseen data, min(recall, precision) = min(0.3795, 0.3539) ≈ 0.35.

## explore blend models, both hard and soft voting

In [9]:
# Voting ensemble built with blend_models' default arguments (used here as the
# hard-voting variant). AUC is reported as 0.0 in every fold below, presumably
# because hard voting yields no class probabilities -- confirm.
blend_hard = blend_models()

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8893,0.0,0.2105,0.8889,0.3404,0.3043,0.4008
1,0.875,0.0,0.1538,0.75,0.2553,0.2183,0.3025
2,0.8857,0.0,0.2564,0.7692,0.3846,0.3386,0.4015
3,0.8857,0.0,0.1795,1.0,0.3043,0.2736,0.3981
4,0.8714,0.0,0.1538,0.6667,0.25,0.2087,0.2776
5,0.8607,0.0,0.0513,0.5,0.093,0.0689,0.1254
6,0.8464,0.0,0.1026,0.3333,0.1569,0.0977,0.1186
7,0.8714,0.0,0.1282,0.7143,0.2174,0.1827,0.2659
8,0.8643,0.0,0.1538,0.5455,0.24,0.1904,0.2372
9,0.871,0.0,0.1316,0.625,0.2174,0.1785,0.2448


In [10]:
# Same voting ensemble, but with soft voting so that an AUC can be reported.
blend_soft = blend_models(method="soft")

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8929,0.8744,0.2895,0.7857,0.4231,0.3776,0.4354
1,0.8821,0.813,0.2051,0.8,0.3265,0.2859,0.3672
2,0.8821,0.8728,0.2564,0.7143,0.3774,0.3279,0.381
3,0.8893,0.8455,0.2051,1.0,0.3404,0.3076,0.4263
4,0.8643,0.779,0.1795,0.5385,0.2692,0.2145,0.2544
5,0.8607,0.8316,0.0769,0.5,0.1333,0.0999,0.1542
6,0.8571,0.7725,0.2051,0.4706,0.2857,0.2197,0.2433
7,0.8857,0.8207,0.2308,0.8182,0.36,0.3182,0.3965
8,0.8714,0.8234,0.1795,0.6364,0.28,0.233,0.2903
9,0.871,0.8207,0.1579,0.6,0.25,0.2049,0.2607


Neither blend_hard nor blend_soft performs well, most likely because the majority of the underlying models perform poorly on the imbalanced label.

## conclusion

Even with the simplest model and minimal tuning effort, min(recall, precision) = 0.35 is achieved, surpassing the SAPS-1 score of ~0.31 (not an apples-to-apples comparison).
This promises better model performance once more thoughtful steps are applied, including fixing the label imbalance, removing multicollinearity, normalizing/transforming variables, etc.

The following notebooks address these considerations.