In [7]:
!pip install xgboost

Collecting xgboost
  Downloading https://files.pythonhosted.org/packages/b1/11/cba4be5a737c6431323b89b5ade818b3bbe1df6e8261c6c70221a767c5d9/xgboost-1.0.2-py3-none-win_amd64.whl (24.6MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.0.2


In [8]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import lightgbm as lgb
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from lightgbm import LGBMClassifier
from joblib import dump, load
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# Reading in X and Y data that resulted from feature engineering notebook. Note, this is for validation purposes only. 
# The actual models will be run as part of a pipeline that will directly feed in the data at this point.

X_wids = pd.read_csv("data/X_wids_model.csv")
y_wids = pd.read_csv("data/y_wids_model.csv")

In [3]:
print(X_wids.shape, y_wids.shape)

(91713, 295) (91713, 1)


In [32]:
categorical_cols = ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type',
                          'apache_3j_bodysystem', 'apache_2_bodysystem', 'apache_3j_diagnosis_desc']

In [33]:
for c in categorical_cols:
    X_wids[c] = X_wids[c].str.replace("\W", "_")

In [34]:
# Create dummies for all categorical variables

X_wids_dum = pd.get_dummies(X_wids)
X_wids_dum.shape

(91713, 453)

In [35]:
# Split train-test dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_wids_dum, y_wids, test_size = 0.2, 
                                                    random_state = 12, stratify = y_wids)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(73370, 453) (18343, 453) (73370, 1) (18343, 1)


### XGBoost (Manual)

####### Discussions of hyperparameters here:

In [41]:
XGB = XGBClassifier(n_jobs = 1, learning_rate = .1, n_estimators = 100, objective = 'binary:hinge', booster='gbtree', max_depth = 7 )

model_XGB = XGB.fit(X_train, y_train)
XGB_y_pred = model_XGB.predict(X_test)
XGB_y_pred

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [51]:
scores = cross_val_score(model_XGB, X_train, y_train, cv = 4, scoring = 'f1')
scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.40918942, 0.41727862, 0.42234932, 0.41271262])

In [42]:
cm = confusion_matrix(y_test, XGB_y_pred)
cm

array([[16538,   222],
       [ 1085,   498]], dtype=int64)

In [43]:
print("Precision Score:", precision_score(y_test, XGB_y_pred))
print("Recall Score:",recall_score(y_test, XGB_y_pred))
print("F1 Score:",f1_score(y_test, XGB_y_pred))

Precision Score: 0.6916666666666667
Recall Score: 0.3145925457991156
F1 Score: 0.4324793747286148


In [44]:
XGB_y_prob_pred = model_XGB.predict_proba(X_test)
XGB_y_prob_pred = XGB_y_prob_pred[:,1]
r_a_score = roc_auc_score(y_test, XGB_y_prob_pred)
print("ROC-AUC-Score:", r_a_score)

ROC-AUC-Score: 0.6506733612050472


### Random Forest (Manual)

####### Discussions of hyperparameters here:

In [45]:
rf = RandomForestClassifier(n_estimators= 100, max_depth = 7)

model_rf = rf.fit(X_train, y_train)
rf_y_pred = model_rf.predict(X_test)
rf_y_pred

  This is separate from the ipykernel package so we can avoid doing imports until


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [49]:
scores = cross_val_score(model_rf, X_train, y_train, cv = 4, scoring = 'f1')
scores

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


array([0.31      , 0.30929886, 0.31356784, 0.29778465])

In [46]:
cm = confusion_matrix(y_test, rf_y_pred)
cm

array([[16656,   104],
       [ 1264,   319]], dtype=int64)

In [47]:
print("Precision Score:", precision_score(y_test, rf_y_pred))
print("Recall Score:",recall_score(y_test, rf_y_pred))
print("F1 Score:",f1_score(y_test, rf_y_pred))

Precision Score: 0.7541371158392435
Recall Score: 0.20151610865445357
F1 Score: 0.3180458624127617


In [48]:
rf_y_prob_pred = model_rf.predict_proba(X_test)
rf_y_prob_pred = rf_y_prob_pred[:,1]
r_a_score = roc_auc_score(y_test, rf_y_prob_pred)
print("ROC-AUC-Score:", r_a_score)

ROC-AUC-Score: 0.8643809072227742


### LightGBM (Manual)

####### Discussions of hyperparameters here:

In [36]:
lgbm = lgb.LGBMClassifier(booster ='gbtree', n_estimators = 100, learning_rate = .1, max_depth = 7, objective ='binary')

model_lgbm = lgbm.fit(X_train, y_train)
lgbm_y_pred = model_lgbm.predict(X_test)
lgbm_y_pred

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [37]:
scores = cross_val_score(model_lgbm, X_train, y_train, cv = 4, scoring = 'f1')
scores

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0.45077503, 0.47093266, 0.46949153, 0.45169713])

In [38]:
cm = confusion_matrix(y_test, lgbm_y_pred)
cm

array([[16499,   261],
       [ 1005,   578]], dtype=int64)

In [39]:
print("Precision Score:", precision_score(y_test, lgbm_y_pred))
print("Recall Score:",recall_score(y_test, lgbm_y_pred))
print("F1 Score:",f1_score(y_test, lgbm_y_pred))

Precision Score: 0.6889153754469607
Recall Score: 0.3651295009475679
F1 Score: 0.47729149463253506


In [40]:
lgbm_y_prob_pred = model_lgbm.predict_proba(X_test)
lgbm_y_prob_pred = lgbm_y_prob_pred[:,1]
r_a_score = roc_auc_score(y_test, lgbm_y_prob_pred)
print("ROC-AUC-Score:", r_a_score)

ROC-AUC-Score: 0.8974472580837268
