# 70/30 strategy for model evaluation (revision 2)

In this notebook we will carry out a 70/30 model evaluation for the second revision of
our sepsis manuscript. We will do this for the laboratory and the laboratory + clinical model.

We will first set up the notebook.

In [None]:
# Change the working directory to ../src; presumably this makes the
# project-local packages imported further down (e.g. `utils`) importable
# -- confirm against the repository layout.
%cd ../src
%pwd
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Reload all modules before each cell runs, so edits to the local helper
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

Import all important libraries

In [None]:
# general imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import string
import datetime as dt

# utils import
from utils.files import get_latest_version
from utils.cross_validation import cross_validate_ROC, cross_validate_risk_ROC, compare_models, cross_validate_calibration
from utils.risk_scores import read_data_risk_score
from xgboost.cross_validate import read_data

# xgboost
from xgboost import XGBClassifier

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc

Then, we define the model version to use. 
This is version 20 for shock, and 22 for mortality.

In [None]:
# Pinned model versions, one per outcome: shock first, mortality second.
model_version_shock, model_version_mort = 20, 22

Next, we will read all the data.

In [None]:
# Both datasets are read for the shock outcome (out = 3), so they use the
# shock model version. The original passed `model_version_mort` here, which
# contradicts the outcome being requested and left `model_version_shock` unused.
# NOTE(review): if mortality was the intended outcome, change `out` instead.

# model = 1 is stored as the laboratory dataset.
data_dict_lab = read_data(model = 1,
                          out = 3, # shock
                          version = model_version_shock)

# model = 3 is stored as the laboratory + clinical dataset.
data_dict_lab_clinical = read_data(model = 3,
                                   out = 3, # shock
                                   version = model_version_shock)

Perform a 70/30 split for both datasets

In [None]:
# Laboratory dataset: hold out 30% of the rows for testing. The seed is fixed
# so the partition is reproducible.
train_lab_x, test_lab_x, train_lab_y, test_lab_y = train_test_split(
    data_dict_lab['x'],
    data_dict_lab['y'],
    test_size=0.3,
    random_state=1106,
)

# Laboratory + clinical dataset: same test fraction and the same seed.
train_clin_x, test_clin_x, train_clin_y, test_clin_y = train_test_split(
    data_dict_lab_clinical['x'],
    data_dict_lab_clinical['y'],
    test_size=0.3,
    random_state=1106,
)

Train the laboratory model and the laboratory + clinical model

In [None]:
# Laboratory model: an XGBClassifier with default settings, fitted on the
# training split. `eval_set` is built but not passed to fit(); the
# eval_metric / eval_set / early_stopping_rounds arguments stay disabled.
eval_set = [(train_lab_x, train_lab_y), (test_lab_x, test_lab_y)]
lab_model = XGBClassifier()
lab_model.fit(train_lab_x, train_lab_y, verbose=False)

# Laboratory + clinical model: same recipe on the second dataset.
eval_set = [(train_clin_x, train_clin_y), (test_clin_x, test_clin_y)]
clin_model = XGBClassifier()
clin_model.fit(train_clin_x, train_clin_y, verbose=False)

Plot a ROC curve for the laboratory model

In [None]:
# ROC curve of the laboratory model on the held-out test split, scored with
# the second column of predict_proba.
fpr, tpr, thresholds = roc_curve(test_lab_y, lab_model.predict_proba(test_lab_x)[:,1])
roc_auc = auc(fpr, tpr)

# Draw on an explicit Figure/Axes instead of the implicit pyplot "current
# figure", so this cell never paints onto a figure left open elsewhere
# (for example when the notebook is run as a script or with a non-inline backend).
fig, ax = plt.subplots()

# ROC curve
ax.plot(fpr, tpr, lw=1, alpha=0.3,
        label='AUC = %0.2f' % (roc_auc))

# Reference line (chance level)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('1 - specificity', fontsize=12)
ax.set_ylabel('Sensitivity', fontsize=12)
ax.legend(loc="lower right",
          frameon=True,
          fontsize=8)
ax.grid(linestyle=':')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Save exactly this figure (not whichever one pyplot considers current).
fig.savefig('lab.png')

Plot a ROC curve for the laboratory + clinical model

In [None]:
# ROC curve of the laboratory + clinical model on the held-out test split,
# scored with the second column of predict_proba.
fpr, tpr, thresholds = roc_curve(test_clin_y, clin_model.predict_proba(test_clin_x)[:,1])
roc_auc = auc(fpr, tpr)

# Draw on a fresh, explicit Figure/Axes. With the implicit pyplot state this
# curve would be added to any figure that is still open, and that combined
# figure would then be written to clinical.png.
fig, ax = plt.subplots()

# ROC curve
ax.plot(fpr, tpr, lw=1, alpha=0.3,
        label='AUC = %0.2f' % (roc_auc))

# Reference line (chance level)
ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)

ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_xlabel('1 - specificity', fontsize=12)
ax.set_ylabel('Sensitivity', fontsize=12)
ax.legend(loc="lower right",
          frameon=True,
          fontsize=8)
ax.grid(linestyle=':')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Save exactly this figure (not whichever one pyplot considers current).
fig.savefig('clinical.png')