In [None]:
##import necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
import xgboost as xgboost
from xgboost import XGBClassifier
from sklearn.calibration import calibration_curve
from sklearn.metrics import auc, roc_curve, recall_score, precision_score, balanced_accuracy_score, brier_score_loss


In [None]:
# Gradient-boosted tree classifier used by the later cells of this notebook.
# Binary logistic objective, 100 boosting rounds, learning rate 0.3 and an
# L1 regularisation weight (reg_alpha) of 1.
xgbcomplete = XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.3,
    reg_alpha=1,
)

In [None]:
# Read the injury-variable dataset; the first CSV row supplies the column names.
filename = 'revisions 2/injury variables only.csv'
complete_data = pd.read_csv(filepath_or_buffer=filename, header=0)

# Preview the first rows to confirm the file loaded as expected.
complete_data.head()

In [None]:
## injury model: repeated stratified 50/50 hold-out evaluation of xgbcomplete

# Fixed seed so the random splits (and therefore the reported metrics) are
# reproducible across Restart & Run All.
CV_RANDOM_STATE = 42

# Features are every column except the outcome ('mortality') and 'inc_key'.
X_data_comp = complete_data.drop(['mortality', 'inc_key'], axis=1)
y_data_comp = complete_data['mortality']

# One entry per split for each metric.
auc_scores = []
recall_scores = []
precision_scores = []
ba_scores = []
brier_scores = []

sss = StratifiedShuffleSplit(n_splits=10, test_size=0.50, random_state=CV_RANDOM_STATE)

for train_index, test_index in sss.split(X_data_comp, y_data_comp):
    X_train_CV, X_test_CV = X_data_comp.iloc[train_index], X_data_comp.iloc[test_index]
    y_train_CV, y_test_CV = y_data_comp.iloc[train_index], y_data_comp.iloc[test_index]

    # fit() retrains the classifier on this split's training half.
    xgbcomplete.fit(X_train_CV, y_train_CV)

    # Hard class labels feed the threshold-dependent metrics; column 1 of
    # predict_proba feeds the probability-based ones (ROC AUC and Brier score).
    y_test_preds = xgbcomplete.predict(X_test_CV)
    y_test_proba = xgbcomplete.predict_proba(X_test_CV)[:, 1]

    test_FPR, test_TPR, test_threshold = roc_curve(y_test_CV, y_test_proba)
    auc_scores.append(auc(test_FPR, test_TPR))
    recall_scores.append(recall_score(y_test_CV, y_test_preds))
    precision_scores.append(precision_score(y_test_CV, y_test_preds))
    ba_scores.append(balanced_accuracy_score(y_test_CV, y_test_preds))
    # The Brier score measures the accuracy of predicted probabilities; passing
    # hard 0/1 labels would reduce it to the misclassification rate.
    brier_scores.append(brier_score_loss(y_test_CV, y_test_proba))

# NOTE(review): after this loop xgbcomplete remains fitted on the training half
# of the final split only; refit on the complete data if a final model is needed.

# Mean and standard deviation of each metric across the splits.
for metric_name, metric_values in (
    ('AUC', auc_scores),
    ('Recall', recall_scores),
    ('Precision', precision_scores),
    ('Balanced accuracy', ba_scores),
    ('Brier score', brier_scores),
):
    print(metric_name, 'mean:', np.mean(metric_values), 'std:', np.std(metric_values))

In [None]:
## how to apply this model to external data with injury variables already defined

# NOTE(review): xgbcomplete must already be fitted when this cell runs. The
# cross-validation cell leaves it fitted on the training half of its last split;
# consider refitting on the complete data before scoring external patients.

# Read the external CSV into a DataFrame (a bare path string has no .drop()).
data = pd.read_csv('external data.csv', header=0)

# Drop the identifier, then select the training features in training order so
# the columns passed to the model match the ones it was fitted on.
X_data_external = data.drop(['patient_identifier'], axis=1)
X_data_external = X_data_external[X_data_comp.columns]

# predict_proba returns a NumPy array with one column per class, which has no
# .to_csv(). Keep column 1, as the cross-validation cell does, and pair it with
# the patient identifier so rows can be matched after export.
# Assumes 'mortality' is coded 0/1 so that column 1 is the death class -- TODO confirm.
predicted_mortality = pd.DataFrame({
    'patient_identifier': data['patient_identifier'],
    'predicted_mortality': xgbcomplete.predict_proba(X_data_external)[:, 1],
})

predicted_mortality.to_csv('predicted.csv', index=False)

## load data into statistical software of choice