In [37]:
# load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import zipfile
import urllib.request
import io
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
warnings.filterwarnings('ignore')

In [38]:
# load raw and processed data
train_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train/all_feat/train_latest.csv')
valid_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/valid/all_feat/valid_latest.csv')

In [39]:
train_latest.head(5)

Unnamed: 0,id,subject_id,admittime,dischtime,readmitted_within_30days,LoS,prev_admits,age,gender,ethnicity,...,CARDIAC DRUGS,PRE-NATAL VITAMINS,ANESTHETICS,ANTIBIOTICS,ANTIHYPERGLYCEMICS,SEDATIVE/HYPNOTICS,ANTIDOTES,AUTONOMIC DRUGS,VITAMINS,BIOLOGICALS
0,17195991_23542772,17195991,2110-01-11 22:47:00,2110-01-18 10:25:00,0,-0.405473,-0.409609,-0.329139,0,6,...,-0.379713,-0.013935,-0.455273,-0.449292,-0.318458,-0.418357,-0.047707,2.185809,-0.381595,-0.304994
1,13721591_20342223,13721591,2110-02-09 18:13:00,2110-02-22 20:51:00,0,-0.026227,-0.409609,-0.081773,0,6,...,-0.379713,-0.013935,0.210369,-0.130508,-0.232069,-0.418357,-0.047707,-0.30697,-0.381595,-0.304994
2,19170541_22178312,19170541,2110-02-28 21:48:00,2110-03-12 17:47:00,0,-0.134583,-0.409609,-1.380447,1,3,...,-0.379713,-0.013935,0.210369,-0.2899,-0.318458,-0.418357,-0.047707,-0.30697,-0.381595,-0.304994
3,15554295_27705504,15554295,2110-03-09 03:54:00,2110-05-18 11:34:00,0,3.061922,-0.409609,-1.256763,1,3,...,-0.379713,-0.013935,0.210369,1.463409,-0.232069,3.126321,-0.047707,1.354882,-0.381595,-0.304994
4,17643026_29919541,17643026,2110-03-25 11:15:00,2110-03-29 17:17:00,0,-0.51383,-0.409609,0.536643,1,2,...,-0.379713,-0.013935,-0.455273,-0.608684,-0.059292,-0.418357,-0.047707,-0.30697,-0.381595,1.56141


In [30]:
# separate response and features
X_train = train_latest.drop(['id','subject_id','admittime','dischtime','readmitted_within_30days'], axis=1)
y_train = train_latest["readmitted_within_30days"]
X_test = valid_latest.drop(['id','subject_id','admittime','dischtime','readmitted_within_30days'], axis=1)
y_test = valid_latest["readmitted_within_30days"]
#from adjust_imb import adjust_imb
#X_train, y_train = adjust_imb(X_train, y_train)

In [40]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_accuracy_train = accuracy_score(y_train, lr.predict(X_train))
lr_auc_train = roc_auc_score(y_train, lr.predict_proba(X_train)[:, 1])
lr_accuracy_test = accuracy_score(y_test, lr.predict(X_test)) 
lr_auc_test = roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]) 
lr_accuracy_cv = cross_val_score(lr, X_train, y_train, cv=5)
lr_auc_cv = cross_val_score(lr, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LR Accuracy (train): {lr_accuracy_train:.4f}')
print(f'LR AUC (train): {lr_auc_train:.4f}')
print(f'LR Accuracy (cv): {lr_accuracy_cv.mean():.4f}')
print(f'LR AUC (cv): {lr_auc_cv.mean():.4f}')
print(f'LR Accuracy (test): {lr_accuracy_test:.4f}') 
print(f'LR AUC (test): {lr_auc_test:.4f}') 

LR Accuracy (train): 0.8321
LR AUC (train): 0.7246
LR Accuracy (cv): 0.8303
LR AUC (cv): 0.7099
LR Accuracy (test): 0.8262
LR AUC (test): 0.6970


In [41]:
# Linear discriminant analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_accuracy_train = accuracy_score(y_train, lda.predict(X_train))
lda_auc_train = roc_auc_score(y_train, lda.predict_proba(X_train)[:, 1])
lda_accuracy_test = accuracy_score(y_test, lda.predict(X_test))
lda_auc_test = roc_auc_score(y_test, lda.predict_proba(X_test)[:, 1])
lda_accuracy_cv = cross_val_score(lda, X_train, y_train, cv=5)
lda_auc_cv = cross_val_score(lda, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LDA Accuracy (train): {lda_accuracy_train:.4f}')
print(f'LDA AUC (train): {lda_auc_train:.4f}')
print(f'LDA Accuracy (cv): {lda_accuracy_cv.mean():.4f}')
print(f'LDA AUC (cv): {lda_auc_cv.mean():.4f}')
print(f'LDA Accuracy (test): {lda_accuracy_test:.4f}')
print(f'LDA AUC (test): {lda_auc_test:.4f}')

LDA Accuracy (train): 0.8301
LDA AUC (train): 0.7241
LDA Accuracy (cv): 0.8270
LDA AUC (cv): 0.7085
LDA Accuracy (test): 0.8245
LDA AUC (test): 0.6936


In [42]:
# Quadratic discriminant analysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_accuracy_train = accuracy_score(y_train, qda.predict(X_train))
qda_auc_train = roc_auc_score(y_train, qda.predict_proba(X_train)[:, 1])
qda_accuracy_test = accuracy_score(y_test, qda.predict(X_test))
qda_auc_test = roc_auc_score(y_test, qda.predict_proba(X_test)[:, 1])
qda_accuracy_cv = cross_val_score(qda, X_train, y_train, cv=5)
qda_auc_cv = cross_val_score(qda, X_train, y_train, cv=5, scoring='roc_auc')
print(f'QDA Accuracy (train): {qda_accuracy_train:.4f}')
print(f'QDA AUC (train): {qda_auc_train:.4f}')
print(f'QDA Accuracy (cv): {qda_accuracy_cv.mean():.4f}')
print(f'QDA AUC (cv): {qda_auc_cv.mean():.4f}')
print(f'QDA Accuracy (test): {qda_accuracy_test:.4f}')
print(f'QDA AUC (test): {qda_auc_test:.4f}')

QDA Accuracy (train): 0.7774
QDA AUC (train): 0.5963
QDA Accuracy (cv): 0.7610
QDA AUC (cv): 0.5887
QDA Accuracy (test): 0.7626
QDA AUC (test): 0.5634


In [43]:
# Gaussian naive bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_accuracy_train = accuracy_score(y_train, gnb.predict(X_train))
gnb_auc_train = roc_auc_score(y_train, gnb.predict_proba(X_train)[:, 1])
gnb_accuracy_test = accuracy_score(y_test, gnb.predict(X_test))
gnb_auc_test = roc_auc_score(y_test, gnb.predict_proba(X_test)[:, 1])
gnb_accuracy_cv = cross_val_score(gnb, X_train, y_train, cv=5)
gnb_auc_cv = cross_val_score(gnb, X_train, y_train, cv=5, scoring='roc_auc')
print(f'GNB Accuracy (train): {gnb_accuracy_train:.4f}')
print(f'GNB AUC (train): {gnb_auc_train:.4f}')
print(f'GNB Accuracy (cv): {gnb_accuracy_cv.mean():.4f}')
print(f'GNB AUC (cv): {gnb_auc_cv.mean():.4f}')
print(f'GNB Accuracy (test): {gnb_accuracy_test:.4f}')
print(f'GNB AUC (test): {gnb_auc_test:.4f}')

GNB Accuracy (train): 0.8139
GNB AUC (train): 0.6479
GNB Accuracy (cv): 0.8006
GNB AUC (cv): 0.6392
GNB Accuracy (test): 0.7974
GNB AUC (test): 0.6160


In [44]:
# K-nearest neighbors
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
knn_accuracy_train = accuracy_score(y_train, y_train_pred_class)
knn_auc_train = roc_auc_score(y_train, y_train_pred)
knn_accuracy_test = accuracy_score(y_test, y_test_pred_class)
knn_auc_test = roc_auc_score(y_test, y_test_pred)
knn_accuracy_cv = grid_search.best_score_
knn_auc_cv = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'KNN Best Parameters: {grid_search.best_params_}')
print(f'KNN Accuracy (train): {knn_accuracy_train:.4f}')
print(f'KNN AUC (train): {knn_auc_train:.4f}')
print(f'KNN Accuracy (cv): {knn_accuracy_cv:.4f}')
print(f'KNN AUC (cv): {knn_auc_cv.mean():.4f}')
print(f'KNN Accuracy (test): {knn_accuracy_test:.4f}')
print(f'KNN AUC (test): {knn_auc_test:.4f}')

KNN Best Parameters: {'n_neighbors': 11}
KNN Accuracy (train): 0.8387
KNN AUC (train): 0.8151
KNN Accuracy (cv): 0.8294
KNN AUC (cv): 0.6546
KNN Accuracy (test): 0.8172
KNN AUC (test): 0.6445


In [None]:
# Support vector machine
param_grid = {
    'C': [0.1, 1, 10],
    #'kernel': ['linear', 'rbf', 'poly']
    #'degree': [2, 3, 4]
}
svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
svm_accuracy_train = accuracy_score(y_train, y_train_pred_class)
svm_auc_train = roc_auc_score(y_train, y_train_pred)
svm_accuracy_test = accuracy_score(y_test, y_test_pred_class)
svm_auc_test = roc_auc_score(y_test, y_test_pred)
svm_accuracy_cv = cross_val_score(grid_search, X_train, y_train, cv=5)
svm_auc_cv = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'SVM Best Parameters: {grid_search.best_params_}')
print(f'SVM Accuracy (train): {svm_accuracy_train:.4f}')
print(f'SVM AUC (train): {svm_auc_train:.4f}')
print(f'SVM Accuracy (cv): {svm_accuracy_cv.mean():.4f}')
print(f'SVM AUC (cv): {svm_auc_cv.mean():.4f}')
print(f'SVM Accuracy (test): {svm_accuracy_test:.4f}')
print(f'SVM AUC (test): {svm_auc_test:.4f}')

In [None]:
# Decision Tree
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
dt = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
dt_accuracy_train = accuracy_score(y_train, y_train_pred_class)
dt_auc_train = roc_auc_score(y_train, y_train_pred)
dt_accuracy_test = accuracy_score(y_test, y_test_pred_class)
dt_auc_test = roc_auc_score(y_test, y_test_pred)
dt_accuracy_cv = cross_val_score(grid_search, X_train, y_train, cv=5)
dt_auc_cv = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'DT Best Parameters: {grid_search.best_params_}')
print(f'DT Accuracy (train): {dt_accuracy_train:.4f}')
print(f'DT AUC (train): {dt_auc_train:.4f}')
print(f'DT Accuracy (test): {dt_accuracy_test:.4f}') 
print(f'DT AUC (test): {dt_auc_test:.4f}')
print(f'DT Accuracy (cv): {dt_accuracy_cv.mean():.4f}')
print(f'DT AUC (cv): {dt_auc_cv.mean():.4f}')

In [None]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}
bagging = BaggingClassifier()
grid_search = GridSearchCV(estimator=bagging, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
bagging_accuracy_train = accuracy_score(y_train, y_train_pred_class)
bagging_auc_train = roc_auc_score(y_train, y_train_pred)
bagging_accuracy_test = accuracy_score(y_test, y_test_pred_class)
bagging_auc_test = roc_auc_score(y_test, y_test_pred)
bagging_accuracy_cv = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'Bagging Best Parameters: {grid_search.best_params_}')
print(f'Bagging Accuracy (train): {bagging_accuracy_train:.4f}')
print(f'Bagging AUC (train): {bagging_auc_train:.4f}')
print(f'Bagging Accuracy (cv): {bagging_accuracy_cv.mean():.4f}')
print(f'Bagging AUC (cv): {cross_val_score(grid_search, X_train, y_train, cv=5, scoring="roc_auc").mean():.4f}')
print(f'Bagging Accuracy (test): {bagging_accuracy_test:.4f}')
print(f'Bagging AUC (test): {bagging_auc_test:.4f}')

In [None]:
# Random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
print(f'RF Best Parameters: {grid_search.best_params_}')
print(f'RF Accuracy (train): {accuracy_score(y_train, y_train_pred_class):.4f}')
print(f'RF AUC (train): {roc_auc_score(y_train, y_train_pred):.4f}')
print(f'RF Accuracy (cv): {grid_search.best_score_:.4f}')
print(f'RF AUC (cv): {roc_auc_score(y_train, y_test_pred):.4f}')
print(f'RF Accuracy (test): {accuracy_score(y_test, y_test_pred_class):.4f}')
print(f'RF AUC (test): {roc_auc_score(y_test, y_test_pred):.4f}')

In [47]:
# Gradient Boosting
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [50, 100, 200]
}
gb = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring="roc_auc")
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_cv_pred = grid_search.predict_proba(X_train)[:, 1]
gb_accuracy_train = accuracy_score(y_train, grid_search.predict(X_train))
gb_accuracy_test = accuracy_score(y_test, grid_search.predict(X_test))
gb_accuracy_cv = grid_search.best_score_
gb_auc_train = roc_auc_score(y_train, y_train_pred)
gb_auc_test = roc_auc_score(y_test, y_test_pred)
gb_auc_cv = roc_auc_score(y_train, y_cv_pred)
print(f'GB Best Parameters: {grid_search.best_params_}')
print(f'GB Accuracy (train): {gb_accuracy_train:.4f}')
print(f'GB AUC (train): {gb_auc_train:.4f}')
print(f'GB Accuracy (cv): {gb_accuracy_cv:.4f}')
print(f'GB AUC (cv): {gb_auc_cv:.4f}')
print(f'GB Accuracy (test): {gb_accuracy_test:.4f}')
print(f'GB AUC (test): {gb_auc_test:.4f}')

GB Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
GB Accuracy (train): 0.8765
GB AUC (train): 0.8622
GB Accuracy (cv): 0.7574
GB AUC (cv): 0.8622
GB Accuracy (test): 0.8366
GB AUC (test): 0.7476


In [46]:
# AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}
ada = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict(X_train)
y_train_prob = grid_search.predict_proba(X_train)[:, 1]
ada_accuracy_train = accuracy_score(y_train, y_train_pred)
ada_auc_train = roc_auc_score(y_train, y_train_prob)
ada_accuracy_test = accuracy_score(y_test, grid_search.predict(X_test))
y_test_prob = grid_search.predict_proba(X_test)[:, 1]
ada_auc_test = roc_auc_score(y_test, y_test_prob)
ada_accuracy_cv = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'AdaBoost Accuracy (train): {ada_accuracy_train:.4f}')
print(f'AdaBoost AUC (train): {ada_auc_train:.4f}')
print(f'AdaBoost Accuracy (cv): {ada_accuracy_cv.mean():.4f}')
print(f'AdaBoost AUC (cv): {ada_auc_test:.4f}')
print(f'AdaBoost Accuracy (test): {ada_accuracy_test:.4f}')
print(f'AdaBoost AUC (test): {ada_auc_test:.4f}')

AdaBoost Accuracy (train): 0.8311
AdaBoost AUC (train): 0.7536
AdaBoost Accuracy (cv): 0.7335
AdaBoost AUC (cv): 0.7126
AdaBoost Accuracy (test): 0.8271
AdaBoost AUC (test): 0.7126


In [45]:
# XGBoost
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
}
xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_pred_train = grid_search.predict(X_train)
y_pred_proba_train = grid_search.predict_proba(X_train)[:,1]
xgb_auc_train = roc_auc_score(y_train, y_pred_proba_train)
xgb_accuracy_train = accuracy_score(y_train, y_pred_train)
y_pred_test = grid_search.predict(X_test)
y_pred_proba_test = grid_search.predict_proba(X_test)[:,1]
xgb_auc_test = roc_auc_score(y_test, y_pred_proba_test)
xgb_accuracy_test = accuracy_score(y_test, y_pred_test)
xgb_auc_cv = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
xgb_accuracy_cv = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'XGBoost Best Hyperparameters: {grid_search.best_params_}')
print(f'XGBoost Accuracy (train): {xgb_accuracy_train:.4f}')
print(f'XGBoost AUC (train): {xgb_auc_train:.4f}')
print(f'XGBoost Accuracy (cv): {xgb_accuracy_cv.mean():.4f}')
print(f'XGBoost AUC (cv): {xgb_auc_cv.mean():.4f}')
print(f'XGBoost Accuracy (test): {xgb_accuracy_test:.4f}')
print(f'XGBoost AUC (test): {xgb_auc_test:.4f}')

XGBoost Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
XGBoost Accuracy (train): 0.8724
XGBoost AUC (train): 0.8614
XGBoost Accuracy (cv): 0.7594
XGBoost AUC (cv): 0.7594
XGBoost Accuracy (test): 0.8288
XGBoost AUC (test): 0.7276
