In [1]:
# load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import zipfile
import urllib.request
import io
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
warnings.filterwarnings('ignore')

In [2]:
# load raw and processed data
train_merged_df_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_merged_latest_feat.csv')
train_merged_df_median = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_merged_median_feat.csv')
train_smote_df_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_smote_latest_feat.csv')
train_smote_df_median = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_smote_median_feat.csv')

In [6]:
# separate response and features
X_train = train_merged_df_latest.drop(['id','subject_id','readmitted_within_30days'], axis=1)
y_train = train_merged_df_latest["readmitted_within_30days"]

In [16]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_accuracy_train = accuracy_score(y_train, lr.predict(X_train))
lr_auc_train = roc_auc_score(y_train, lr.predict_proba(X_train)[:, 1])
lr_accuracy_test = cross_val_score(lr, X_train, y_train, cv=5)
lr_auc_test = cross_val_score(lr, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LR Accuracy (train): {lr_accuracy_train:.4f}')
print(f'LR AUC (train): {lr_auc_train:.4f}')
print(f'LR Accuracy (cv): {lr_accuracy_test.mean():.4f}')
print(f'LR AUC (cv): {lr_auc_test.mean():.4f}')

LR Accuracy (train): 0.7558
LR AUC (train): 0.8337
LR Accuracy (cv): 0.7448
LR AUC (cv): 0.8196


In [17]:
# Linear discriminant analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_accuracy_train = accuracy_score(y_train, lda.predict(X_train))
lda_auc_train = roc_auc_score(y_train, lda.predict_proba(X_train)[:, 1])
lda_accuracy_test = cross_val_score(lda, X_train, y_train, cv=5)
lda_auc_test = cross_val_score(lda, X_train, y_train, cv=5, scoring='roc_auc')
print(f'LDA Accuracy (train): {lda_accuracy_train:.4f}')
print(f'LDA AUC (train): {lda_auc_train:.4f}')
print(f'LDA Accuracy (cv): {lda_accuracy_test.mean():.4f}')
print(f'LDA AUC (cv): {lda_auc_test.mean():.4f}')

LDA Accuracy (train): 0.7508
LDA AUC (train): 0.8318
LDA Accuracy (cv): 0.7419
LDA AUC (cv): 0.8183


In [18]:
# Quadratic discriminant analysis
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_accuracy_train = accuracy_score(y_train, qda.predict(X_train))
qda_auc_train = roc_auc_score(y_train, qda.predict_proba(X_train)[:, 1])
qda_accuracy_test = cross_val_score(qda, X_train, y_train, cv=5)
qda_auc_test = cross_val_score(qda, X_train, y_train, cv=5, scoring='roc_auc')
print(f'QDA Accuracy (train): {qda_accuracy_train:.4f}')
print(f'QDA AUC (train): {qda_auc_train:.4f}')
print(f'QDA Accuracy (cv): {qda_accuracy_test.mean():.4f}')
print(f'QDA AUC (cv): {qda_auc_test.mean():.4f}')

QDA Accuracy (train): 0.5159
QDA AUC (train): 0.5160
QDA Accuracy (cv): 0.5156
QDA AUC (cv): 0.6820


In [19]:
# Gaussian naive bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_accuracy_train = accuracy_score(y_train, gnb.predict(X_train))
gnb_auc_train = roc_auc_score(y_train, gnb.predict_proba(X_train)[:, 1])
gnb_accuracy_test = cross_val_score(gnb, X_train, y_train, cv=5)
gnb_auc_test = cross_val_score(gnb, X_train, y_train, cv=5, scoring='roc_auc')
print(f'GNB Accuracy (train): {gnb_accuracy_train:.4f}')
print(f'GNB AUC (train): {gnb_auc_train:.4f}')
print(f'GNB Accuracy (cv): {gnb_accuracy_test.mean():.4f}')
print(f'GNB AUC (cv): {gnb_auc_test.mean():.4f}')

GNB Accuracy (train): 0.5236
GNB AUC (train): 0.5244
GNB Accuracy (cv): 0.5183
GNB AUC (cv): 0.5192


In [16]:
# K-nearest neighbors
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
knn_accuracy_train = accuracy_score(y_train, y_train_pred_class)
knn_auc_train = roc_auc_score(y_train, y_train_pred)
knn_accuracy_test = grid_search.best_score_
knn_auc_test = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'KNN Best Parameters: {grid_search.best_params_}')
print(f'KNN Accuracy (train): {knn_accuracy_train:.4f}')
print(f'KNN AUC (train): {knn_auc_train:.4f}')
print(f'KNN Accuracy (cv): {knn_accuracy_test:.4f}')
print(f'KNN AUC (cv): {knn_auc_test.mean():.4f}')

KNN Best Parameters: {'n_neighbors': 3}


AttributeError: 'NoneType' object has no attribute 'split'

In [4]:
# Support vector machine
param_grid = {
    'C': [0.1, 1, 10],
    #'kernel': ['linear', 'rbf', 'poly']
    #'degree': [2, 3, 4]
}
svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
svm_accuracy_train = accuracy_score(y_train, y_train_pred_class)
svm_auc_train = roc_auc_score(y_train, y_train_pred)
svm_accuracy_test = cross_val_score(grid_search, X_train, y_train, cv=5)
svm_auc_test = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'SVM Best Parameters: {grid_search.best_params_}')
print(f'SVM Accuracy (train): {svm_accuracy_train:.4f}')
print(f'SVM AUC (train): {svm_auc_train:.4f}')
print(f'SVM Accuracy (cv): {svm_accuracy_test.mean():.4f}')
print(f'SVM AUC (cv): {svm_auc_test.mean():.4f}')

SVM Best Parameters: {'C': 10}
SVM Accuracy (train): 0.8994
SVM Accuracy (cv): 0.8474


In [14]:
# Decision Tree
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
dt = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
dt_accuracy_train = accuracy_score(y_train, y_train_pred_class)
dt_auc_train = roc_auc_score(y_train, y_train_pred)
dt_accuracy_test = cross_val_score(grid_search, X_train, y_train, cv=5)
dt_auc_test = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
print(f'DT Best Parameters: {grid_search.best_params_}')
print(f'DT Accuracy (train): {dt_accuracy_train:.4f}')
print(f'DT AUC (train): {dt_auc_train:.4f}')
print(f'DT Accuracy (cv): {dt_accuracy_test.mean():.4f}')
print(f'DT AUC (cv): {dt_auc_test.mean():.4f}')

DT Best Parameters: {'max_depth': 9, 'min_samples_leaf': 4, 'min_samples_split': 4}
DT Accuracy (train): 0.8627
DT AUC (train): 0.9262
DT Accuracy (cv): 0.9012
DT AUC (cv): 0.9014


In [22]:
# Bagging
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}
bagging = BaggingClassifier()
grid_search = GridSearchCV(estimator=bagging, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
bagging_accuracy_train = accuracy_score(y_train, y_train_pred_class)
bagging_auc_train = roc_auc_score(y_train, y_train_pred)
bagging_accuracy_test = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'Bagging Best Parameters: {grid_search.best_params_}')
print(f'Bagging Accuracy (train): {bagging_accuracy_train:.4f}')
print(f'Bagging AUC (train): {bagging_auc_train:.4f}')
print(f'Bagging Accuracy (cv): {bagging_accuracy_test.mean():.4f}')
print(f'Bagging AUC (cv): {cross_val_score(grid_search, X_train, y_train, cv=5, scoring="roc_auc").mean():.4f}')

Bagging Best Parameters: {'max_features': 0.9, 'max_samples': 0.9, 'n_estimators': 200}
Bagging Accuracy (train): 1.0000
Bagging Accuracy (cv): 0.8643


In [46]:
# Random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_train_pred_class = grid_search.predict(X_train)
y_test_pred = grid_search.predict_proba(X_test)[:, 1]
y_test_pred_class = grid_search.predict(X_test)
print(f'RF Best Parameters: {grid_search.best_params_}')
print(f'RF Accuracy (train): {accuracy_score(y_train, y_train_pred_class):.4f}')
print(f'RF AUC (train): {roc_auc_score(y_train, y_train_pred):.4f}')
print(f'RF Accuracy (cv): {grid_search.best_score_:.4f}')
print(f'RF AUC (cv): {roc_auc_score(y_train, y_test_pred):.4f}')


Random Forests Training Accuracy: 0.9999
Random Forests Testing Accuracy: 0.8589


In [21]:
# Gradient boosting
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [50, 100, 200]
}
gb = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5, scoring="roc_auc")
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict_proba(X_train)[:, 1]
y_cv_pred = grid_search.predict_proba(X_train)[:, 1]
gb_accuracy_train = accuracy_score(y_train, grid_search.predict(X_train))
gb_accuracy_cv = grid_search.best_score_
gb_auc_train = roc_auc_score(y_train, y_train_pred)
gb_auc_cv = roc_auc_score(y_train, y_cv_pred)
print(f'GB Best Parameters: {grid_search.best_params_}')
print(f'GB Accuracy (train): {gb_accuracy_train:.4f}')
print(f'GB AUC (train): {gb_auc_train:.4f}')
print(f'GB Accuracy (cv): {gb_accuracy_cv:.4f}')
print(f'GB AUC (cv): {gb_auc_cv:.4f}')

GB Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 200}
GB Accuracy (train): 0.9431
GB AUC (train): 0.9826
GB Accuracy (cv): 0.9435
GB AUC (cv): 0.9826


In [23]:
# AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}
ada = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_train_pred = grid_search.predict(X_train)
y_train_prob = grid_search.predict_proba(X_train)[:, 1]
ada_accuracy_train = accuracy_score(y_train, y_train_pred)
ada_auc_train = roc_auc_score(y_train, y_train_prob)
ada_accuracy_test = cross_val_score(grid_search, X_train, y_train, cv=5)
y_test_prob = grid_search.predict_proba(X_train)[:, 1]
ada_auc_test = roc_auc_score(y_train, y_test_prob)
print(f'AdaBoost Accuracy (train): {ada_accuracy_train:.4f}')
print(f'AdaBoost AUC (train): {ada_auc_train:.4f}')
print(f'AdaBoost Accuracy (cv): {ada_accuracy_test.mean():.4f}')
print(f'AdaBoost AUC (cv): {ada_auc_test:.4f}')

AdaBoost Accuracy (train): 0.9013
AdaBoost AUC (train): 0.9494
AdaBoost Accuracy (cv): 0.9310
AdaBoost AUC (cv): 0.9494


In [24]:
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
}
xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, scoring='roc_auc')
grid_search.fit(X_train, y_train)
y_pred_train = grid_search.predict(X_train)
y_pred_proba_train = grid_search.predict_proba(X_train)[:,1]
xgb_auc_train = roc_auc_score(y_train, y_pred_proba_train)
xgb_accuracy_train = accuracy_score(y_train, y_pred_train)
xgb_auc_test = cross_val_score(grid_search, X_train, y_train, cv=5, scoring='roc_auc')
xgb_accuracy_test = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'XGBoost Best Hyperparameters: {grid_search.best_params_}')
print(f'XGBoost Accuracy (train): {xgb_accuracy_train:.4f}')
print(f'XGBoost AUC (train): {xgb_auc_train:.4f}')
print(f'XGBoost Accuracy (cv): {xgb_accuracy_test.mean():.4f}')
print(f'XGBoost AUC (cv): {xgb_auc_test.mean():.4f}')

KeyboardInterrupt: 