In [1]:
# load library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import zipfile
import urllib.request
import io
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
warnings.filterwarnings('ignore')

In [2]:
# load raw and processed data
train_merged_df_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_merged_latest_feat.csv')
train_merged_df_median = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/train_merged_median_feat.csv')
test_merged_df_latest = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/test_merged_latest_feat.csv')
test_merged_df_median = pd.read_csv('https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/raw/main/data_processing/test_merged_median_feat.csv')

In [9]:
# separate response and features
X_train = train_merged_df_latest.iloc[:, 3:]
y_train = train_merged_df_latest["readmitted_within_30days"]

In [10]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_accuracy_train = accuracy_score(y_train, lr.predict(X_train))
lr_accuracy_test = cross_val_score(lr, X_train, y_train, cv=5)
print(f'LR Accuracy (train): {lr_accuracy_train:.4f}')
print(f'LR Accuracy (cv): {lr_accuracy_test.mean():.4f}')

LR Accuracy (train): 0.8515
LR Accuracy (cv): 0.8496


In [12]:
# Linear discriminant analysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
lda_accuracy_train = accuracy_score(y_train, lda.predict(X_train))
lda_accuracy_test = cross_val_score(lda, X_train, y_train, cv=5)
print(f'LDA Accuracy (train): {lda_accuracy_train:.4f}')
print(f'LDA Accuracy (cv): {lda_accuracy_test.mean():.4f}')

LDA Accuracy (train): 0.8515
LDA Accuracy (cv): 0.8463


In [14]:
# Quadratic discriminant analysis
from sklearn.model_selection import cross_val_score
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
qda_accuracy_train = accuracy_score(y_train, qda.predict(X_train))
qda_accuracy_test = cross_val_score(qda, X_train, y_train, cv=5)
print(f'QDA Accuracy (train): {qda_accuracy_train:.4f}')
print(f'QDA Accuracy (cv): {qda_accuracy_test.mean():.4f}')

QDA Accuracy (train): 0.1902
QDA Accuracy (cv): 0.1914


In [15]:
# Gaussian naive bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
gnb_accuracy_train = accuracy_score(y_train, gnb.predict(X_train))
gnb_accuracy_test = cross_val_score(gnb, X_train, y_train, cv=5)
print(f'GNB Accuracy (train): {gnb_accuracy_train:.4f}')
print(f'GNB Accuracy (cv): {gnb_accuracy_test.mean():.4f}')

GNB Accuracy (train): 0.1961
GNB Accuracy (cv): 0.1916


In [16]:
# K-nearest neighbors
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'KNN Best Parameters: {grid_search.best_params_}')
print(f'KNN Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'KNN Accuracy (cv): {grid_search.best_score_:.4f}')

KNN Best Parameters: {'n_neighbors': 3}


AttributeError: 'NoneType' object has no attribute 'split'

In [23]:
# Support vector machine
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    #'degree': [2, 3, 4]
}
svm = SVC()
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'SVM Best Parameters: {grid_search.best_params_}')
print(f'SVM Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'SVM Accuracy (cv): {grid_search.best_score_:.4f}')

In [17]:
# Decision Tree
param_grid = {
    'max_depth': [3, 5, 7, 9],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
dt = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'DT Best Parameters: {grid_search.best_params_}')
print(f'DT Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'DT Accuracy (cv): {grid_search.best_score_:.4f}')

DT Best Parameters: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 4}
DT Accuracy (train): 0.8611
DT Accuracy (cv): 0.8488


In [22]:
# Bagging
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_samples': [0.5, 0.7, 0.9],
    'max_features': [0.5, 0.7, 0.9]
}
grid_search = GridSearchCV(estimator=BaggingClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'Bagging Best Parameters: {grid_search.best_params_}')
print(f'Bagging Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}') 
print(f'Bagging Accuracy (cv): {grid_search.best_score_:.4f}')

Bagging Best Parameters: {'max_features': 0.9, 'max_samples': 0.9, 'n_estimators': 200}
Bagging Accuracy (train): 1.0000
Bagging Accuracy (cv): 0.8643


In [46]:
# Random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'RF Best Parameters: {grid_search.best_params_}')
print(f'RF Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}') 
print(f'RF Accuracy (cv): {grid_search.best_score_:.4f}')

Random Forests Training Accuracy: 0.9999
Random Forests Testing Accuracy: 0.8589


In [47]:
# Gradient boosting
param_grid = {
    'learning_rate': [0.1, 0.05, 0.01],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [50, 100, 200]
}
gb = GradientBoostingClassifier()
grid_search = GridSearchCV(estimator=gb, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'GB Best Parameters: {grid_search.best_params_}')
print(f'GB Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'GB Accuracy (cv): {grid_search.best_score_:.4f}')

Gradient Boosting Training Accuracy: 0.8712
Gradient Boosting Testing Accuracy: 0.8624


In [20]:
# AdaBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1]
}
ada = AdaBoostClassifier()
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'AdaBoost Best Parameters: {grid_search.best_params_}')
print(f'AdaBoost Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'AdaBoost Accuracy (cv): {grid_search.best_score_:.4f}')

AdaBoost Best Parameters: {'learning_rate': 1, 'n_estimators': 50}
AdaBoost Accuracy (train): 0.8556
AdaBoost Accuracy (cv): 0.8518


In [21]:
# XGBoost
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001],
    'n_estimators': [100, 500, 1000]
}
xgb = XGBClassifier()
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f'XGBoost Best Hyperparameters: {grid_search.best_params_}')
print(f'XGBoost Accuracy (train): {accuracy_score(y_train, grid_search.predict(X_train)):.4f}')
print(f'XGBoost Accuracy (cv): {grid_search.best_score_:.4f}')

XGBoost Best Hyperparameters: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 1000}
XGBoost Accuracy (train): 0.8932
XGBoost Accuracy (cv): 0.8662
