# Best Model Submission - Extra Trees
Please check out README.md in our GitHub repo for detailed descriptions: https://github.com/zxyao5148/STAT3612_2023_1A_GroupProject/blob/main/README.md

In [1]:
## load libraries
#!pip install scikit-optimize
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
import zipfile
import urllib.request
import io
import warnings
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, KFold 
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
from skopt import BayesSearchCV, plots
from skopt.space import Real, Categorical, Integer
warnings.filterwarnings('ignore')

In [2]:
## load processed datasets
# Each split (train / test) is stored as five CSV files, one per aggregation
# suffix listed below. The first file of a split is kept whole; the leading
# columns of the other four are dropped before the frames are concatenated
# side by side (presumably those leading columns repeat the identifier/label
# columns of the first file -- confirm against the data_processing outputs).
_AGGREGATION_SUFFIXES = ('mean_mode', 'latest', 'entropy_std', 'min', 'max')
_N_LEADING_COLS_TRAIN = 13  # leading columns stripped from the train files
_N_LEADING_COLS_TEST = 12   # leading columns stripped from the test files


def _load_aggregations(path_prefix, n_leading_cols):
    """Read the five aggregation CSVs that share ``path_prefix``.

    Parameters
    ----------
    path_prefix : str
        File path up to, but excluding, the ``_<suffix>.csv`` tail.
    n_leading_cols : int
        Number of leading columns removed from every file except the first.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``_AGGREGATION_SUFFIXES``, in that order.

    Raises
    ------
    ValueError
        If the files do not all contain the same number of rows.
    """
    frames = []
    for position, suffix in enumerate(_AGGREGATION_SUFFIXES):
        frame = pd.read_csv(f'{path_prefix}_{suffix}.csv')
        if position > 0:
            # Only the first file keeps its leading columns.
            frame = frame.drop(frame.columns[0:n_leading_cols], axis=1)
        frames.append(frame)

    # pd.concat(axis=1) aligns on the index, so frames of different length
    # would be padded with NaN rows instead of failing; fail loudly here.
    row_counts = {suffix: len(frame) for suffix, frame in zip(_AGGREGATION_SUFFIXES, frames)}
    if len(set(row_counts.values())) != 1:
        raise ValueError(f'Row counts differ across {path_prefix}_*.csv: {row_counts}')
    return frames


# train set
# NOTE: `train_median` / `test_median` hold the mean/mode files; the variable
# names are kept unchanged so any other cell that uses them still works.
train_median, train_latest, train_dispers, train_min, train_max = _load_aggregations(
    '../data_processing/train/train_valid', _N_LEADING_COLS_TRAIN
)
train = pd.concat([train_median, train_latest, train_dispers, train_min, train_max], axis=1)

# test set
test_median, test_latest, test_dispers, test_min, test_max = _load_aggregations(
    '../data_processing/test/test', _N_LEADING_COLS_TEST
)
test = pd.concat([test_median, test_latest, test_dispers, test_min, test_max], axis=1)

In [3]:
# separate response and features
# The response is the "readmitted_within_30days" column of the train frame.
y_train = train["readmitted_within_30days"]

# Features are whatever remains once the leading columns are dropped by label:
# the first 13 columns of train and the first 12 columns of test.
X_train = train.drop(columns=train.columns[:13])
X_test = test.drop(columns=test.columns[:12])

In [4]:
# Extra Trees model (best performance in Kaggle submissions)
# Hyperparameter tuning via Bayesian optimization: every hyperparameter is
# searched over an inclusive integer range.
params = dict(
    n_estimators=Integer(1, 2000),
    min_samples_split=Integer(2, 20),
    min_samples_leaf=Integer(1, 20),
    max_depth=Integer(1, 2000),
)

# Base estimator; class_weight="balanced" and a fixed seed are set up front.
extra_trees = ExtraTreesClassifier(class_weight="balanced", random_state=42)

# 50 search iterations, 5-fold cross-validation, scored by ROC AUC, using all
# available cores.
opt = BayesSearchCV(
    estimator=extra_trees,
    search_spaces=params,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42,
)
opt.fit(X_train, y_train)

# Keep the winning configuration and its cross-validated score for reporting.
best_params_extra_trees, best_auc_extra_trees = opt.best_params_, opt.best_score_

print("Best params:", best_params_extra_trees)
print("Best AUC: %.4f" % best_auc_extra_trees)

Best params: OrderedDict([('max_depth', 1135), ('min_samples_leaf', 2), ('min_samples_split', 2), ('n_estimators', 1614)])
Best AUC: 0.8017


In [None]:
# make predictions
# predict_proba returns one column per class; column 1 is taken here as the
# probability of the positive class.
y_pred = opt.predict_proba(X_test)[:, 1]

# Build the submission table: the test ids next to their predicted probability.
submission_columns = {
    'id': test['id'],
    'readmitted_within_30days': y_pred,
}
df = pd.DataFrame(submission_columns)
df.to_csv('./prediction.csv', index=False)