In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier


import xgboost as xgb
from xgboost import XGBClassifier

import re

import sys
sys.path.append(os.path.abspath('..'))

from util import evaluate_model_performance, evaluate_model_fairness

In [2]:
data_path = Path(os.getcwd()).parent.parent / "data" / "dataset_diabetes"
df = pd.read_csv(data_path / "diabetic_preprocessed.csv")

In [3]:
df["age"] = df["age_all"]

columns_to_remove = ['encounter_id', 'patient_nbr', 'readmitted', 'readmit_binary', 'diabetes_type', \
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'race_all', 'age_all']

df_for_experimenting = df.drop(columns=columns_to_remove)

In [4]:
target_variable = "readmit_30_days"
Y= df_for_experimenting.loc[:, target_variable]
X = pd.get_dummies(df_for_experimenting.drop(columns=["readmit_30_days"]))

In [5]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]

In [6]:
random_seed = 445
np.random.seed(random_seed)

df_train, df_test, X_train, X_test, Y_train, Y_test = train_test_split(
    df_for_experimenting,
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=random_seed
)

### XGBoosted tree for classification

In [7]:
xgb = XGBClassifier(objective='binary:hinge', silent=True, nthread=-1)

# A parameter grid for XGBoost
params = {
    'booster': ['gbtree', 'dart'],
    'learning_rate': [0.5, 0.3, 0.1, 0.05, 0.01, 0.001],
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'max_depth': [4, 5, 6],
    'min_child_weight': [1, 5, 10],    
    'subsample': [0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1.0]    
}

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    cv=StratifiedKFold(n_splits=3),
    n_jobs=-1,
    scoring='balanced_accuracy',
    verbose=3,
    n_iter=500
)

random_search.fit(X_train, Y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
Parameters: { "silent" } are not used.



In [8]:
print('Best score:')
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

Best score:
0.5299260038095884

 Best hyperparameters:
{'subsample': 0.75, 'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0, 'booster': 'gbtree'}


In [9]:
xgb_best = XGBClassifier(
    subsample=0.75,
    min_child_weight=10,
    max_depth=6,
    learning_rate=0.01,
    gamma=1,
    colsample_bytree=1.0,
    booster='gbtree',
    objective='binary:hinge',
    silent=True,
    nthread=-1
)

xgb_best.fit(X_train, Y_train)

Parameters: { "silent" } are not used.



In [10]:
Y_pred = xgb_best.predict(X_test)

In [11]:
evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The accuracy score for the testing data: 0.8806564142878199
The precision score for the testing data: 0.3145539906103286
The recall score for the testing data: 0.05900484368119771
The F1 score for the testing data: 0.09936967000370782
The F2 score for the testing data: 0.0704521556256572
Specificity score for the testing data: 0.9838513438778896
The balanced accuracy score for the testing data: 0.5214280937795437
The G mean score for the testing data: 0.24093981562010705
[[17790   292]
 [ 2137   134]]
The Demographic parity difference score for the testing data: 0.024678663239074552
The Equalized odds difference score for the testing data: 0.08974358974358974
The Equal opportunity difference score for the testing data: 0.05900484368119771


  _warn_prf(average, modifier, msg_start, len(result))


### Histogram-based Gradient Boosting Classification Tree

In [12]:
hist_gb = HistGradientBoostingClassifier()

hist_gb.fit(X_train, Y_train)
Y_pred = hist_gb.predict(X_test)

evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The accuracy score for the testing data: 0.8880754679899769
The precision score for the testing data: 0.4222222222222222
The recall score for the testing data: 0.00836635843240863
The F1 score for the testing data: 0.016407599309153715
The F2 score for the testing data: 0.010406397195749807
Specificity score for the testing data: 0.9985621059617299
The balanced accuracy score for the testing data: 0.5034642321970693
The G mean score for the testing data: 0.0914020158174678
[[18056    26]
 [ 2252    19]]
The Demographic parity difference score for the testing data: 0.0038560411311053984
The Equalized odds difference score for the testing data: 0.011086474501108648
The Equal opportunity difference score for the testing data: 0.00836635843240863


  _warn_prf(average, modifier, msg_start, len(result))


### Random forest

In [13]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The accuracy score for the testing data: 0.8882228664079006
The precision score for the testing data: 0.4
The recall score for the testing data: 0.0035226772346983706
The F1 score for the testing data: 0.006983849847228284
The F2 score for the testing data: 0.004393673110720562
Specificity score for the testing data: 0.9993363565977215
The balanced accuracy score for the testing data: 0.50142951691621
The G mean score for the testing data: 0.05933244840045966
[[18070    12]
 [ 2263     8]]
The Demographic parity difference score for the testing data: 0.0024125452352231603
The Equalized odds difference score for the testing data: 0.02564102564102564
The Equal opportunity difference score for the testing data: 0.0035226772346983706


  _warn_prf(average, modifier, msg_start, len(result))


Experimentation with feature selection

In [14]:
from sklearn.feature_selection import RFECV

selector = RFECV(random_forest, step=1, cv=5)
selector = selector.fit(X_train, Y_train)
selector.cv_results_

{'mean_test_score': array([0.88831839, 0.88557917, 0.86886132, 0.86076649, 0.86356713,
        0.87027392, 0.87454858, 0.87779143, 0.88152561, 0.88342955,
        0.88507554, 0.88614421, 0.88664783, 0.88731114, 0.88776563,
        0.88790075, 0.88780248, 0.88831839, 0.88829382, 0.88834296,
        0.88837981, 0.88839209, 0.88850264, 0.88851492, 0.88853949,
        0.88850264, 0.88867461, 0.88853949, 0.88851492, 0.88837981,
        0.88862548, 0.88853949, 0.88856406, 0.88849036, 0.88847807,
        0.88853949, 0.88857634, 0.88857634, 0.88839209, 0.88844122,
        0.88849036, 0.88845351, 0.88845351, 0.88842894, 0.88852721,
        0.88842894, 0.88844122, 0.88844122, 0.88860091, 0.88862548,
        0.88856406, 0.88846579, 0.88850264, 0.88851492, 0.88858863,
        0.88850264, 0.88856406, 0.88855177, 0.88868689, 0.88844122,
        0.88850264, 0.88849036, 0.88851492, 0.88857634, 0.88850264,
        0.88865004, 0.88852721, 0.88861319, 0.88852721, 0.88861319,
        0.88861319, 0.8886254

In [15]:
X_train_selected_features = selector.transform(X_train)
X_test_selected_features = selector.transform(X_test)

In [16]:
random_forest.fit(X_train_selected_features, Y_train)
Y_pred = random_forest.predict(X_test_selected_features)

evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The accuracy score for the testing data: 0.888615928855697
The precision score for the testing data: 0.5714285714285714
The recall score for the testing data: 0.007045354469396741
The F1 score for the testing data: 0.013919095258808177
The F2 score for the testing data: 0.008779631255487268
Specificity score for the testing data: 0.9993363565977215
The balanced accuracy score for the testing data: 0.5031908555335591
The G mean score for the testing data: 0.08390875321673191
[[18070    12]
 [ 2255    16]]
The Demographic parity difference score for the testing data: 0.0024125452352231603
The Equalized odds difference score for the testing data: 0.02564102564102564
The Equal opportunity difference score for the testing data: 0.007045354469396741


  _warn_prf(average, modifier, msg_start, len(result))
