In [21]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier


import xgboost as xgb
from xgboost import XGBClassifier

import re

import sys
sys.path.append(os.path.abspath('..'))

from util import evaluate_model_performance, evaluate_model_fairness

In [2]:
data_path = Path(os.getcwd()).parent.parent / "data" / "dataset_diabetes"
df = pd.read_csv(data_path / "diabetic_preprocessed.csv")

In [4]:
df["age"] = df["age_all"]

columns_to_remove = ['encounter_id', 'patient_nbr', 'readmitted', 'readmit_binary', 'diabetes_type', \
    'had_emergency', 'had_inpatient_days', 'had_outpatient_days', 'race_all', 'age_all']

df_for_experimenting = df.drop(columns=columns_to_remove)

In [5]:
target_variable = "readmit_30_days"
Y= df_for_experimenting.loc[:, target_variable]
X = pd.get_dummies(df_for_experimenting.drop(columns=["readmit_30_days"]))

In [6]:
regex = re.compile(r"\[|\]|<", re.IGNORECASE)

X.columns = [regex.sub("_", col) if any(x in str(col) for x in set(('[', ']', '<'))) else col for col in X.columns.values]

In [7]:
random_seed = 445
np.random.seed(random_seed)

df_train, df_test, X_train, X_test, Y_train, Y_test = train_test_split(
    df_for_experimenting,
    X,
    Y,
    test_size=0.20,
    stratify=Y,
    random_state=random_seed
)

### XGBoosted tree for classification

In [10]:
xgb = XGBClassifier(objective='binary:hinge', silent=True, nthread=-1)

# A parameter grid for XGBoost
params = {
    'booster': ['gbtree', 'dart'],
    'learning_rate': [0.5, 0.3, 0.1, 0.05, 0.01, 0.001],
    'gamma': [0, 0.5, 1, 1.5, 2, 5],
    'max_depth': [4, 5, 6],
    'min_child_weight': [1, 5, 10],    
    'subsample': [0.5, 0.75, 1.0],
    'colsample_bytree': [0.5, 0.75, 1.0]    
}

random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=params,
    cv=StratifiedKFold(n_splits=3),
    n_jobs=-1,
    scoring='balanced_accuracy',
    verbose=3,
    n_iter=500
)

random_search.fit(X_train, Y_train)

Fitting 3 folds for each of 500 candidates, totalling 1500 fits
Parameters: { "silent" } are not used.



In [14]:
print('Best score:')
print(random_search.best_score_)
print('\n Best hyperparameters:')
print(random_search.best_params_)
results = pd.DataFrame(random_search.cv_results_)
results.to_csv('xgb-random-grid-search-results-01.csv', index=False)

Best score:
0.5299260038095884

 Best hyperparameters:
{'subsample': 0.75, 'min_child_weight': 10, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 1, 'colsample_bytree': 1.0, 'booster': 'gbtree'}


In [15]:
xgb_best = XGBClassifier(
    subsample=0.75,
    min_child_weight=10,
    max_depth=6,
    learning_rate=0.01,
    gamma=1,
    colsample_bytree=1.0,
    booster='gbtree',
    objective='binary:hinge',
    silent=True,
    nthread=-1
)

xgb_best.fit(X_train, Y_train)

Parameters: { "silent" } are not used.



In [16]:
Y_pred = xgb_best.predict(X_test)

In [17]:
evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The balanced accuracy score for the testing data: 0.5214280937795437
The precision score for the testing data: 0.3145539906103286
The recall score for the testing data: 0.05900484368119771
The F1 score for the testing data: 0.09936967000370782
The F2 score for the testing data: 0.0704521556256572
The G mean score for the testing data: 0.24093981562010705
The Demographic parity difference score for the testing data: 0.024678663239074552
The Equalized odds difference score for the testing data: 0.08974358974358974
The Equal opportunity difference score for the testing data: 0.05900484368119771


  _warn_prf(average, modifier, msg_start, len(result))


### Histogram-based Gradient Boosting Classification Tree

In [19]:
hist_gb = HistGradientBoostingClassifier()

hist_gb.fit(X_train, Y_train)
Y_pred = hist_gb.predict(X_test)

evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The balanced accuracy score for the testing data: 0.5034642321970693
The precision score for the testing data: 0.4222222222222222
The recall score for the testing data: 0.00836635843240863
The F1 score for the testing data: 0.016407599309153715
The F2 score for the testing data: 0.010406397195749807
The G mean score for the testing data: 0.0914020158174678
The Demographic parity difference score for the testing data: 0.0038560411311053984
The Equalized odds difference score for the testing data: 0.011086474501108648
The Equal opportunity difference score for the testing data: 0.00836635843240863


  _warn_prf(average, modifier, msg_start, len(result))


### Random forest

In [22]:
random_forest = RandomForestClassifier()

random_forest.fit(X_train, Y_train)
Y_pred = random_forest.predict(X_test)

evaluate_model_performance(Y_test, Y_pred)
evaluate_model_fairness(Y_test, Y_pred, df_test['race'])

The balanced accuracy score for the testing data: 0.50142951691621
The precision score for the testing data: 0.4
The recall score for the testing data: 0.0035226772346983706
The F1 score for the testing data: 0.006983849847228284
The F2 score for the testing data: 0.004393673110720562
The G mean score for the testing data: 0.05933244840045966
The Demographic parity difference score for the testing data: 0.0024125452352231603
The Equalized odds difference score for the testing data: 0.02564102564102564
The Equal opportunity difference score for the testing data: 0.0035226772346983706


  _warn_prf(average, modifier, msg_start, len(result))
