In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from colorama import init as init_colorama
from colorama import Fore

In [2]:
init_colorama()

def color_text(text, color):
    return color + text + Fore.RESET

In [3]:
# test
def df_has_only_numeric_columns(df):
    assert df.apply(lambda s: pd.to_numeric(s, errors='coerce').notnull().all()).all()

In [4]:
def preprocess_data():
    df_tr = pd.read_csv("../data/raw/training_v2.csv")

    df_train = df_tr.copy()

    # drop readmission_status as it has all 0
    df_train.drop('readmission_status', axis=1, inplace=True)
    df_train.drop('encounter_id', axis=1, inplace=True)
    df_train.drop('patient_id', axis=1, inplace=True)
    df_train.drop('hospital_id', axis=1, inplace=True)
    df_train.drop('icu_id', axis=1, inplace=True)

    # 25 genders are nan => replace them with 'M'
    df_train['gender'] = df_train['gender'].fillna(df_train['gender'].value_counts().index[0])
    df_train['gender'] = df_train['gender'].map({'M': 0, 'F': 1})
    
    df_train['ethnicity'] = df_train['ethnicity'].fillna("Other/Unknown")
        
    df_train['hospital_admit_source'] = df_train['hospital_admit_source'].fillna("Other")

    df_train['icu_stay_type'] = df_train['icu_stay_type'].fillna(df_train['icu_stay_type'].value_counts().index[0])
    df_train['icu_stay_type'] = df_train['icu_stay_type'].map({'admit': 0, 'transfer': 1})
    
    # remove outliers
    df_train = df_train[df_train["pre_icu_los_days"] < 88]
    df_train["pre_icu_los_days"] = np.maximum(0, df_train["pre_icu_los_days"])
    
    und_diag = {"Undefined diagnoses": "Undefined Diagnoses"}
    df_train["apache_2_bodysystem"].replace(und_diag, inplace = True)

    cat_columns = ["ethnicity", "hospital_admit_source",
                   "icu_admit_source", "icu_type",
                   "apache_3j_bodysystem", "apache_2_bodysystem"]

    df_train = pd.get_dummies(df_train, columns=cat_columns)

    df_train.fillna(df_train.mean(), inplace=True)


    df_has_only_numeric_columns(df_train)

    return df_train

In [5]:
def evaluate(y_true, scores, preds):
    acc = accuracy_score(y_true, preds)
    precision = precision_score(y_true, preds)
    recall = recall_score(y_true, preds)
    f1 = f1_score(y_true, preds)
    roc_auc = roc_auc_score(y_true, scores)
    auprc = average_precision_score(y_true, scores)
    
    print(color_text("Accuracy:  ", color=Fore.GREEN) +
          color_text("{:.3f}".format(acc), color=Fore.RED))

    print(color_text("Precision: ", color=Fore.GREEN) +
          color_text("{:.3f}".format(precision), color=Fore.RED))

    print(color_text("Recall:    ", color=Fore.GREEN) +
          color_text("{:.3f}".format(recall), color=Fore.RED))

    print(color_text("F1-score:  ", color=Fore.GREEN) +
          color_text("{:.3f}".format(f1), color=Fore.RED))

    print(color_text("ROCAUC:    ", color=Fore.GREEN) +
          color_text("{:.3f}".format(roc_auc), color=Fore.RED))
    
    print(color_text("AUPRC:     ", color=Fore.GREEN) +
          color_text("{:.3f}".format(auprc), color=Fore.RED))

In [6]:
clean_df = preprocess_data()

y = clean_df.hospital_death
X = clean_df.drop(columns="hospital_death")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [7]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [8]:
y_true = y_test.values

y_score = rf_clf.predict_proba(X_test)
y_pred = rf_clf.predict(X_test)

evaluate(y_true, y_score[:, 1], y_pred)

Accuracy:  0.930
Precision: 0.746
Recall:    0.291
F1-score:  0.419
ROCAUC:    0.885
AUPRC:     0.544


In [9]:
#Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),
                                sampling_strategy='auto',
                                max_samples=0.4, # 0.4, 0.5
                                n_estimators=100, # try
                                bootstrap=False, # try
                                replacement=False,
                                random_state=0)


#Train the classifier.
bbc.fit(X_train, y_train)

BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                                class_weight=None,
                                                                criterion='gini',
                                                                max_depth=None,
                                                                max_features=None,
                                                                max_leaf_nodes=None,
                                                                min_impurity_decrease=0.0,
                                                                min_impurity_split=None,
                                                                min_samples_leaf=1,
                                                                min_samples_split=2,
                                                                min_weight_fraction_leaf=0.0,
                                                                p

In [10]:
# 0.4
y_score_bbc = bbc.predict_proba(X_test)
y_pred_bbc = bbc.predict(X_test)

evaluate(y_true, y_score_bbc[:, 1], y_pred_bbc)

Accuracy:  0.799
Precision: 0.275
Recall:    0.814
F1-score:  0.411
ROCAUC:    0.885
AUPRC:     0.459


In [11]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [12]:
y_score_gb = gb_clf.predict_proba(X_test)
y_pred_gb = gb_clf.predict(X_test)

evaluate(y_true, y_score_gb[:, 1], y_pred_gb)

Accuracy:  0.931
Precision: 0.702
Recall:    0.343
F1-score:  0.461
ROCAUC:    0.895
AUPRC:     0.563
