In [1]:
import sys
import os

sys.path.append("..")

In [10]:
import pickle
import json

import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import XGBClassifier

from utils import evaluate_model

In [12]:
data = pd.read_csv("../data/raw/Chapter_3_Diabetes_data.csv", low_memory=False)

In [13]:
data["race"] = data["race"].fillna("Other")
data["medical_specialty"] = data["medical_specialty"].fillna("NA")
data["payer_code"] = data["payer_code"].fillna("NA")

age_mapping = {
    "[0-10)": 10,
    "[10-20)": 20,
    "[20-30)": 30,
    "[30-40)": 40,
    "[40-50)": 50,
    "[50-60)": 60,
    "[60-70)": 70,
    "[70-80)": 80,
    "[80-90)": 90,
    "[90-100)": 100
}
data["age"] = data["age"].replace(age_mapping)

categorical = [
    "encounter_id",
    "patient_nbr",
    "admission_type_id",
    "discharge_disposition_id",
    "admission_source_id"
]
for variable in categorical:
    data[variable] = pd.Categorical(data[variable])

In [14]:
data.drop_duplicates(inplace=True)

In [15]:
rows_per_patient = data.groupby('patient_nbr')["encounter_id"].count()
data = data.merge(rows_per_patient, on='patient_nbr', suffixes=("", "_count"))
data["running_count"] = data.groupby("patient_nbr").cumcount()

data["target"] = data["readmitted"] == "<30"

In [16]:
data.set_index("encounter_id", inplace=True)

In [17]:
not_train_columns = [
           "patient_nbr",
           "payer_code",
           "medical_specialty",
           "weight",
           "diag_1",
           "diag_2",
           "diag_3",
           "change",
           "examide",
           "citoglipton",
           "diabetesMed",
           "readmitted",
           "running_count",
           "encounter_id_count",
           "target"]


data = data.sort_values(by="encounter_id")

In [18]:
x_train, x_test, y_train, y_test = train_test_split(
    data.drop(not_train_columns, axis=1),
    data["target"],
    shuffle=False,
    test_size=0.20)

# categorical_features = np.where(x_train.dtypes != "int64")[0]

cat = [t != "int64" for t in x_train.dtypes]
num = [t == "int64" for t in x_train.dtypes]

cat_names = x_train.columns[cat]
num_names = x_train.columns[num]


transformer = ColumnTransformer(
     [("num", StandardScaler(), num),
      ("cat",  OneHotEncoder(handle_unknown="ignore"), cat)],
)

x_train = transformer.fit_transform(x_train)
x_test = transformer.transform(x_test)

cat_names = transformer.transformers_[1][1].get_feature_names(cat_names)

all_feature_names = list(num_names)
all_feature_names.extend(cat_names)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  res = transformer.transform(X)


In [19]:
model = XGBClassifier()

In [20]:
params1 = {
    'n_estimators': [10, 100, 500, 1000]
}

gsearch1 = GridSearchCV(estimator=model,
                        param_grid=params1,
                        scoring='roc_auc',
                        n_jobs=4,
                        iid=False,
                        cv=5,
                        verbose=1)

gsearch1.fit(x_train, y_train)
gsearch1.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed: 20.4min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
       fit_params=None, iid=False, n_jobs=4,
       param_grid={'n_estimators': [10, 100, 500, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [None]:
model = XGBClassifier(
n_estimators=100)

In [23]:
params2 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2)
}

gsearch2 = GridSearchCV(estimator=model,
                        param_grid=params2,
                        scoring='roc_auc',
                        n_jobs=4,
                        iid=False,
                        cv=5,
                        verbose=1)

gsearch2.fit(x_train, y_train)
gsearch2.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed: 10.0min
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed: 15.4min finished


({'max_depth': 5, 'min_child_weight': 3}, 0.6705384614097077)

In [None]:
model = XGBClassifier(
    max_depth=5,
    n_estimators=100,
    min_child_weight=3)

In [24]:
params3 = {
    'subsample':[i/10.0 for i in range(6, 10)],
    'colsample_bytree':[i/10.0 for i in range(6, 10)]
}

gsearch3 = GridSearchCV(estimator=model,
                        param_grid=params3,
                        scoring='roc_auc',
                        n_jobs=4,
                        iid=False,
                        cv=5,
                        verbose=1)

gsearch3.fit(x_train, y_train)
gsearch3.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.9min
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed: 10.2min finished


({'colsample_bytree': 0.7, 'subsample': 0.7}, 0.6705384614097077)

In [31]:
params4 =  {
 'subsample':[i/100.0 for i in range(58, 73, 5)],
 'colsample_bytree':[i/100.0 for i in range(58, 73, 5)]
}

gsearch4 = GridSearchCV(estimator=model,
                        param_grid=params4,
                        scoring='roc_auc',
                        n_jobs=4,
                        iid=False,
                        cv=5,
                        verbose=1)

gsearch4.fit(x_train, y_train)
gsearch4.best_params_, gsearch1.best_score_

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed: 12.6min finished


({'colsample_bytree': 0.68, 'subsample': 0.63}, 0.6705384614097077)