In [139]:
import pandas as pd
import numpy as np
import pickle

import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, average_precision_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

from imblearn.over_sampling import RandomOverSampler

%matplotlib inline

In [20]:
df = pd.read_csv('./dataset_diabetes/diabetic_data.csv')
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [22]:
df.describe()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,165201600.0,54330400.0,2.024006,3.715642,5.754437,4.395987,43.095641,1.33973,16.021844,0.369357,0.197836,0.635566,7.422607
std,102640300.0,38696360.0,1.445403,5.280166,4.064081,2.985108,19.674362,1.705807,8.127566,1.267265,0.930472,1.262863,1.9336
min,12522.0,135.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
25%,84961190.0,23413220.0,1.0,1.0,1.0,2.0,31.0,0.0,10.0,0.0,0.0,0.0,6.0
50%,152389000.0,45505140.0,1.0,1.0,7.0,4.0,44.0,1.0,15.0,0.0,0.0,0.0,8.0
75%,230270900.0,87545950.0,3.0,4.0,7.0,6.0,57.0,2.0,20.0,0.0,0.0,1.0,9.0
max,443867200.0,189502600.0,8.0,28.0,25.0,14.0,132.0,6.0,81.0,42.0,76.0,21.0,16.0


In [23]:
df['readmitted'].value_counts()

NO     54864
>30    35545
<30    11357
Name: readmitted, dtype: int64

In [24]:
df['admission_type_id'].value_counts()

1    53990
3    18869
2    18480
6     5291
5     4785
8      320
7       21
4       10
Name: admission_type_id, dtype: int64

In [25]:
hospice_or_expirations = [11,13,14,19,20,21]

mask = df['discharge_disposition_id'].isin(hospice_or_expirations)
df = df[~mask]
print(len(df))

99343


In [26]:
df['target'] = (df['readmitted'] == '<30').astype('int')

In [27]:
df = df.replace('?', np.nan)

In [28]:
numerical_columns = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications',
'number_outpatient', 'number_emergency', 'number_inpatient','number_diagnoses']

In [72]:
df[numerical_columns].isnull().sum()
numerical_df = df[numerical_columns]

In [30]:
categorical_columns = ['age', 'race', 'gender', 
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'insulin',
    'glyburide-metformin', 'glipizide-metformin',
    'glimepiride-pioglitazone', 'metformin-rosiglitazone',
    'metformin-pioglitazone', 'change', 'diabetesMed','payer_code', 'medical_specialty',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id']

In [31]:
df[categorical_columns].isnull().sum()

age                             0
race                         2234
gender                          0
max_glu_serum                   0
A1Cresult                       0
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride                     0
acetohexamide                   0
glipizide                       0
glyburide                       0
tolbutamide                     0
pioglitazone                    0
rosiglitazone                   0
acarbose                        0
miglitol                        0
troglitazone                    0
tolazamide                      0
insulin                         0
glyburide-metformin             0
glipizide-metformin             0
glimepiride-pioglitazone        0
metformin-rosiglitazone         0
metformin-pioglitazone          0
change                          0
diabetesMed                     0
payer_code                  39398
medical_specia

In [32]:
df['race'] = df['race'].fillna('UNK')
df['payer_code'] = df['payer_code'].fillna('UNK')
df['medical_specialty'] = df['medical_specialty'].fillna('UNK')

In [33]:
top_specialties = df['medical_specialty'].value_counts().index[:9]

In [34]:
df['specialty'] = df['medical_specialty'].copy()

specialty_mask = df['specialty'].isin(top_specialties)

df['specialty'][~specialty_mask] = 'Other'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['specialty'][~specialty_mask] = 'Other'


In [69]:
encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
all_categorical = categorical_columns + ['specialty']
categorical_values = encoder.fit_transform(df[all_categorical])
encoded_columns = encoder.get_feature_names(all_categorical)

categorical_df = pd.DataFrame(categorical_values, columns=encoded_columns).reset_index(drop=True)
categorical_df.head()

Unnamed: 0,age_[0-10),age_[10-20),age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100),...,specialty_Cardiology,specialty_Emergency/Trauma,specialty_Family/GeneralPractice,specialty_InternalMedicine,specialty_Nephrology,specialty_Orthopedics,specialty_Orthopedics-Reconstructive,specialty_Other,specialty_Surgery-General,specialty_UNK
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [74]:
scaler = MinMaxScaler()
numerical_df = numerical_df.reset_index(drop=True)
numerical_scaled = scaler.fit_transform(numerical_df)
numerical_df = pd.DataFrame(numerical_scaled, columns=numerical_columns)
numerical_df

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,0.000000,0.305344,0.000000,0.0000,0.000000,0.0,0.000000,0.000000
1,0.153846,0.442748,0.000000,0.2125,0.000000,0.0,0.000000,0.533333
2,0.076923,0.076336,0.833333,0.1500,0.047619,0.0,0.047619,0.333333
3,0.076923,0.328244,0.166667,0.1875,0.000000,0.0,0.000000,0.400000
4,0.000000,0.381679,0.000000,0.0875,0.000000,0.0,0.000000,0.266667
...,...,...,...,...,...,...,...,...
99338,0.153846,0.381679,0.000000,0.1875,0.000000,0.0,0.000000,0.533333
99339,0.307692,0.244275,0.500000,0.2125,0.000000,0.0,0.047619,0.533333
99340,0.000000,0.396947,0.000000,0.1000,0.023810,0.0,0.000000,0.800000
99341,0.692308,0.335878,0.333333,0.2500,0.000000,0.0,0.047619,0.533333


In [76]:
all_features_df = pd.concat([numerical_df, categorical_df], axis=1)
all_features_df

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,age_[0-10),age_[10-20),...,specialty_Cardiology,specialty_Emergency/Trauma,specialty_Family/GeneralPractice,specialty_InternalMedicine,specialty_Nephrology,specialty_Orthopedics,specialty_Orthopedics-Reconstructive,specialty_Other,specialty_Surgery-General,specialty_UNK
0,0.000000,0.305344,0.000000,0.0000,0.000000,0.0,0.000000,0.000000,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.153846,0.442748,0.000000,0.2125,0.000000,0.0,0.000000,0.533333,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.076923,0.076336,0.833333,0.1500,0.047619,0.0,0.047619,0.333333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.076923,0.328244,0.166667,0.1875,0.000000,0.0,0.000000,0.400000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.000000,0.381679,0.000000,0.0875,0.000000,0.0,0.000000,0.266667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99338,0.153846,0.381679,0.000000,0.1875,0.000000,0.0,0.000000,0.533333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99339,0.307692,0.244275,0.500000,0.2125,0.000000,0.0,0.047619,0.533333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99340,0.000000,0.396947,0.000000,0.1000,0.023810,0.0,0.000000,0.800000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
99341,0.692308,0.335878,0.333333,0.2500,0.000000,0.0,0.047619,0.533333,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [77]:
target = df['target'].reset_index(drop=True)

In [111]:
pca = PCA(0.9)
X_pca = pca.fit_transform(all_features_df)

In [112]:
ros = RandomOverSampler(random_state=42)
X, y = ros.fit_resample(X_pca, target)

In [113]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train, random_state=42)

sum(y_train)/len(y_train)

0.5

In [133]:
def evaluate(model, X_test, y_test, threshold=0.5):
    predictions = model.predict_proba(X_test)
    accuracy    = accuracy_score(y_test, predictions[:,1] >= threshold)
    roc_auc     = roc_auc_score(y_test, predictions[:,1])
    precision   = precision_score(y_test, predictions[:,1] >= threshold)
    recall      = recall_score(y_test, predictions[:,1] >= threshold)
    pr_auc      = average_precision_score(y_test, predictions[:,1])
    
    result = pd.DataFrame([[accuracy, precision, recall, roc_auc, pr_auc]], columns=['Accuracy', 'Precision', 'Recall', 'ROC_auc','PR_auc'])
    return result

def run_experiment(X, y, model_class, num_iterations=100, **kwargs):
    results = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'ROC_auc','PR_auc'])
    for i in range(num_iterations):
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=i)
        
        train_y = train_x.pop('ICU')
        test_y  = test_x.pop('ICU')
        
        # Train Model
        model = model_class(**kwargs)
        model.fit(train_x, train_y)
         
        # Evaluate results
        current_result = evaluate(model, test_x, test_y)
        results = results.append(current_result)
        
    return results.reset_index(drop=True)

In [117]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [118]:
evaluate(knn, X_test, y_test)

Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.757401,0.697459,0.909165,0.845185,0.788028


In [128]:
params = {
    'max_depth': 6, # try 7 (2^7), traing until error_train diverges error_test (error_test will be higher when overfit)
    'n_estimators': 500, # try 50, [500], 1000, 5000, higher = likely to overtrain
    'eta': 0.3,
    "objective": "binary:logistic",
    'nthread': 4
}

xgb = XGBClassifier(**params)

xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.3, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.900034,0.861555,0.953242,0.967154,0.96454


In [132]:
evaluate(xgb, X_val, y_val)

Unnamed: 0,Accuracy,Precision,Recall,ROC_auc,PR_auc
0,0.902699,0.866347,0.95232,0.966033,0.962415


In [136]:
parameters = {
    'max_depth': [5, 6], # try 7 (2^7), traing until error_train diverges error_test (error_test will be higher when overfit)
    'n_estimators': [50, 500, 1000], # try 50, [500], 1000, 5000, higher = likely to overtrain
    'eta': [0.3, 0.4, 0.5],
    "objective": ["binary:logistic"],
    'nthread': [4]
}

grid_xgb = GridSearchCV(XGBClassifier(), parameters)
grid_xgb.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, validate_parameter

In [137]:
print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n",grid_xgb.best_estimator_)
print("\n The best score across ALL searched params:\n",grid_xgb.best_score_)
print("\n The best parameters across ALL searched params:\n",grid_xgb.best_params_)

 Results from Grid Search 

 The best estimator across ALL searched params:
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.4, gamma=0,
              gpu_id=-1, importance_type='gain', interaction_constraints='',
              learning_rate=0.400000006, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=4, nthread=4, num_parallel_tree=1,
              random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

 The best score across ALL searched params:
 0.9320522666064186

 The best parameters across ALL searched params:
 {'eta': 0.4, 'max_depth': 6, 'n_estimators': 1000, 'nthread': 4, 'objective': 'binary:logistic'}


In [138]:
xgb = grid_xgb.best_estimator_

In [140]:
pickle.dump(xgb, open('diabetes-readmission.pickle', 'wb'))