# Imports

In [1]:
import os
import base64
import joblib
import pandas as pd
import numpy as np
import category_encoders as ce
import json
import joblib
import pickle
import math
import requests
from copy import deepcopy
import seaborn as sns
from uuid import uuid4

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin

from pandas_profiling import ProfileReport

from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
plt.style.use('ggplot')
%matplotlib inline

# Custom transformer imports

In [2]:
# WIP preprocessor which assigns categories and compresses some code
from custom_transformers.preprocessor import ColumnConverter

# WIP ordinal category preprocessor
from custom_transformers.custom_ordinal_encoder import custom_oe

# Functions

In [3]:
def load_train_data():
    df = pd.read_csv(os.path.join("data", "train_data.csv"))
    return df

data_train = load_train_data()

In [4]:
def assign_index(df, indexcol: str):
    _df = df.copy()
    _df = _df.set_index(indexcol)
    return _df

In [5]:
def assign_categorical(df, column_list: []):
    _df = df.copy()
    for column in column_list:
        _df[column] = _df[column].astype('category')
    return _df

In [6]:
def encode_unordered_categorical(df, column_list: []):
    _df = df.copy()
    ohe = ce.OneHotEncoder(verbose=1,
                           cols=column_list,
                           handle_missing="indicator",
                           use_cat_names=True)
    _df = ohe.fit_transform(_df)
    return _df

In [7]:
def binarymap(df):
    _df = df.copy()
    _df = _df.assign(has_prosthesis = _df['has_prosthesis'].map({True: 1, False: 0}),
                     blood_transfusion = _df['blood_transfusion'].map({True: 1, False: 0}),
                     diuretics = _df['diuretics'].map({'Yes': 1, 'No': 0}),
                     insulin = _df['insulin'].map({'Yes': 1, 'No': 0}),
                     change = _df['change'].map({'Ch': 1, 'No': 0}),
                     diabetesMed = _df['diabetesMed'].map({'Yes': 1, 'No': 0}))
    return _df

In [8]:
def process_ordered_cats(df):
    _df = df.copy()
    # some string operations
    _df['age'] = _df['age'].str.lstrip("\[").str.rstrip(")")
    _df['weight'] = _df['weight'].str.lstrip("\[").str.rstrip(")")
    _df['max_glu_serum'] = _df['max_glu_serum'].str.lower()
    _df['A1Cresult'] = _df['A1Cresult'].str.lower()
    
    #making it categorical
    _df = assign_categorical(_df, ['age', 'weight', 'max_glu_serum', 'A1Cresult'])
    
    #setting categories and replacing missing or unexpected values with 'unknown'
    ordered_age = ['unknown',
                   '0-10',
                   '10-20',
                   '20-30',
                   '30-40',
                   '40-50',
                   '50-60',
                   '60-70',
                   '70-80',
                   '80-90',
                   '90-100']
    _df = _df.assign(age=_df['age'].cat.set_categories(ordered_age, ordered=True))
    _df.loc[~(_df['age'].isin(ordered_age)), 'age'] = 'unknown'
    
    ordered_weight = ['unknown',
                      '0-25',
                      '25-50',
                      '50-75',
                      '75-100',
                      '100-125',
                      '125-150',
                      '150-175',
                      '175-200',
                      '>200']
    _df = _df.assign(weight=_df['weight'].cat.set_categories(ordered_weight, ordered=True))
    _df.loc[~(_df['weight'].isin(ordered_weight)), 'weight'] = 'unknown'
    
    ordered_max_glu = ['unknown',
                       'norm',
                       '>200',
                       '>300']
    _df = _df.assign(max_glu_serum=_df['max_glu_serum'].cat.set_categories(ordered_max_glu, ordered=True))
    _df.loc[~(_df['max_glu_serum'].isin(ordered_max_glu)), 'max_glu_serum'] = 'unknown'

    ordered_A1C = ['unknown',
                   'norm',
                   '>7',
                   '>8']
    _df = _df.assign(A1Cresult=_df['A1Cresult'].cat.set_categories(ordered_A1C, ordered=True))
    _df.loc[~(_df['A1Cresult'].isin(ordered_A1C)), 'A1Cresult'] = 'unknown'
    
    return _df


In [9]:
def build_target(df, target: str):
    _df = df.copy()
    _df[target] = np.where(_df[target]== 'Yes', True, False)
    return _df

# Custom Transformers and pipeline objects

In [10]:
class DroppingColumns(BaseEstimator, TransformerMixin):
    def __init__(self, cols=[]):
        self.cols = cols
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = X.drop(self.cols,axis=1)
        return X

In [11]:
class SelectColumns(BaseEstimator, TransformerMixin):
    def __init__(self, cols=[]):
        self.cols = cols
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = X[self.cols]
        return X

In [12]:
class BinaryProcessor(BaseEstimator, TransformerMixin):
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = binarymap(X)
        return X

In [13]:
class AssignCategorical(BaseEstimator, TransformerMixin):
    def __init__(self, cols=[]):
        self.cols = cols
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = assign_categorical(X, column_list=self.cols)
        return X

In [14]:
class ordinal_cat_prep(BaseEstimator, TransformerMixin):
      
    def fit(self, X=None, y=None, **fit_params):
        return self
    def transform(self, data):
        X = data.copy()
        X = process_ordered_cats(X)
        
        orde = ce.OrdinalEncoder(verbose=1,
                        cols=['age',
                               'weight',
                               'max_glu_serum',
                               'A1Cresult',
                               'complete_vaccination_status'],
                         handle_unknown='value',
                         handle_missing='value',
                         mapping = [{'col': 'age', 'mapping': {'0-10':1,
                                                               '10-20':2,
                                                               '20-30':3,
                                                               '30-40':4,
                                                               '40-50':5,
                                                               '50-60':6,
                                                               '60-70':7,
                                                               '70-80':8,
                                                               '80-90':9,
                                                               '90-100':10}},
        
                                    {'col': 'weight', 'mapping': {'0-25':1,
                                                                  '25-50':2, 
                                                                  '50-75':3,
                                                                  '75-100':4,
                                                                  '100-125':5, 
                                                                  '125-150':6, 
                                                                  '150-175':7, 
                                                                  '175-200':8, 
                                                                  '>200':9}},
                                    
                                    {'col': 'max_glu_serum', 'mapping': {'norm':1,
                                                                         '>200':2,
                                                                         '>300':3,}},
                                    
                                    {'col': 'A1Cresult', 'mapping': {'norm':1,
                                                                         '>7':2,
                                                                         '>8':3,}},
                                   
                                    {'col': 'complete_vaccination_status', 'mapping': {'Complete':1,
                                                                                       'Incomplete':0}}])
                                          
        return orde.fit_transform(X)

In [None]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Simple baseline

In [15]:
target = 'readmitted'
index_col = 'admission_id'

In [16]:
data_train = load_train_data()
data_train = assign_index(data_train, index_col)
data_train = build_target(data_train, target)

In [17]:
#List of all features (n=numerical, uc=unordered actegory, oc=ordered category, b=boolean)

all_features = ['patient_id', #identifier
                'race', #uc
                'gender', #uc
                'age', #oc
                'weight', #oc               
                'admission_type_code', #uc
                'discharge_disposition_code', #uc
                'admission_source_code', #uc
                'time_in_hospital', #n
                'payer_code', #uc
                'medical_specialty', #uc
                'has_prosthesis', #b
                'complete_vaccination_status', #uc
                'num_lab_procedures', #n
                'num_procedures', #n
                'num_medications', #n
                'number_outpatient', #n
                'number_emergency', #n
                'number_inpatient', #n
                'diag_1', #uc
                'diag_2', #uc
                'diag_3', #uc
                'number_diagnoses', #n
                'blood_type', #uc
                'hemoglobin_level', #n
                'blood_transfusion', #b
                'max_glu_serum', #oc
                'A1Cresult', #oc
                'diuretics', #b
                'insulin', #b
                'change', #b
                'diabetesMed'] #b]

num_features = ['time_in_hospital', 
                'num_lab_procedures',
                'num_procedures',
                'num_medications',
                'number_outpatient',
                'number_emergency',
                'number_inpatient',
                'number_diagnoses',
                'hemoglobin_level']

bool_features = ['has_prosthesis',
                 'blood_transfusion',
                 'diuretics',
                 'insulin',
                 'change',
                 'diabetesMed']

cat_features = ['race',
                'gender',
                'admission_type_code', 
                'discharge_disposition_code',
                'admission_source_code', 
                'payer_code',
                'medical_specialty',
                'diag_1',
                'diag_2',
                'diag_3',
                'blood_type']

ord_cat_features = ['age',
                    'weight',
                    'max_glu_serum',
                    'A1Cresult',
                    'complete_vaccination_status',]

# missing features: date of admission and date of release

In [18]:
# Convert target to boolean
df_train, df_test = train_test_split(data_train, test_size=0.2, random_state=42, stratify=data_train[target])
X_train = df_train.drop(target, axis=1)
y_train = df_train[target]
X_test = df_test.drop(target, axis=1)
y_test = df_test[target]

In [19]:
selected_features = ['time_in_hospital', 
                     'num_lab_procedures',
                     'num_procedures',
                     'num_medications',
                     'number_outpatient',
                     'number_emergency',
                     'number_inpatient',
                     'number_diagnoses',
                     'hemoglobin_level',
                     
                     'has_prosthesis',
                     'blood_transfusion',
                     'diuretics',
                     'insulin',
                     'change',
                     'diabetesMed',
                     
                     'race',
                     'gender',
                     'admission_type_code', 
                     'discharge_disposition_code',
                     'admission_source_code', 
                     'payer_code',
                     'medical_specialty',
                     'complete_vaccination_status',
                     'blood_type',
                     
                     'max_glu_serum',
                     'A1Cresult',
                     'age',
                     'weight']

In [20]:
num_feats = [feat for feat in selected_features if feat in num_features]
cat_feats = [feat for feat in selected_features if feat in cat_features+bool_features+ord_cat_features]
bool_feats = [feat for feat in selected_features if feat in bool_features]
ord_cat_feats = [feat for feat in selected_features if feat in ord_cat_features]

# Pipeline 1 - just numerical features

In [21]:
num_features = ['time_in_hospital', 
                'num_lab_procedures',
                'num_procedures',
                'num_medications',
                'number_outpatient',
                'number_emergency',
                'number_inpatient',
                'number_diagnoses',
                'hemoglobin_level']

In [22]:
pipeline = make_pipeline(
    ColumnConverter(),
    SelectColumns(cols=num_feats),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(max_depth=10, class_weight="balanced", random_state=42, n_jobs=-1),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.23879680785758134
0.4288864388092613
0.16546150574223734


# Pipeline 2 - adding the binary features

In [23]:
bool_features = ['has_prosthesis',
                 'blood_transfusion',
                 'diuretics',
                 'insulin',
                 'change',
                 'diabetesMed']

In [24]:
pipeline = make_pipeline(
    ColumnConverter(),
    SelectColumns(cols=num_feats+bool_feats),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(max_depth=10, class_weight="balanced", random_state=42, n_jobs=-1),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.24461091576211585
0.4410143329658214
0.16924053310767928


# Pipeline 3 - adding ordered categorical variables

1. added age, weight, max_glu_serum, A1Cresult, and also encoded vaccination status here
2. also did some playing around with the model options for fun

In [25]:
ord_cat_features = ['age',
                    'weight',
                    'max_glu_serum',
                    'A1Cresult',
                    'complete_vaccination_status',]

In [26]:
pipeline = make_pipeline(
    ColumnConverter(),
    custom_oe(),
    SelectColumns(cols=num_feats+bool_feats+ord_cat_feats),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(max_depth=10, class_weight="balanced", random_state=42, n_jobs=-1),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.2468948803392911
0.4492833517089305
0.1702172096908939


In [27]:
pipeline = make_pipeline(
    ColumnConverter(),
    custom_oe(),
    SelectColumns(cols=num_feats+bool_feats+ord_cat_feats),
    SimpleImputer(strategy="median"),
    GradientBoostingClassifier(learning_rate=.1,
                               n_estimators=100,
                               subsample=0.2,
                               n_iter_no_change=10,
                               max_features = None,
                               random_state=42)
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.01953336950623983
0.009922822491730982
0.6206896551724138


In [28]:
pipeline = make_pipeline(
    ColumnConverter(),
    custom_oe(),
    SelectColumns(cols=num_feats+bool_feats+ord_cat_feats),
    SimpleImputer(strategy="median"),
    GridSearchCV(DecisionTreeClassifier(random_state=42),
                                      param_grid = {'max_depth': range(1, 10),
                                                    'max_features': ['auto', 'sqrt', 'log2'],
                                                    #'max_features': range(1, X.shape[1]),
                                                    'criterion': ['entropy', 'gini']},
                                      cv=5,
                                      scoring="f1", #or should it be 'accuracy'
                                      return_train_score=True))
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

0.0193756727664155
0.009922822491730982
0.4090909090909091


# Adding categorical variables

In [52]:
cat_features = ['race', #done
                'gender', #done
                'admission_type_code', #done 
                'discharge_disposition_code', #done
                #'admission_source_code', 
                'payer_code', #done (now the insurance column)
                'medical_specialty', #done (any specialty over 100 - lets see how bad it gets)
                #'diag_1',
                #'diag_2',
                #'diag_3',
                'blood_type'] #dropping blood type

1. Will drop blood type - too many inconsistencies and not feasible to guess
1. Does admission type and code have a role?
1. Race, gender can be encoded
1. Payer code 'SP' is an indicator of uninsured patients - what to do with '?' should we make 3 categories?
1. 

Processing Gender

In [53]:
def processgender(df):
    _df = df.copy()
    _df['gender'] = _df['gender'].str.lower()
    valid_genders = ['male', 'female']
    _df.loc[~(_df['gender'].isin(valid_genders)), 'age'] = 'unknown'
    return _df

Processing Race

In [31]:
def processrace(df):
    _df = df.copy()
    _df['race'] = _df['race'].str.lower().str.lstrip().str[0:3]
    black = ['afr', 'bla']
    white = ['cau', 'whi', 'eur']
    hispanic = ['his', 'lat']
    asian = ['asi']
    race_options = black+white+hispanic+asian
    _df.loc[~(_df['race'].isin(race_options)), 'race'] = 'unknown/other'
    _df.loc[(_df['race'].isin(black)), 'race'] = 'black'
    _df.loc[(_df['race'].isin(white)), 'race'] = 'white'
    _df.loc[(_df['race'].isin(hispanic)), 'race'] = 'hispanic'
    _df.loc[(_df['race'].isin(asian)), 'race'] = 'asian'
    return _df

Process insurance status and payer code

In [32]:
def insurancestatus(df):
    _df = testdf.copy()
    _df.payer_code = _df.payer_code.where(_df.payer_code != '?')
    _df.payer_code = _df.payer_code.fillna(value='unknown')
    _df.loc[(_df['payer_code'] == 'SP'), 'payer_code'] == 'uninsured'
    _df.loc[~(_df['payer_code'].isin(['SP', 'unknown'])), 'payer_code'] = 'insured'
    return _df

In [33]:
testdf = X_train.copy()
testdf.payer_code.value_counts()

?     25983
MC    20620
HM     3937
SP     3212
BC     2959
MD     2278
CP     1647
UN     1555
CM     1235
OG      622
PO      391
DM      353
CH      101
WC       83
OT       62
MP       57
SI       33
FR        1
Name: payer_code, dtype: int64

In [34]:
testdf.payer_code=testdf.payer_code.where(testdf.payer_code != '?')

In [35]:
testdf.payer_code.isnull().sum()

25983

In [36]:
df = insurancestatus(testdf)

Processing Admission Type Code

In [37]:
def processadmissiontype(df):
    _df = df.copy()
    _df.loc[(_df['admission_type_code'].isin([5, 6, 8])), 'admission_type_code'] = 'n/a'
    _df.loc[(_df['admission_type_code'] == 1), 'admission_type_code'] = 'emergency'
    _df.loc[(_df['admission_type_code'] == 2), 'admission_type_code'] = 'urgent'
    _df.loc[(_df['admission_type_code'] == 3), 'admission_type_code'] = 'elective'
    _df.loc[(_df['admission_type_code'] == 4), 'admission_type_code'] = 'newborn'
    _df.loc[(_df['admission_type_code'] == 7), 'admission_type_code'] = 'trauma'
    return _df

Processing Admission Source Code

Processing Discharge Disposition Code

In [38]:
def processdischargecode(df):
    _df = df.copy()

    home = [1]
    left_ama = [7]
    hospice = [13, 14]
    transferred = [2, 3, 4, 5, 9, 10, 15, 22, 23, 24, 27, 28, 29]
    died = [11, 19, 20, 21]
    for_outpatient_services = [12, 16, 17]
    home_services = [6, 8]
    all_cats = home+left_ama+hospice+transferred+died+for_outpatient_services+home_services

    _df.loc[~(_df['discharge_disposition_code'].isin(all_cats)), 'discharge_disposition_code'] = 'unknown'
    _df.loc[(_df['discharge_disposition_code'].isin(home)), 'discharge_disposition_code'] = 'discharged_home'
    _df.loc[(_df['discharge_disposition_code'].isin(left_ama)), 'discharge_disposition_code'] = 'left_ama'
    _df.loc[(_df['discharge_disposition_code'].isin(hospice)), 'discharge_disposition_code'] = 'discharged_hospice'
    _df.loc[(_df['discharge_disposition_code'].isin(transferred)), 'discharge_disposition_code'] = 'transferred_inpatient'
    _df.loc[(_df['discharge_disposition_code'].isin(died)), 'discharge_disposition_code'] = 'expired'
    _df.loc[(_df['discharge_disposition_code'].isin(for_outpatient_services)), 'discharge_disposition_code'] = 'transferred_outpatient'
    _df.loc[(_df['discharge_disposition_code'].isin(home_services)), 'discharge_disposition_code'] = 'home_care'
    
    return _df

Process specializations

In [39]:
testdf = X_train.copy()[cat_features]

In [40]:
def processspecializations(df):
    _df = df.copy()
    _df['medical_specialty'] = _df['medical_specialty'].str.lower()
    counts = pd.DataFrame(_df.medical_specialty.value_counts())
    df_mask = counts['medical_specialty']>=100
    selected_specialties = counts[df_mask]
    selected_specialties = selected_specialties.index.to_list()
    _df.loc[~(_df['medical_specialty'].isin(selected_specialties)), 'medical_specialty'] = 'other'
    _df.loc[(_df['medical_specialty'] == '?'), 'medical_specialty'] = 'unknown'
    return _df

In [41]:
_df=processspecializations(testdf)

In [42]:
_df.medical_specialty.value_counts()

unknown                              31970
internalmedicine                      9381
family/generalpractice                4820
emergency/trauma                      4779
cardiology                            3409
surgery-general                       1979
nephrology                            1031
other                                  961
orthopedics                            860
orthopedics-reconstructive             777
radiologist                            714
pulmonology                            569
psychiatry                             537
urology                                452
surgery-cardiovascular/thoracic        422
obstetricsandgynecology                420
gastroenterology                       371
surgery-vascular                       364
surgery-neuro                          304
physicalmedicineandrehabilitation      244
oncology                               220
pediatrics                             168
neurology                              143
hematology/

# A quick test

In [61]:
testdf = X_train.copy()
cc = ColumnConverter()
coe = custom_oe()
testdf = cc.fit_transform(testdf)
testdf = coe.fit_transform(testdf)

In [62]:
caten = ce.OneHotEncoder(verbose=1, cols=cat_features, handle_missing="indicator", use_cat_names=True)
testdf = caten.fit_transform(testdf)

In [63]:
testdf.head()

Unnamed: 0_level_0,patient_id,race_white,race_black,race_unknown/other,race_hispanic,race_asian,race_nan,gender_male,gender_female,gender_unknown,...,blood_type_O-,blood_type_nan,hemoglobin_level,blood_transfusion,max_glu_serum,A1Cresult,diuretics,insulin,change,diabetesMed
admission_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6400,120523950,1,0,0,0,0,0,1,0,0,...,0,0,15.4,0,-1.0,3.0,0,1,1,1
4000,46605996,0,1,0,0,0,0,0,1,0,...,0,0,13.8,0,-1.0,-1.0,0,1,1,1
39818,7574652,0,1,0,0,0,0,0,1,0,...,0,0,13.6,0,-1.0,-1.0,0,1,0,1
23698,72967194,1,0,0,0,0,0,0,1,0,...,0,0,14.1,0,-1.0,-1.0,0,0,0,1
71883,206791254,1,0,0,0,0,0,0,1,0,...,0,0,13.2,0,-1.0,-1.0,0,0,0,1


In [55]:
pipeline = make_pipeline(
    ColumnConverter(),
    custom_oe(),
    ce.OneHotEncoder(verbose=1,   ### This needs to be a column transformer
                     cols=cat_features,
                     handle_missing="indicator",
                     use_cat_names=True),
    SelectColumns(cols=num_feats+bool_feats+ord_cat_feats+cat_features),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(max_depth=10, class_weight="balanced", random_state=42, n_jobs=-1),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# To use on other cells
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(precision_score(y_test, y_pred))

KeyError: "['race', 'blood_type', 'payer_code', 'admission_type_code', 'medical_specialty', 'gender', 'discharge_disposition_code'] not in index"

# Output files (pickling)

In [None]:
TMP_DIR = '/tmp'
with open(os.path.join(TMP_DIR, "columns.json"), 'w') as fh:
    json.dump(X_train.columns.tolist(), fh)

with open(os.path.join(TMP_DIR, "dtypes.pickle"), 'wb') as fh:
    pickle.dump(X_train.dtypes, fh)
    
joblib.dump(pipeline, os.path.join(TMP_DIR, 'pipeline.pickle')) 