# Imports

In [1]:
import os
import base64
import joblib
import pandas as pd
import numpy as np
import category_encoders as ce
import json
import joblib
import pickle
import math
import requests
from copy import deepcopy
import seaborn as sns
from uuid import uuid4

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import cross_val_score
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

from sklearn.base import BaseEstimator, TransformerMixin

from pandas_profiling import ProfileReport

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
plt.style.use('ggplot')
%matplotlib inline

# Functions

In [2]:
def load_train_data():
    """Read ``data/train_data.csv`` (relative to the working directory).

    Returns
    -------
    pandas.DataFrame
        The CSV contents exactly as parsed by ``pd.read_csv`` defaults.
    """
    csv_path = os.path.join("data", "train_data.csv")
    return pd.read_csv(csv_path)

# Load the training CSV into a module-level DataFrame.
# NOTE(review): the same variable is loaded again in the "Simple baseline" section.
data_train = load_train_data()

In [3]:
def assign_index(df, indexcol: str):
    """Return a copy of ``df`` whose index is the column ``indexcol``.

    The input frame is left untouched; ``indexcol`` is removed from the
    columns of the returned frame because it becomes the index.
    """
    return df.copy().set_index(indexcol)

In [4]:
def assign_categorical(df, column_list: list):
    """Return a copy of ``df`` with the given columns cast to ``category``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column_list : list
        Labels of the columns to convert. An empty list returns a plain copy.

    Returns
    -------
    pandas.DataFrame
        New frame in which every column named in ``column_list`` has the
        pandas ``category`` dtype; all other columns keep their dtype.
    """
    # One astype call with a per-column dtype mapping replaces the
    # column-by-column loop; astype returns a new frame.
    return df.astype({column: 'category' for column in column_list})

In [5]:
def encode_unordered_categorical(df, column_list: list):
    """One-hot encode the given columns and return the encoded copy of ``df``.

    A new ``category_encoders.OneHotEncoder`` is created and fitted on ``df``
    on every call, so the set of output columns depends on the values present
    in the frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied before encoding.
    column_list : list
        Labels of the columns to one-hot encode.

    Returns
    -------
    The result of ``OneHotEncoder.fit_transform`` on the copied frame.
    """
    encoder = ce.OneHotEncoder(
        verbose=1,
        cols=column_list,
        handle_missing="indicator",
        use_cat_names=True,
    )
    return encoder.fit_transform(df.copy())

In [6]:
def binarymap(df):
    """Recode the six boolean-like columns of ``df`` as 1/0 on a copy.

    ``has_prosthesis`` and ``blood_transfusion`` are mapped from
    ``True``/``False``; ``diuretics``, ``insulin`` and ``diabetesMed`` from
    ``'Yes'``/``'No'``; ``change`` from ``'Ch'``/``'No'``. Any value that is
    not a key of the relevant lookup becomes NaN (``Series.map`` semantics).
    """
    from_bool = {True: 1, False: 0}
    from_yes_no = {'Yes': 1, 'No': 0}
    lookups = {
        'has_prosthesis': from_bool,
        'blood_transfusion': from_bool,
        'diuretics': from_yes_no,
        'insulin': from_yes_no,
        'change': {'Ch': 1, 'No': 0},
        'diabetesMed': from_yes_no,
    }

    _df = df.copy()
    recoded = {column: _df[column].map(lookup) for column, lookup in lookups.items()}
    return _df.assign(**recoded)

In [7]:
def encode_ordinal_categorical(df, column_list: list):
    """Ordinal-encode the given columns and return the encoded copy of ``df``.

    A new ``category_encoders.OrdinalEncoder`` is created and fitted on ``df``
    on every call. No explicit ``mapping`` is supplied here.
    NOTE(review): without a mapping the assigned integers presumably do not
    reflect any semantic order of the categories -- confirm against the
    category_encoders documentation before relying on the order.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied before encoding.
    column_list : list
        Labels of the columns to encode.

    Returns
    -------
    The result of ``OrdinalEncoder.fit_transform`` on the copied frame.
    """
    encoder = ce.OrdinalEncoder(
        verbose=1,
        cols=column_list,
        handle_unknown='value',
        handle_missing='value',
    )
    return encoder.fit_transform(df.copy())

In [8]:
def build_target(df, target: str):
    """Return a copy of ``df`` with column ``target`` turned into booleans.

    A row is ``True`` exactly when its original value equals the string
    ``'Yes'``; every other value (including missing ones) becomes ``False``.
    """
    is_yes = df[target] == 'Yes'
    # assign() works on a copy, so the caller's frame keeps its raw labels.
    return df.assign(**{target: np.where(is_yes, True, False)})

# Custom Transformers and pipeline objects

In [9]:
class DroppingColumns(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that removes a fixed set of columns.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to drop in ``transform``. ``None`` (the default) drops
        nothing.
    """

    def __init__(self, cols=None):
        # Default is None rather than []: a literal list default would be one
        # shared object mutated across every default-constructed instance.
        # The argument is stored untouched, as ``BaseEstimator`` expects.
        self.cols = cols

    def fit(self, X=None, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, data):
        """Return a new frame without the configured columns.

        Raises whatever ``DataFrame.drop`` raises when a label is missing.
        """
        cols = [] if self.cols is None else self.cols
        # drop() builds a new frame; the trailing copy keeps the result fully
        # independent of ``data`` without first duplicating the dropped columns.
        return data.drop(cols, axis=1).copy()

In [10]:
class SelectColumns(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that keeps only a fixed set of columns.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns to keep, in the order they should appear in the output.
        ``None`` (the default) selects no columns.
    """

    def __init__(self, cols=None):
        # Default is None rather than []: a literal list default would be one
        # shared object mutated across every default-constructed instance.
        # The argument is stored untouched, as ``BaseEstimator`` expects.
        self.cols = cols

    def fit(self, X=None, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, data):
        """Return a copy of ``data`` restricted to the configured columns.

        Raises ``KeyError`` (from pandas) when a configured label is absent.
        """
        cols = [] if self.cols is None else self.cols
        # Select first, then copy: only the kept columns are duplicated.
        return data[cols].copy()

In [11]:
class BinaryProcessor(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that runs ``binarymap`` on its input."""

    def fit(self, X=None, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, data):
        """Return ``binarymap`` applied to a copy of ``data``."""
        return binarymap(data.copy())

In [12]:
class AssignCategorical(BaseEstimator, TransformerMixin):
    """Stateless pipeline step that casts selected columns to ``category``.

    Parameters
    ----------
    cols : list of column labels, optional
        Columns passed to ``assign_categorical``. ``None`` (the default)
        converts nothing.
    """

    def __init__(self, cols=None):
        # Default is None rather than []: a literal list default would be one
        # shared object mutated across every default-constructed instance.
        # The argument is stored untouched, as ``BaseEstimator`` expects.
        self.cols = cols

    def fit(self, X=None, y=None, **fit_params):
        """Nothing is learned; return ``self`` so the step can be chained."""
        return self

    def transform(self, data):
        """Return a converted copy of ``data``; ``data`` itself is not changed."""
        cols = [] if self.cols is None else self.cols
        # assign_categorical already works on its own copy of the frame.
        return assign_categorical(data, column_list=cols)

# Simple baseline

In [13]:
# Column used as the DataFrame index, and the label column to predict.
index_col = "admission_id"
target = "readmitted"

In [14]:
# Reload the CSV, index it by ``index_col`` and turn the ``target`` column
# into booleans (True where the raw label is 'Yes').
data_train = build_target(
    assign_index(load_train_data(), index_col),
    target,
)

In [15]:
# Feature catalogue. Type tags used in the comments below:
#   n = numerical, uc = unordered category, oc = ordered category, b = boolean

all_features = [
    'patient_id',                   # identifier
    'race',                         # uc
    'gender',                       # uc
    'age',                          # oc
    'weight',                       # oc
    'admission_type_code',          # uc
    'discharge_disposition_code',   # uc
    'admission_source_code',        # uc
    'time_in_hospital',             # n
    'payer_code',                   # uc
    'medical_specialty',            # uc
    'has_prosthesis',               # b
    'complete_vaccination_status',  # uc
    'num_lab_procedures',           # n
    'num_procedures',               # n
    'num_medications',              # n
    'number_outpatient',            # n
    'number_emergency',             # n
    'number_inpatient',             # n
    'diag_1',                       # uc
    'diag_2',                       # uc
    'diag_3',                       # uc
    'number_diagnoses',             # n
    'blood_type',                   # uc
    'hemoglobin_level',             # n
    'blood_transfusion',            # b
    'max_glu_serum',                # oc
    'A1Cresult',                    # oc
    'diuretics',                    # b
    'insulin',                      # b
    'change',                       # b
    'diabetesMed',                  # b
]

# Numerical features.
num_features = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'hemoglobin_level',
]

# Boolean-like features (True/False or Yes/No style values).
bool_features = [
    'has_prosthesis',
    'blood_transfusion',
    'diuretics',
    'insulin',
    'change',
    'diabetesMed',
]

# Categorical features.
# NOTE: 'max_glu_serum' and 'A1Cresult' are tagged "oc" in all_features above
# but are listed here rather than in ord_cat_features.
cat_features = [
    'race',
    'gender',
    'admission_type_code',
    'discharge_disposition_code',
    'admission_source_code',
    'payer_code',
    'medical_specialty',
    'complete_vaccination_status',
    'diag_1',
    'diag_2',
    'diag_3',
    'blood_type',
    'max_glu_serum',
    'A1Cresult',
]

# Ordered categorical features.
ord_cat_features = [
    'age',
    'weight',
]

# missing features: date of admission and date of release

In [16]:
# Hold out 20% of the rows as a test set, stratified on the target column,
# then separate the features (X) from the label (y) in each part.
df_train, df_test = train_test_split(
    data_train,
    test_size=0.2,
    random_state=42,
    stratify=data_train[target],
)
X_train, y_train = df_train.drop(columns=target), df_train[target]
X_test, y_test = df_test.drop(columns=target), df_test[target]

In [17]:
# Features fed to the models, grouped by type. The diag_* columns and
# patient_id are not part of the selection.
selected_features = [
    # numerical
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'hemoglobin_level',
    # boolean
    'has_prosthesis',
    'blood_transfusion',
    'diuretics',
    'insulin',
    'change',
    'diabetesMed',
    # categorical
    'race',
    'gender',
    'admission_type_code',
    'discharge_disposition_code',
    'admission_source_code',
    'payer_code',
    'medical_specialty',
    'complete_vaccination_status',
    'blood_type',
    'max_glu_serum',
    'A1Cresult',
    # ordered categorical
    'age',
    'weight',
]

In [18]:
# Split the selected features by type, keeping the order of selected_features.
# Note that cat_feats also includes the boolean and ordered-categorical
# features, not just those listed in cat_features.
_numeric_names = set(num_features)
_boolean_names = set(bool_features)
_ordered_names = set(ord_cat_features)
_categorical_like_names = set(cat_features) | _boolean_names | _ordered_names

num_feats = [name for name in selected_features if name in _numeric_names]
cat_feats = [name for name in selected_features if name in _categorical_like_names]
bool_feats = [name for name in selected_features if name in _boolean_names]
ord_cat_feats = [name for name in selected_features if name in _ordered_names]

# Pipeline 1 - just numerical features

In [19]:
# Baseline 1: numerical features only -> median imputation -> random forest.
pipeline = make_pipeline(
    SelectColumns(cols=num_feats),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(
        max_depth=10,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Print F1, recall and precision on the held-out split, one value per line.
for metric in (f1_score, recall_score, precision_score):
    print(metric(y_test, y_pred))

0.23879680785758134
0.4288864388092613
0.16546150574223734


# Pipeline 2 - adding the binary features

In [20]:
# Baseline 2: numerical + boolean features. The boolean columns are cast to
# category and then recoded by BinaryProcessor before imputation.
pipeline = make_pipeline(
    SelectColumns(cols=num_feats + bool_feats),
    AssignCategorical(cols=bool_feats),
    BinaryProcessor(),
    SimpleImputer(strategy="median"),
    RandomForestClassifier(
        max_depth=10,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    ),
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)

# Print F1, recall and precision on the held-out split, one value per line.
for metric in (f1_score, recall_score, precision_score):
    print(metric(y_test, y_pred))

0.24461091576211585
0.4410143329658214
0.16924053310767928


# Pipeline 3 - adding ordered categorical variables

In [21]:
# Display the column labels of the training features.
X_train.columns

Index(['patient_id', 'race', 'gender', 'age', 'weight', 'admission_type_code',
       'discharge_disposition_code', 'admission_source_code',
       'time_in_hospital', 'payer_code', 'medical_specialty', 'has_prosthesis',
       'complete_vaccination_status', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 'number_diagnoses',
       'blood_type', 'hemoglobin_level', 'blood_transfusion', 'max_glu_serum',
       'A1Cresult', 'diuretics', 'insulin', 'change', 'diabetesMed'],
      dtype='object')

In [22]:
# Display how often each A1Cresult value occurs in the training features.
X_train.A1Cresult.value_counts()

None    54203
>8       5241
Norm     3204
>7       2481
Name: A1Cresult, dtype: int64

Ordered categorical variables to encode: age, weight, max_glu_serum, A1Cresult

In [85]:
# Scratch frame holding only the four ordered-categorical candidates.
# Selecting first and copying afterwards avoids duplicating every column of
# X_train just to keep four of them.
ordinal_candidates = ['age', 'weight', 'max_glu_serum', 'A1Cresult']
testdf = X_train[ordinal_candidates].copy()

# Remove the leading "[" and trailing ")" from the range labels
# (presumably values look like "[70-80)" -- confirm against the raw data).
# The original pattern "\[" was an invalid escape sequence in a normal string
# and also stripped backslashes; a plain "[" is what lstrip needs, since its
# argument is a set of characters, not a regular expression.
for range_col in ('age', 'weight'):
    testdf[range_col] = testdf[range_col].str.lstrip("[").str.rstrip(")")

# Lower-case the lab-result labels so they can be matched case-insensitively.
for label_col in ('max_glu_serum', 'A1Cresult'):
    testdf[label_col] = testdf[label_col].str.lower()

testdf = assign_categorical(testdf, ordinal_candidates)

In [87]:
# Ordered level lists, lowest first; 'unknown' is the catch-all first level.
ordered_age = [
    'unknown',
    '0-10', '10-20', '20-30', '30-40', '40-50',
    '50-60', '60-70', '70-80', '80-90', '90-100',
]

ordered_weight = [
    'unknown',
    '0-25', '25-50', '50-75', '75-100', '100-125',
    '125-150', '150-175', '175-200', '>200',
]

ordered_max_glu = ['unknown', 'norm', '>200', '>300']

ordered_A1C = ['unknown', 'norm', '>7', '>8']

# For each column: impose the ordered levels (values outside the list turn
# into NaN when the categories are replaced), then label every row that is
# not one of the known levels as 'unknown'.
for column, levels in (
    ('age', ordered_age),
    ('weight', ordered_weight),
    ('max_glu_serum', ordered_max_glu),
    ('A1Cresult', ordered_A1C),
):
    recast = testdf[column].cat.set_categories(levels, ordered=True)
    testdf = testdf.assign(**{column: recast})
    testdf.loc[~(testdf[column].isin(levels)), column] = 'unknown'

In [88]:
# Levels of each ordinal column, lowest first. Each level is given the rank
# 1, 2, 3, ... in list order; 'unknown' is intentionally absent from every
# list, so it has no explicit rank in the mapping.
ordinal_levels = {
    'age': ['0-10', '10-20', '20-30', '30-40', '40-50',
            '50-60', '60-70', '70-80', '80-90', '90-100'],
    'weight': ['0-25', '25-50', '50-75', '75-100', '100-125',
               '125-150', '150-175', '175-200', '>200'],
    'max_glu_serum': ['norm', '>200', '>300'],
    'A1Cresult': ['norm', '>7', '>8'],
}

orde = ce.OrdinalEncoder(
    verbose=1,
    cols=list(ordinal_levels),
    handle_unknown='value',
    handle_missing='return_nan',
    mapping=[
        {
            'col': column,
            'mapping': {level: rank for rank, level in enumerate(levels, start=1)},
        }
        for column, levels in ordinal_levels.items()
    ],
)
                         
                         

In [90]:
# Fit the ordinal encoder on the prepared scratch frame.
orde.fit(testdf)

OrdinalEncoder(cols=['age', 'weight', 'max_glu_serum', 'A1Cresult'],
               handle_missing='return_nan',
               mapping=[{'col': 'age',
                         'mapping': {'0-10': 1, '10-20': 2, '20-30': 3,
                                     '30-40': 4, '40-50': 5, '50-60': 6,
                                     '60-70': 7, '70-80': 8, '80-90': 9,
                                     '90-100': 10}},
                        {'col': 'weight',
                         'mapping': {'0-25': 1, '100-125': 5, '125-150': 6,
                                     '150-175': 7, '175-200': 8, '25-50': 2,
                                     '50-75': 3, '75-100': 4, '>200': 9}},
                        {'col': 'max_glu_serum',
                         'mapping': {'>200': 2, '>300': 3, 'norm': 1}},
                        {'col': 'A1Cresult',
                         'mapping': {'>7': 2, '>8': 3, 'norm': 1}}],
               verbose=1)

# Output files (pickling)

In [None]:
# Persist three artefacts under TMP_DIR: the feature column names (JSON),
# the feature dtypes (pickle) and the most recently fitted pipeline (joblib).
TMP_DIR = '/tmp'

columns_path = os.path.join(TMP_DIR, "columns.json")
with open(columns_path, 'w') as columns_file:
    json.dump(X_train.columns.tolist(), columns_file)

dtypes_path = os.path.join(TMP_DIR, "dtypes.pickle")
with open(dtypes_path, 'wb') as dtypes_file:
    pickle.dump(X_train.dtypes, dtypes_file)

joblib.dump(pipeline, os.path.join(TMP_DIR, 'pipeline.pickle'))