In [1]:
import pandas as pd
import numpy as np
import os, sys, joblib, time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# GLOBALS
LOCAL_ROOT = '/Users/nathvaru/Documents/personal/AV/janatahack_healthcare_analytics_II/'
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
TRAIN_FN = os.path.join(DATA_DIR, 'Train_hMYJ020/train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_lfbv3c3.csv')

In [3]:
# read data
df_train = pd.read_csv(TRAIN_FN)
df_test = pd.read_csv(TEST_FN)

In [4]:
cat_vars = ['Hospital_code', 'Hospital_type_code',
            'City_Code_Hospital', 'Hospital_region_code',
            'Department', 'Ward_Type', 'Ward_Facility_Code',
            'Bed Grade', 'City_Code_Patient',
            'Type of Admission', 'Severity of Illness', 'Age']
num_vars = ['Available Extra Rooms in Hospital',
            'Visitors with Patient', 'Admission_Deposit']

In [5]:
# Impute missing values in Bed Grade and City_Code_Patient
# with separate category
df_train.fillna({'Bed Grade': -999, 'City_Code_Patient': -999},
                inplace=True)
df_test.fillna({'Bed Grade': -999, 'City_Code_Patient': -999},
                inplace=True)

In [6]:
# preprocess cat_vars
for var in cat_vars:
    if df_train[var].dtypes == object:
        print(var)
        df_train[var] = df_train[var].apply(
            lambda x: str(x).strip().replace(" ", "-"))
        df_test[var] = df_test[var].apply(
            lambda x: str(x).strip().replace(" ", "-"))

Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Type of Admission
Severity of Illness
Age


In [7]:
# encode target
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df_train['Stay'].values)

df_train['DV'] = le.transform(df_train['Stay'].values)

In [8]:
# add prefix to all features
FEAT_PREFIX = 'JHA'
cols = list(df_test.columns)
new_cols = [FEAT_PREFIX + '_'+ col.replace(" ", "-")
            if col not in ('case_id', 'Stay', 'DV') else col for col in cols]
rename_dct = dict(zip(cols, new_cols))
df_train.rename(columns=rename_dct, inplace=True)
df_test.rename(columns=rename_dct, inplace=True)

In [9]:
# outlier treatment and scaling for num_vars
from utility import LegacyOutlierScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


PREPROCESS = {
    'exoutscaler': LegacyOutlierScaler(),
    'stdscaler': StandardScaler()
}
STEPS = ['exoutscaler', 'stdscaler']


def preprocess(train, test, steps, features):
    """
    imputation, outlier treatment and scaling
    """
    train = train.copy()
    test = test.copy()
    other_cols = list(set(list(test.columns)) - set(features))
    classic_steps = steps
    steps = list(zip(steps, map(PREPROCESS.get, steps)))
    datapipe = Pipeline(steps=steps)

    x_dev = train[features].values
    
    print('fit')
    datapipe.fit(x_dev)
    
    print('transform dataframe using pipeline')
    print('train data:')
    train1 = datapipe.transform(train[features].values)
    train1 = pd.DataFrame(train1, columns=features)
    train1 = pd.concat([train1, train[other_cols+['Stay', 'DV']]], axis=1)
    print('test data:')
    test1 = datapipe.transform(test[features].values)
    test1 = pd.DataFrame(test1, columns=features)
    test1 = pd.concat([test1, test[other_cols]], axis=1)
    
    # Create "classic" datapipe and store list of features
    classic_pipe = Pipeline([(name, datapipe.named_steps[name])
                             for name in classic_steps])
    classic_pipe.feature_names = features

    return train1, test1, classic_pipe

In [10]:
num_vars = [FEAT_PREFIX + '_'+ col.replace(" ", "-") for col in num_vars]
df_train_pre, df_test_pre, pipeline = preprocess(df_train, df_test, STEPS, num_vars)

fit
transform dataframe using pipeline
train data:
test data:


In [11]:
cat_feats = [FEAT_PREFIX + '_'+ col.replace(" ", "-") for col in cat_vars]
print(cat_feats)
feat_cols = [col for col in list(df_train_pre.columns) if col.startswith(FEAT_PREFIX)]
cat_feats_indices = [i for i, col in enumerate(feat_cols) if col in cat_feats]
print(cat_feats_indices)

['JHA_Hospital_code', 'JHA_Hospital_type_code', 'JHA_City_Code_Hospital', 'JHA_Hospital_region_code', 'JHA_Department', 'JHA_Ward_Type', 'JHA_Ward_Facility_Code', 'JHA_Bed-Grade', 'JHA_City_Code_Patient', 'JHA_Type-of-Admission', 'JHA_Severity-of-Illness', 'JHA_Age']
[3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]


In [12]:
# encode cat_feats

le_feats = []
for feat in cat_feats:
    print(feat)
    le_feat = LabelEncoder()
    le_feat.fit(df_train_pre[feat].values)
    df_train_pre[feat] = le_feat.transform(df_train_pre[feat].values)
    df_test_pre[feat] = le_feat.transform(df_test_pre[feat].values)
    le_feats.append((feat, le_feat))
    print('\n')

JHA_Hospital_code


JHA_Hospital_type_code


JHA_City_Code_Hospital


JHA_Hospital_region_code


JHA_Department


JHA_Ward_Type


JHA_Ward_Facility_Code


JHA_Bed-Grade


JHA_City_Code_Patient


JHA_Type-of-Admission


JHA_Severity-of-Illness


JHA_Age




In [13]:
x_train = df_train_pre[feat_cols]
y_train = df_train_pre['DV']
x_test = df_test_pre[feat_cols]

In [14]:
from catboost import Pool, CatBoostClassifier
from sklearn import metrics


def runCatboost(train_X, train_y, test_X, test_y=None, test_X2=None, **params):
    
    # init model class
    model = CatBoostClassifier(
        iterations = params['iterations'],
        learning_rate = params['lr'],
        random_strength = params['random_strength'],
        random_seed = 2020,
        l2_leaf_reg = 3.0,
        early_stopping_rounds = 100,
        classes_count = 11,
        depth = params['depth'],
        loss_function = 'MultiClass',
        eval_metric = 'Accuracy',
        leaf_estimation_method = 'Newton'
    )
    
    # fit
    model.fit(train_X, train_y, eval_set=(test_X, test_y), plot=params['plot'],
              cat_features=params['cat_feats'])
    
    # predict
    pred_val = model.predict(test_X)
    if test_X2 is not None:
        pred_test = model.predict(test_X2)
    else:
        pred_test = None
        
    loss = metrics.accuracy_score(test_y, pred_val)
    
    return pred_val, loss, pred_test

In [26]:
from sklearn.model_selection import KFold


def trainModel(train_X, train_y, test_X, n_splits, model_name, feats, **params):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = []
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        elif model_name == "LGB":
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])
        elif model_name == "Catboost":
            pred_val, acc, pred_test = runCatboost(dev_X, dev_y, val_X, val_y, test_X,
                                                   **params)
        
        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        if pred_test is not None:
            pred_test_full.append(pred_test)

    #pred_test_full = pred_test_full/n_splits
    acc = metrics.accuracy_score(train_y, pred_val_full)
    return pred_val_full, acc, pred_test_full, cv_scores

In [27]:
params = {'iterations': 500, 'lr': 0.1, 'random_strength': 0.1, 'depth': 7, 'plot': False,
          'cat_feats': cat_feats_indices}
start = time.time()
pred_val, acc, pred_test = trainModel(x_train, y_train, x_test, 3, 'Catboost', feat_cols,
                                      **params)
print('time taken: %0.2f' % (time.time() - start))

0:	learn: 0.3760716	test: 0.3769902	best: 0.3769902 (0)	total: 6.66s	remaining: 55m 25s
1:	learn: 0.3796469	test: 0.3798542	best: 0.3798542 (1)	total: 14.9s	remaining: 1h 1m 54s
2:	learn: 0.3802546	test: 0.3800049	best: 0.3800049 (2)	total: 23.4s	remaining: 1h 4m 29s
3:	learn: 0.3809093	test: 0.3807963	best: 0.3807963 (3)	total: 31.7s	remaining: 1h 5m 34s
4:	learn: 0.3813615	test: 0.3816442	best: 0.3816442 (4)	total: 40.9s	remaining: 1h 7m 24s
5:	learn: 0.3813521	test: 0.3815499	best: 0.3816442 (4)	total: 51.9s	remaining: 1h 11m 14s
6:	learn: 0.3874663	test: 0.3874381	best: 0.3874381 (6)	total: 1m 2s	remaining: 1h 13m 2s
7:	learn: 0.3904057	test: 0.3910746	best: 0.3910746 (7)	total: 1m 18s	remaining: 1h 20m 3s
8:	learn: 0.3924218	test: 0.3928645	best: 0.3928645 (8)	total: 1m 30s	remaining: 1h 22m 13s
9:	learn: 0.3962749	test: 0.3965482	best: 0.3965482 (9)	total: 1m 40s	remaining: 1h 22m 22s
10:	learn: 0.3966565	test: 0.3966329	best: 0.3966329 (10)	total: 1m 52s	remaining: 1h 23m 11s
11

88:	learn: 0.4281838	test: 0.4244625	best: 0.4245285 (85)	total: 18m 21s	remaining: 1h 24m 48s
89:	learn: 0.4282545	test: 0.4245379	best: 0.4245379 (89)	total: 18m 33s	remaining: 1h 24m 34s
90:	learn: 0.4283110	test: 0.4247263	best: 0.4247263 (90)	total: 18m 47s	remaining: 1h 24m 26s
91:	learn: 0.4284005	test: 0.4246698	best: 0.4247263 (90)	total: 18m 58s	remaining: 1h 24m 9s
92:	learn: 0.4286549	test: 0.4249901	best: 0.4249901 (92)	total: 19m 11s	remaining: 1h 24m 1s
93:	learn: 0.4288480	test: 0.4251032	best: 0.4251032 (93)	total: 19m 25s	remaining: 1h 23m 55s
94:	learn: 0.4287773	test: 0.4253481	best: 0.4253481 (94)	total: 19m 40s	remaining: 1h 23m 50s
95:	learn: 0.4288244	test: 0.4254517	best: 0.4254517 (95)	total: 19m 52s	remaining: 1h 23m 39s
96:	learn: 0.4290317	test: 0.4256684	best: 0.4256684 (96)	total: 20m 6s	remaining: 1h 23m 34s
97:	learn: 0.4291683	test: 0.4254235	best: 0.4256684 (96)	total: 20m 20s	remaining: 1h 23m 25s
98:	learn: 0.4291448	test: 0.4255365	best: 0.4256684 

KeyboardInterrupt: 

(318438,)