In [1]:
import pandas as pd
import numpy as np
import os, sys, joblib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import KFold

In [2]:
# GLOBALS
LOCAL_ROOT = '/Users/nathvaru/Documents/personal/AV/janatahack_healthcare_analytics_II/'
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
TRAIN_FN = os.path.join(DATA_DIR, 'Train_hMYJ020/train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_lfbv3c3.csv')

In [3]:
# read data
df_train = pd.read_csv(TRAIN_FN)
df_test = pd.read_csv(TEST_FN)

In [4]:
cat_vars = ['Hospital_code', 'Hospital_type_code',
            'City_Code_Hospital', 'Hospital_region_code',
            'Department', 'Ward_Type', 'Ward_Facility_Code',
            'Bed Grade', 'City_Code_Patient',
            'Type of Admission', 'Severity of Illness', 'Age']
num_vars = ['Available Extra Rooms in Hospital',
            'Visitors with Patient', 'Admission_Deposit']

In [5]:
# Impute missing values in Bed Grade and City_Code_Patient
# with separate category
df_train.fillna({'Bed Grade': 'missing', 'City_Code_Patient': 'missing'},
                inplace=True)
df_test.fillna({'Bed Grade': 'missing', 'City_Code_Patient': 'missing'},
                inplace=True)

In [6]:
def getCountVar(compute_df, count_df, var_name, count_var):
    """
    compute_df : Data frame for which the count encoding should be done
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable for count encoding
    count_var : some other variable from the dataset (used as dummy variable to get count)
    """
    grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')
    grouped_df.columns = [var_name, "var_count"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["var_count"])


def getDVEncodeVar(compute_df, target_df, var_name, target_var,
                   min_cutoff=1):
    if type(var_name) != type([]):
        var_name = [var_name]
    grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
    grouped_df.columns = var_name + ["mean_value"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["mean_value"])


def do_target_encode(train_df, test_df, cols_to_encode, target_col,
                     encode_type, n_splits=3):
        
    kf = KFold(n_splits=n_splits, shuffle=True,
                               random_state=2020)
    for col in cols_to_encode:
        train_enc_values = np.zeros(train_df.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_df):
            new_train_df = train_df[[col, target_col]]
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            
            if encode_type == 'dv':
                train_enc_values[val_index] =  np.array( 
                    getDVEncodeVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getDVEncodeVar(test_df[[col]], dev_X, col, target_col))
            elif encode_type == 'count':
                train_enc_values[val_index] =  np.array( 
                    getCountVar(val_X[[col]], dev_X, col, target_col))
                test_enc_values += np.array( 
                    getCountVar(test_df[[col]], dev_X, col, target_col))
        
        test_enc_values /= n_splits
        train_df[col + "_{}_enc".format(encode_type)] = train_enc_values
        test_df[col + "_{}_enc".format(encode_type)] = test_enc_values
        
        return train_df, test_df

In [7]:
# preprocess cat_vars
for var in cat_vars:
    if df_train[var].dtypes == object:
        print(var)
        df_train[var] = df_train[var].apply(
            lambda x: str(x).strip().replace(" ", "-").replace(".", ""))
        df_test[var] = df_test[var].apply(
            lambda x: str(x).strip().replace(" ", "-").replace(".", ""))

Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Bed Grade
City_Code_Patient
Type of Admission
Severity of Illness
Age


In [8]:
df_train['sample'] = 'train'
df_test['sample'] = 'test'
cols = ['case_id', 'sample'] + cat_vars
tmp = pd.concat([df_train[cols], df_test[cols]], axis=0)
tmp.reset_index(drop=True, inplace=True)

print('get dummies')
tmp = pd.get_dummies(tmp, prefix=cat_vars, columns=cat_vars,
                     prefix_sep='_', drop_first=True)

get dummies


In [9]:
mask = tmp['sample'] == 'train'
train = tmp.loc[mask, :]
train.reset_index(drop=True, inplace=True)
train.drop('sample', axis=1, inplace=True)
df_train = pd.merge(df_train[['case_id', 'Stay']+num_vars], train,
                    on='case_id')
del train

mask = tmp['sample'] == 'test'
test = tmp.loc[mask, :]
test.reset_index(drop=True, inplace=True)
test.drop('sample', axis=1, inplace=True)
df_test = pd.merge(df_test[['case_id']+num_vars], test, on='case_id')
del test

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [10]:
print(df_train.shape, df_test.shape)

(318438, 122) (137057, 121)


In [11]:
# add prefix to all features
FEAT_PREFIX = 'JHA'
cols = list(df_test.columns)
new_cols = [FEAT_PREFIX + '_'+ col.replace(" ", "-")
            if col not in ('case_id', 'Stay') else col for col in cols]
rename_dct = dict(zip(cols, new_cols))
df_train.rename(columns=rename_dct, inplace=True)
df_test.rename(columns=rename_dct, inplace=True)

In [12]:
# outlier treatment and scaling for num_vars
from utility import LegacyOutlierScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


PREPROCESS = {
    'exoutscaler': LegacyOutlierScaler(),
    'stdscaler': StandardScaler()
}
STEPS = ['exoutscaler', 'stdscaler']


def preprocess(train, test, steps, features):
    """
    imputation, outlier treatment and scaling
    """
    train = train.copy()
    test = test.copy()
    other_cols = list(set(list(test.columns)) - set(features))
    classic_steps = steps
    steps = list(zip(steps, map(PREPROCESS.get, steps)))
    datapipe = Pipeline(steps=steps)

    x_dev = train[features].values
    
    print('fit')
    datapipe.fit(x_dev)
    
    print('transform dataframe using pipeline')
    print('train data:')
    train1 = datapipe.transform(train[features].values)
    train1 = pd.DataFrame(train1, columns=features)
    train1 = pd.concat([train1, train[other_cols+['Stay']]], axis=1)
    print('test data:')
    test1 = datapipe.transform(test[features].values)
    test1 = pd.DataFrame(test1, columns=features)
    test1 = pd.concat([test1, test[other_cols]], axis=1)
    
    # Create "classic" datapipe and store list of features
    classic_pipe = Pipeline([(name, datapipe.named_steps[name])
                             for name in classic_steps])
    classic_pipe.feature_names = features

    return train1, test1, classic_pipe

In [13]:
num_vars = [FEAT_PREFIX + '_'+ col.replace(" ", "-") for col in num_vars]
df_train_pre, df_test_pre, pipeline = preprocess(
    df_train, df_test, STEPS, num_vars)

fit
transform dataframe using pipeline
train data:
test data:


In [14]:
# encode target
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df_train_pre['Stay'].values)

df_train_pre['DV'] = le.transform(df_train_pre['Stay'].values)

In [70]:
# modelling
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
import operator
from catboost import Pool, CatBoostClassifier


def create_feature_map(features):
    outfile = open('../model/xgb.fmap', 'w')
    for i, feat in enumerate(features):
        outfile.write('{0}\t{1}\tq\n'.format(i, feat))
    outfile.close()


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "multi:softmax"
    params["num_class"] = 11
    params['eval_metric'] = "merror"
    params["eta"] = eta
    params["subsample"] = 0.7
    params["min_child_weight"] = 1
    params["colsample_bytree"] = 0.7
    params["max_depth"] = dep

    params["silent"] = 1
    params["seed"] = seed_val
    # params["max_delta_step"] = 2
    # params["gamma"] = 0.5
    num_rounds = rounds

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist,
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    if feature_names is not None:
        create_feature_map(feature_names)
        model.dump_model('../model/xgbmodel.txt', '../model/xgb.fmap',
                         with_stats=True)
        importance = model.get_fscore(fmap='../model/xgb.fmap')
        importance = sorted(importance.items(), key=operator.itemgetter(1),
                            reverse=True)
        imp_df = pd.DataFrame(importance, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(xgtest,
                                ntree_limit=model.best_ntree_limit)
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.accuracy_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    params = {}
    params["objective"] = "multiclass"
    params["num_class"] = 11
    params['metric'] = "multi_error"
    params['seed'] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 70
    params["min_data_in_leaf"] = 20
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        lgtest = lgb.DMatrix(test_X)
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)
    pred_test_y = pred_test_y.argmax(axis=1)
    
    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
        pred_test_y2 = pred_test_y2.argmax(axis=1)
    else:
        pred_test_y2 = None
        
    loss = 0
    if test_y is not None:
        loss = metrics.accuracy_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2

In [83]:
# Model building

def trainModel(train_X, train_y, test_X, n_splits, model_name, feats, 
               **params):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = []
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = train_y[dev_index], train_y[val_index]

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        elif model_name == "LGB":
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])
        
        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        if pred_test is not None:
            pred_test_full.append(pred_test)

    #pred_test_full = pred_test_full/n_splits
    acc = metrics.accuracy_score(train_y, pred_val_full)
    return pred_val_full, acc, pred_test_full, cv_scores

In [60]:
feat_cols = [x for x in list(df_train_pre.columns)
             if x.startswith(FEAT_PREFIX)]
x_train = df_train_pre[feat_cols]
y_train = df_train_pre['DV']
x_test = df_test_pre[feat_cols]

In [35]:
# XGB
params = {'rounds': 600, 'depth': 6, 'eta': 0.05}
%time pred_val_full, acc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "XGB", feat_cols, **params)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-merror:0.61706	test-merror:0.61803
Multiple eval metrics have been passed: 'test-merror' will be used for early stopping.

Will train until test-merror hasn't improved in 100 rounds.
[20]	train-merror:0.59233	test-merror:0.59528
[40]	train-merror:0.58279	test-merror:0.58754
[60]	train-merror:0.57837	test-merror:0.58446
[80]	train-merror:0.57450	test-merror:0.58212
[100]	train-merror:0.57157	test-merror:0.58045
[120]	train-merror:0.56923	test-merror:0.57933
[140]	train-merror:0.56680	test-merror:0.57813
[160]	train-merror:0.56462	test-merror:0.57743
[180]	train-merror:0.56275	test-merror:0.57700
[200]	train-merror:0.56097	test-merror:0.57650
[220]	train-merror:0.55915	test-merror:0.57630
[240]	train-

In [37]:
print('CV accuracy: ', acc)

CV accuracy:  0.4252884391938148


In [None]:
# max voting with preference for the first model when predictions are equally frequent
from scipy.stats import mode

pred_test_full1 = np.array(pred_test_full).T
pred_test_full1 = mode(pred_test_full1, 1)[0]
pred_test_full1 = pred_test_full1.reshape((pred_test_full1.shape[0], ))
print(pred_test_full1)

In [49]:
# transform pred_test_full into original labels
out_df = pd.DataFrame({"case_id": df_test_pre["case_id"].values})
out_df["Stay"] = le.inverse_transform(pred_test_full1)
out_df.to_csv("../model/Mbaseline_out.csv", index=False)

In [84]:
# LGB
params = {'rounds': 600, 'depth': 7, 'eta': 0.05}
%time pred_val_full, acc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "LGB", feat_cols, **params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_error: 0.592787
[40]	valid_0's multi_error: 0.581586
[60]	valid_0's multi_error: 0.578496
[80]	valid_0's multi_error: 0.577799
[100]	valid_0's multi_error: 0.576659
[120]	valid_0's multi_error: 0.575782
[140]	valid_0's multi_error: 0.575358
[160]	valid_0's multi_error: 0.575123
[180]	valid_0's multi_error: 0.575189
[200]	valid_0's multi_error: 0.574803
[220]	valid_0's multi_error: 0.57484
[240]	valid_0's multi_error: 0.574511
[260]	valid_0's multi_error: 0.574708
[280]	valid_0's multi_error: 0.574341
[300]	valid_0's multi_error: 0.574501
[320]	valid_0's multi_error: 0.574011
[340]	valid_0's multi_error: 0.573992
[360]	valid_0's multi_error: 0.57387
[380]	valid_0's multi_error: 0.57371
[400]	valid_0's multi_error: 0.573531
[420]	valid_0's multi_error: 0.573324
[440]	valid_0's multi_err

In [85]:
acc, cv_scores

(0.425392070041892,
 [0.42690256816083505, 0.42489589810261336, 0.4243777438622275])

In [91]:
# max voting with preference for the first model when predictions are equally frequent
from scipy.stats import mode

pred_test_full1 = np.array(pred_test_full).T
pred_test_full1 = mode(pred_test_full1, 1)[0]
pred_test_full1 = pred_test_full1.reshape((pred_test_full1.shape[0], ))
print(pred_test_full1)

[0 5 2 ... 2 1 3]


In [92]:
# transform pred_test_full into original labels
out_df = pd.DataFrame({"case_id": df_test_pre["case_id"].values})
out_df["Stay"] = le.inverse_transform(pred_test_full1)
out_df.to_csv("../model/Mbaseline_LGB_out.csv", index=False)