In [1]:
import pandas as pd
import numpy as np
import os, sys, joblib, time
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import KFold

In [2]:
# GLOBALS
# Project root. The default is the original author's checkout; set the
# JHA_LOCAL_ROOT environment variable to run the notebook from another
# location without editing this cell.
LOCAL_ROOT = os.environ.get(
    'JHA_LOCAL_ROOT',
    '/Users/nathvaru/Documents/personal/AV/janatahack_healthcare_analytics_II/')
# Every input file is resolved relative to <LOCAL_ROOT>/data.
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
TRAIN_FN = os.path.join(DATA_DIR, 'Train_hMYJ020/train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_lfbv3c3.csv')

In [3]:
# read data
# Both CSVs are loaded with pandas' default parsing options.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_FN, TEST_FN))

In [4]:
# Columns treated as categorical (one-hot encoded later in the notebook).
cat_vars = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
# Columns treated as numeric (outlier-treated and scaled later in the notebook).
num_vars = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

In [5]:
# Impute missing values in Bed Grade and City_Code_Patient
# with separate category
missing_fill = dict.fromkeys(['Bed Grade', 'City_Code_Patient'], 'missing')
for frame in (df_train, df_test):
    # Same fill applied to both splits, in place.
    frame.fillna(missing_fill, inplace=True)

In [6]:
def getCountVar(compute_df, count_df, var_name, count_var):
    """
    Count-encode a categorical column.

    compute_df : Data frame for which the count encoding should be done
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable for count encoding
    count_var : some other variable from the dataset (used as dummy variable to get count);
                only its non-null entries are counted per category

    Returns a list with one value per row of compute_df, in row order.
    Categories that do not occur in count_df get -1.
    """
    per_category = (
        count_df.groupby(var_name, as_index=False)[count_var]
        .agg('count')
        .set_axis([var_name, "var_count"], axis=1)
    )
    # A left join keeps every row of compute_df in its original order.
    joined = compute_df.merge(per_category, how="left", on=var_name)
    # As in the original, the -1 fill is applied to the whole joined frame.
    joined = joined.fillna(-1)
    return list(joined["var_count"])


def getDVEncodeVar(compute_df, target_df, var_name, target_var):
    """
    Mean-encode one or more grouping columns.

    compute_df : Data frame whose rows receive the encoded value
    target_df : Data frame from which the group means are computed
    var_name : grouping column name, or a list of column names
    target_var : column whose per-group mean is used as the encoding

    Returns a list with one value per row of compute_df, in row order.
    Groups that do not occur in target_df get -1.
    """
    keys = var_name if type(var_name) is list else [var_name]
    group_means = (
        target_df.groupby(keys)[target_var]
        .agg(["mean"])
        .reset_index()
        .set_axis(keys + ["mean_value"], axis=1)
    )
    # A left join keeps every row of compute_df in its original order.
    joined = compute_df.merge(group_means, how="left", on=keys)
    # As in the original, the -1 fill is applied to the whole joined frame.
    joined = joined.fillna(-1)
    return list(joined["mean_value"])


def do_target_encode(train_df, test_df, cols_to_encode, target_col, encode_type, n_splits=3):
    """
    Add out-of-fold encoded features for each column in cols_to_encode.

    For every column, the training rows of each validation fold are encoded
    from statistics computed on the remaining folds, and the test rows are
    encoded once per fold and averaged over the folds.

    train_df : training frame; new columns are added to it IN PLACE
    test_df : test frame; new columns are added to it IN PLACE
    cols_to_encode : iterable of column names to encode
    target_col : column the statistic is computed from
    encode_type : 'dv' (per-category mean of target_col, via getDVEncodeVar)
                  or 'count' (per-category count, via getCountVar)
    n_splits : number of KFold splits (shuffled, random_state=2020)

    New columns are named "<col>_<encode_type>_enc_<target_col>".
    Returns the same (train_df, test_df) objects that were passed in.

    Raises ValueError for an unknown encode_type; previously this silently
    produced all-zero columns.

    NOTE: the encoders return -1 for categories unseen in a fold, and that
    sentinel takes part in the test-set average like any other value.
    """
    encoders = {'dv': getDVEncodeVar, 'count': getCountVar}
    if encode_type not in encoders:
        raise ValueError(
            "encode_type must be one of {}, got {!r}".format(
                sorted(encoders), encode_type))
    encode = encoders[encode_type]

    kf = KFold(n_splits=n_splits, shuffle=True,
               random_state=2020)
    for col in cols_to_encode:
        # Only these two columns are needed; slice once per column rather
        # than once per fold.
        fold_source = train_df[[col, target_col]]
        train_enc_values = np.zeros(train_df.shape[0])
        # Float accumulator: starting from the int 0 made the in-place
        # add/divide below fail for integer count encodings.
        test_enc_values = np.zeros(test_df.shape[0])
        for dev_index, val_index in kf.split(train_df):
            dev_X, val_X = fold_source.iloc[dev_index], fold_source.iloc[val_index]
            train_enc_values[val_index] = np.array(
                encode(val_X[[col]], dev_X, col, target_col))
            test_enc_values += np.array(
                encode(test_df[[col]], dev_X, col, target_col))

        test_enc_values /= n_splits
        enc_name = col + "_{}_enc_{}".format(encode_type, target_col)
        train_df[enc_name] = train_enc_values
        test_df[enc_name] = test_enc_values

    return train_df, test_df

In [7]:
# preprocess cat_vars
def _normalise_label(value):
    """Stringify, trim, turn spaces into '-' and remove every '.'."""
    return str(value).strip().replace(" ", "-").replace(".", "")


for var in cat_vars:
    # The dtype check looks at the training frame only; the same cleaning is
    # then applied to the matching test column.
    if not (df_train[var].dtypes == object):
        continue
    print(var)
    df_train[var] = df_train[var].map(_normalise_label)
    df_test[var] = df_test[var].map(_normalise_label)

Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Bed Grade
City_Code_Patient
Type of Admission
Severity of Illness
Age


In [8]:
# outlier treatment and scaling for num_vars
from utility import LegacyOutlierScaler
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Registry of available preprocessing steps, keyed by the name used in STEPS.
# NOTE(review): LegacyOutlierScaler comes from the project-local `utility`
# module; its behaviour is not visible here.
PREPROCESS = dict(
    exoutscaler=LegacyOutlierScaler(),
    stdscaler=StandardScaler(),
)
# Order in which the steps are chained into the pipeline.
STEPS = ['exoutscaler', 'stdscaler']


def preprocess(train, test, steps, features, target_col='Stay'):
    """
    Fit the preprocessing pipeline on the training features and apply it to
    both frames.

    train : training frame; must contain `features` and `target_col`
    test : test frame; must contain `features`
    steps : ordered list of step names, each a key of PREPROCESS
    features : list of columns to run through the pipeline
    target_col : target column carried over from train (default 'Stay')

    Returns (train1, test1, classic_pipe):
      train1 / test1 : transformed feature columns followed by the untouched
                       remaining columns, kept in their original order and on
                       the original row index
      classic_pipe   : Pipeline of the fitted steps, with the feature list
                       stored on its `feature_names` attribute

    Raises ValueError if a step name is not registered in PREPROCESS.
    The input frames are copied and not modified.
    """
    train = train.copy()
    test = test.copy()
    steps = list(steps)

    unknown = [name for name in steps if name not in PREPROCESS]
    if unknown:
        raise ValueError("unknown preprocessing step(s): {}".format(unknown))

    # Keep the frame's own column order. The previous set difference gave an
    # arbitrary order that could change from one kernel session to the next.
    feature_set = set(features)
    other_cols = [col for col in test.columns if col not in feature_set]
    train_extra = other_cols if target_col in other_cols else other_cols + [target_col]

    datapipe = Pipeline(steps=[(name, PREPROCESS[name]) for name in steps])

    x_dev = train[features].values

    print('fit')
    datapipe.fit(x_dev)

    print('transform dataframe using pipeline')
    print('train data:')
    # Reuse the source index so the column-wise concat lines rows up even
    # when the input frame does not have a default RangeIndex.
    train1 = pd.DataFrame(datapipe.transform(train[features].values),
                          columns=features, index=train.index)
    train1 = pd.concat([train1, train[train_extra]], axis=1)
    print('test data:')
    test1 = pd.DataFrame(datapipe.transform(test[features].values),
                         columns=features, index=test.index)
    test1 = pd.concat([test1, test[other_cols]], axis=1)

    # Create "classic" datapipe and store list of features
    classic_pipe = Pipeline([(name, datapipe.named_steps[name])
                             for name in steps])
    classic_pipe.feature_names = features

    return train1, test1, classic_pipe

In [9]:
# Run the numeric columns through the configured steps; `pipeline` holds the
# fitted steps returned by preprocess().
df_train_pre, df_test_pre, pipeline = preprocess(
    train=df_train, test=df_test, steps=STEPS, features=num_vars)

fit
transform dataframe using pipeline
train data:
test data:


In [10]:
# encode target
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the raw 'Stay' labels and store the integer codes in 'DV'.
stay_labels = df_train_pre['Stay'].values
le = LabelEncoder().fit(stay_labels)

df_train_pre['DV'] = le.transform(stay_labels)

In [11]:
# feature engineering
# mean of numeric vars by certain cat_vars
cat_vars_enc = ['Bed Grade', 'Age', 'Hospital_code', 'City_Code_Patient']
num_vars_enc = ['Available Extra Rooms in Hospital',
                'Visitors with Patient',
                'Admission_Deposit']

# One pass per numeric column: each pass adds a 3-fold mean encoding of that
# column for every grouping column in cat_vars_enc.
for target_col in num_vars_enc:
    print('Target Col: %s' % (target_col))
    df_train_pre, df_test_pre = do_target_encode(
        train_df=df_train_pre, test_df=df_test_pre,
        cols_to_encode=cat_vars_enc, target_col=target_col,
        encode_type='dv', n_splits=3)
    print('\n')

Target Col: Available Extra Rooms in Hospital


Target Col: Visitors with Patient


Target Col: Admission_Deposit




In [12]:
# ratio features

def get_ratio_cols(a, b):
    """
    Element-wise ratio a / b with a sentinel for undefined results.

    a, b : pandas Series of equal length sharing the same index
    Returns a float Series on a's index holding a / b where both values are
    non-null and b is non-zero, and -999 everywhere else.

    The result is built positionally and then labelled with a's index, so it
    assigns back to the source frame correctly even when that frame does not
    have a default RangeIndex (the earlier version built the result on a
    fresh RangeIndex and masked it with a's index).
    """
    valid = ((a.notnull()) & (b.notnull()) & (b != 0)).to_numpy(dtype=bool)
    # Float buffer from the start: no int -> float upcast when ratios are
    # written in.
    ratio = np.full(len(a), -999.0)
    ratio[valid] = (a[valid].to_numpy(dtype=float)
                    / b[valid].to_numpy(dtype=float))
    return pd.Series(ratio, index=a.index)


# Pair every scaled numeric column with each of its mean-encoded columns
# (named "<grouping column>_dv_enc_<numeric column>").
_ratio_numerators = ('Available Extra Rooms in Hospital',
                     'Visitors with Patient',
                     'Admission_Deposit')
_ratio_groupers = ('Bed Grade', 'Age', 'Hospital_code', 'City_Code_Patient')
f1_f2_lst = [(num_col, '{}_dv_enc_{}'.format(group_col, num_col))
             for num_col in _ratio_numerators
             for group_col in _ratio_groupers]

for f1, f2 in f1_f2_lst:
    print('{}_{}'.format(f1, f2))
    name = f1 + '_RATIO_' + f2

    # Same ratio feature on both splits; the label is printed first, as before.
    for split_label, frame in (('Train\n', df_train_pre), ('Test\n', df_test_pre)):
        print(split_label)
        frame[name] = get_ratio_cols(frame[f1], frame[f2])

    print('\n')

Available Extra Rooms in Hospital_Bed Grade_dv_enc_Available Extra Rooms in Hospital
Train

Test



Available Extra Rooms in Hospital_Age_dv_enc_Available Extra Rooms in Hospital
Train

Test



Available Extra Rooms in Hospital_Hospital_code_dv_enc_Available Extra Rooms in Hospital
Train

Test



Available Extra Rooms in Hospital_City_Code_Patient_dv_enc_Available Extra Rooms in Hospital
Train

Test



Visitors with Patient_Bed Grade_dv_enc_Visitors with Patient
Train

Test



Visitors with Patient_Age_dv_enc_Visitors with Patient
Train

Test



Visitors with Patient_Hospital_code_dv_enc_Visitors with Patient
Train

Test



Visitors with Patient_City_Code_Patient_dv_enc_Visitors with Patient
Train

Test



Admission_Deposit_Bed Grade_dv_enc_Admission_Deposit
Train

Test



Admission_Deposit_Age_dv_enc_Admission_Deposit
Train

Test



Admission_Deposit_Hospital_code_dv_enc_Admission_Deposit
Train

Test



Admission_Deposit_City_Code_Patient_dv_enc_Admission_Deposit
Train

Test





In [13]:
# Display (rows, columns) of both frames after the ratio features were added.
df_train_pre.shape, df_test_pre.shape

((318438, 43), (137057, 41))

In [14]:
# OHE cat_vars
# Train and test are stacked before encoding so both end up with the same
# dummy columns; the 'sample' tag is used to split them apart again.

df_train_pre['sample'] = 'train'
df_test_pre['sample'] = 'test'
cols = ['case_id', 'sample'] + cat_vars
other_cols = [col for col in list(df_test_pre.columns) if col not in cols]
tmp = pd.concat([df_train_pre[cols], df_test_pre[cols]], axis=0,
                ignore_index=True)

print('get dummies')
tmp = pd.get_dummies(tmp, prefix=cat_vars, columns=cat_vars,
                     prefix_sep='_', drop_first=True)

# Build each split as a new frame (no in-place drop on a slice), which avoids
# the SettingWithCopyWarning this cell used to emit.
train = (tmp.loc[tmp['sample'] == 'train', :]
         .drop(columns='sample')
         .reset_index(drop=True))
df_train_pre = pd.merge(df_train_pre[['case_id', 'Stay', 'DV']+other_cols], train,
                        on='case_id')

test = (tmp.loc[tmp['sample'] == 'test', :]
        .drop(columns='sample')
        .reset_index(drop=True))
df_test_pre = pd.merge(df_test_pre[['case_id']+other_cols], test, on='case_id')

# Release the intermediates.
del train, test, tmp

get dummies


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
# add prefix to all features
FEAT_PREFIX = 'JHA'
# Identifier and target columns keep their names.
_unprefixed = ('case_id', 'Stay', 'DV')
cols = list(df_test_pre.columns)
rename_dct = {
    col: col if col in _unprefixed else FEAT_PREFIX + '_' + col.replace(" ", "-")
    for col in cols
}
new_cols = list(rename_dct.values())
# The mapping is built from the test columns and applied to both frames.
for frame in (df_train_pre, df_test_pre):
    frame.rename(columns=rename_dct, inplace=True)

In [16]:
# Remove the prefixed patient id column from both frames, in place.
for frame in (df_train_pre, df_test_pre):
    frame.drop(columns='JHA_patientid', inplace=True)

In [17]:
# Display the training frame's column index after renaming and the drop above.
df_train_pre.columns

Index(['case_id', 'Stay', 'DV', 'JHA_Available-Extra-Rooms-in-Hospital',
       'JHA_Visitors-with-Patient', 'JHA_Admission_Deposit',
       'JHA_Bed-Grade_dv_enc_Available-Extra-Rooms-in-Hospital',
       'JHA_Age_dv_enc_Available-Extra-Rooms-in-Hospital',
       'JHA_Hospital_code_dv_enc_Available-Extra-Rooms-in-Hospital',
       'JHA_City_Code_Patient_dv_enc_Available-Extra-Rooms-in-Hospital',
       ...
       'JHA_Severity-of-Illness_Moderate', 'JHA_Age_11-20', 'JHA_Age_21-30',
       'JHA_Age_31-40', 'JHA_Age_41-50', 'JHA_Age_51-60', 'JHA_Age_61-70',
       'JHA_Age_71-80', 'JHA_Age_81-90', 'JHA_Age_91-100'],
      dtype='object', length=147)

In [18]:
# ratio of num visitors and admission deposit
_visitor_ratio_pairs = [
    ('JHA_Visitors-with-Patient', 'JHA_Admission_Deposit'),
    ('JHA_Visitors-with-Patient', 'JHA_Available-Extra-Rooms-in-Hospital'),
]
for f1, f2 in _visitor_ratio_pairs:
    print('{}_{}'.format(f1, f2))
    name = f1 + '_RATIO_' + f2

    # Same ratio feature on both splits; the label is printed first, as before.
    for split_label, frame in (('Train\n', df_train_pre), ('Test\n', df_test_pre)):
        print(split_label)
        frame[name] = get_ratio_cols(frame[f1], frame[f2])

    print('\n')

JHA_Visitors-with-Patient_JHA_Admission_Deposit
Train

Test



JHA_Visitors-with-Patient_JHA_Available-Extra-Rooms-in-Hospital
Train

Test





In [19]:
# Model inputs: every column carrying the feature prefix, taken from the
# training frame's column order; the label is the encoded 'DV' column.
feat_cols = list(filter(lambda column: column.startswith(FEAT_PREFIX),
                        list(df_train_pre.columns)))
print(len(feat_cols))
x_train, x_test = df_train_pre[feat_cols], df_test_pre[feat_cols]
y_train = df_train_pre['DV']

146


In [21]:
# feature selection

from sklearn.svm import SVC
from sklearn.feature_selection import RFECV

# Recursive feature elimination with 3-fold CV around a linear-kernel SVC,
# configured with step=5 and min_features_to_select=60.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the fit was stopped before it finished - confirm whether this cell should
# stay in the notebook.
svc = SVC(kernel="linear", C=1)
rfe = RFECV(estimator=svc, min_features_to_select=60, step=5, verbose=1, n_jobs=-1, cv=3)
%time rfe.fit(x_train, y_train)

KeyboardInterrupt: 

In [47]:
# modelling
import xgboost as xgb
import lightgbm as lgb
from sklearn import metrics
import operator
from catboost import Pool, CatBoostClassifier


def create_feature_map(features, fmap_path='../model/xgb.fmap'):
    """
    Write a feature-map file with one "<index>\\t<name>\\tq" line per feature.

    features : iterable of feature names; the line index is the position in
               this iterable
    fmap_path : destination file; defaults to the path this notebook has
                always used, so existing callers are unaffected

    The file is opened in a `with` block so the handle is closed even if a
    write fails.
    """
    with open(fmap_path, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def runXGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    """
    Train an XGBoost multi-class model (objective multi:softmax, 11 classes)
    and predict.

    train_X, train_y : training features and labels
    test_X : features to predict; used as the early-stopping watch set when
             test_y is given
    test_y : optional labels for test_X; enables early stopping (100 rounds)
             and the accuracy computation
    test_X2 : optional second feature set to predict with the same model
    feature_names : if given, a feature map and model dump are written under
                    ../model/ and normalised f-scores to imp_feat.txt
    seed_val, rounds, dep, eta : seed, boosting rounds, max depth, learning rate

    Returns (pred_test_y, loss, pred_test_y2), where loss is the accuracy on
    test_X when test_y is given and 0 otherwise, and pred_test_y2 is None
    when test_X2 is not given.
    """
    booster_params = {
        "objective": "multi:softmax",
        "num_class": 11,
        "nthread": 8,
        "eval_metric": "merror",
        "eta": eta,
        "subsample": 0.7,
        "min_child_weight": 1,
        "colsample_bytree": 0.7,
        "max_depth": dep,
        "silent": 1,
        "seed": seed_val,
    }
    param_items = list(booster_params.items())
    dtrain = xgb.DMatrix(train_X, label=train_y)
    has_labels = test_y is not None

    if has_labels:
        dtest = xgb.DMatrix(test_X, label=test_y)
        model = xgb.train(param_items, dtrain, rounds,
                          [(dtrain, 'train'), (dtest, 'test')],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        dtest = xgb.DMatrix(test_X)
        model = xgb.train(param_items, dtrain, rounds)

    if feature_names is not None:
        # Dump the model and a normalised split-count importance table.
        create_feature_map(feature_names)
        model.dump_model('../model/xgbmodel.txt', '../model/xgb.fmap',
                         with_stats=True)
        scores = model.get_fscore(fmap='../model/xgb.fmap')
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        imp_df = pd.DataFrame(ranked, columns=['feature', 'fscore'])
        imp_df['fscore'] = imp_df['fscore'] / imp_df['fscore'].sum()
        imp_df.to_csv("imp_feat.txt", index=False)

    pred_test_y = model.predict(dtest, ntree_limit=model.best_ntree_limit)

    pred_test_y2 = None
    if test_X2 is not None:
        pred_test_y2 = model.predict(xgb.DMatrix(test_X2),
                                     ntree_limit=model.best_ntree_limit)

    loss = metrics.accuracy_score(test_y, pred_test_y) if has_labels else 0

    return pred_test_y, loss, pred_test_y2


def runLGB(train_X, train_y, test_X, test_y=None, test_X2=None,
           feature_names=None, seed_val=0, rounds=500, dep=8, eta=0.05):
    """
    Train a LightGBM multi-class model (11 classes) and predict class labels.

    train_X, train_y : training features and labels
    test_X : features to predict; used as the validation set when test_y is
             given
    test_y : optional labels for test_X; enables early stopping (100 rounds)
             and the accuracy computation
    test_X2 : optional second feature set to predict with the same model
    feature_names : accepted for signature parity with runXGB; not used here
    seed_val, rounds, dep, eta : seed, boosting rounds, max depth, learning rate

    Returns (pred_test_y, loss, pred_test_y2): predictions are the argmax over
    the per-class probabilities, loss is the accuracy on test_X when test_y is
    given and 0 otherwise, and pred_test_y2 is None when test_X2 is not given.
    """
    params = {}
    params["objective"] = "multiclass"
    params["num_class"] = 11
    params['metric'] = "multi_error"
    params['seed'] = seed_val
    params["max_depth"] = dep
    params["num_leaves"] = 40
    params["min_data_in_leaf"] = 10
    params["learning_rate"] = eta
    params["bagging_fraction"] = 0.7
    params["feature_fraction"] = 0.7
    params["bagging_freq"] = 5
    params["bagging_seed"] = seed_val
    params["verbosity"] = 0
    num_rounds = rounds

    lgtrain = lgb.Dataset(train_X, label=train_y)

    if test_y is not None:
        lgtest = lgb.Dataset(test_X, label=test_y)
        model = lgb.train(params, lgtrain, num_rounds, valid_sets=[lgtest],
                          early_stopping_rounds=100, verbose_eval=20)
    else:
        # No validation labels: train for the full number of rounds. The
        # former `lgb.DMatrix(test_X)` call here raised AttributeError
        # (DMatrix is an XGBoost class) and its result was never used,
        # because prediction below takes the raw feature matrix.
        model = lgb.train(params, lgtrain, num_rounds)

    pred_test_y = model.predict(test_X,
                                num_iteration=model.best_iteration)
    pred_test_y = pred_test_y.argmax(axis=1)

    if test_X2 is not None:
        pred_test_y2 = model.predict(test_X2,
                                     num_iteration=model.best_iteration)
        pred_test_y2 = pred_test_y2.argmax(axis=1)
    else:
        pred_test_y2 = None

    loss = 0
    if test_y is not None:
        loss = metrics.accuracy_score(test_y, pred_test_y)

    return pred_test_y, loss, pred_test_y2

In [48]:
# Model building

def trainModel(train_X, train_y, test_X, n_splits, model_name, feats,
               **params):
    """
    K-fold cross-validated training with out-of-fold and test predictions.

    train_X : training feature DataFrame
    train_y : training labels (pandas Series or array-like), row-aligned
              with train_X
    test_X : feature DataFrame predicted by every fold's model
    n_splits : number of KFold splits (shuffled, random_state=2020)
    model_name : "XGB" (runXGB) or "LGB" (runLGB)
    feats : feature names, forwarded to runXGB as feature_names
    params : must provide 'rounds', 'depth' and 'eta'

    Returns (pred_val_full, acc, pred_test_full, cv_scores):
      pred_val_full  : out-of-fold predicted class per training row
      acc            : accuracy of pred_val_full against train_y
      pred_test_full : list with one test prediction array per fold
                       (not averaged)
      cv_scores      : per-fold validation accuracy

    Raises ValueError for an unknown model_name; previously this surfaced
    later as an unbound-variable error.
    """
    if model_name not in ("XGB", "LGB"):
        raise ValueError(
            "model_name must be 'XGB' or 'LGB', got {!r}".format(model_name))

    def _take_rows(values, positions):
        # KFold yields positional indices; use .iloc when available so labels
        # are selected by position even if the Series index is not 0..n-1.
        return values.iloc[positions] if hasattr(values, "iloc") else values[positions]

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=2020)
    cv_scores = []
    pred_test_full = []
    pred_val_full = np.zeros(train_X.shape[0])
    for dev_index, val_index in kf.split(train_X):
        dev_X, val_X = train_X.iloc[dev_index, :], train_X.iloc[val_index, :]
        dev_y, val_y = _take_rows(train_y, dev_index), _take_rows(train_y, val_index)

        if model_name == "XGB":
            pred_val, acc, pred_test = runXGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'], feature_names=feats)
        else:
            pred_val, acc, pred_test = runLGB(
             dev_X, dev_y, val_X, val_y, test_X, rounds=params['rounds'],
             dep=params['depth'], eta=params['eta'])

        cv_scores.append(acc)
        pred_val_full[val_index] = pred_val
        if pred_test is not None:
            pred_test_full.append(pred_test)

    # Overall accuracy of the stitched out-of-fold predictions.
    acc = metrics.accuracy_score(train_y, pred_val_full)
    return pred_val_full, acc, pred_test_full, cv_scores

In [51]:
# LGB
# 3-fold cross-validated LightGBM run; 'rounds', 'depth' and 'eta' are the
# keys trainModel reads from **params.
# NOTE(review): the recorded output below ends in a KeyboardInterrupt, so no
# results from this run are stored in the notebook.
params = {'rounds': 600, 'depth': 7, 'eta': 0.05}
%time pred_val_full, acc, pred_test_full, cv_scores = trainModel(x_train, y_train, x_test, 3, "LGB", feat_cols, **params)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
Training until validation scores don't improve for 100 rounds
[20]	valid_0's multi_error: 0.586268
[40]	valid_0's multi_error: 0.5807
[60]	valid_0's multi_error: 0.579381
[80]	valid_0's multi_error: 0.577733
[100]	valid_0's multi_error: 0.577309
[120]	valid_0's multi_error: 0.576819
[140]	valid_0's multi_error: 0.576131
[160]	valid_0's multi_error: 0.576536


KeyboardInterrupt: 