In [1]:
import pandas as pd
import numpy as np
import os, sys, joblib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# GLOBALS
# The project root is taken from the JANATAHACK_ROOT environment variable so
# the notebook can run on machines other than the author's; the original
# local path is kept as the fallback, so existing setups behave as before.
LOCAL_ROOT = os.environ.get(
    'JANATAHACK_ROOT',
    '/Users/nathvaru/Documents/personal/AV/janatahack_healthcare_analytics_II/',
)
DATA_DIR = os.path.join(LOCAL_ROOT, 'data')
# Path components are passed separately so os.path.join chooses the separator.
TRAIN_FN = os.path.join(DATA_DIR, 'Train_hMYJ020', 'train.csv')
TEST_FN = os.path.join(DATA_DIR, 'test.csv')
SUBMISSION_FN = os.path.join(DATA_DIR, 'sample_submission_lfbv3c3.csv')

In [3]:
# Load the training and test CSV files into DataFrames.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_FN)
df_test = pd.read_csv(filepath_or_buffer=TEST_FN)

In [4]:
# Print the (rows, columns) shape of each split, then preview the first
# training rows.
for frame in (df_train, df_test):
    print(frame.shape)
print(df_train.head())

(318438, 18)
(137057, 17)
   case_id  Hospital_code Hospital_type_code  City_Code_Hospital  \
0        1              8                  c                   3   
1        2              2                  c                   5   
2        3             10                  e                   1   
3        4             26                  b                   2   
4        5             26                  b                   2   

  Hospital_region_code  Available Extra Rooms in Hospital    Department  \
0                    Z                                  3  radiotherapy   
1                    Z                                  2  radiotherapy   
2                    X                                  2    anesthesia   
3                    Y                                  2  radiotherapy   
4                    Y                                  2  radiotherapy   

  Ward_Type Ward_Facility_Code  Bed Grade  patientid  City_Code_Patient  \
0         R                  F        2

In [5]:
# Sanity check: the number of distinct case_id values must equal the row count.
assert len(df_train) == df_train['case_id'].nunique()

In [15]:
# Target overview: number of distinct Stay classes, then each class's share
# of all training rows expressed as a percentage.
stay = df_train['Stay']
print('# classes: ', stay.nunique())
stay.value_counts().mul(100.).div(len(df_train))

# classes:  11


21-30                 27.475050
11-20                 24.538215
31-40                 17.321739
51-60                 10.996803
0-10                   7.412432
41-50                  3.687688
71-80                  3.220093
More than 100 Days     2.098682
81-90                  1.519291
91-100                 0.868301
61-70                  0.861706
Name: Stay, dtype: float64

In [7]:
# Percentage of missing values in each training column.
df_train.isna().mean().mul(100.)

case_id                              0.000000
Hospital_code                        0.000000
Hospital_type_code                   0.000000
City_Code_Hospital                   0.000000
Hospital_region_code                 0.000000
Available Extra Rooms in Hospital    0.000000
Department                           0.000000
Ward_Type                            0.000000
Ward_Facility_Code                   0.000000
Bed Grade                            0.035486
patientid                            0.000000
City_Code_Patient                    1.423197
Type of Admission                    0.000000
Severity of Illness                  0.000000
Visitors with Patient                0.000000
Age                                  0.000000
Admission_Deposit                    0.000000
Stay                                 0.000000
dtype: float64

### Bed Grade and City_Code_Patient have missing values (about 0.04% and 1.42% of rows, respectively)

In [8]:
# Columns explored as categorical features (crosstabbed against Stay below).
cat_vars = [
    'Hospital_code',
    'Hospital_type_code',
    'City_Code_Hospital',
    'Hospital_region_code',
    'Department',
    'Ward_Type',
    'Ward_Facility_Code',
    'Bed Grade',
    'patientid',
    'City_Code_Patient',
    'Type of Admission',
    'Severity of Illness',
    'Age',
]
# Columns explored as numeric features (summarised with describe() below).
num_vars = [
    'Available Extra Rooms in Hospital',
    'Visitors with Patient',
    'Admission_Deposit',
]

In [9]:
# For every numeric feature, print its cardinality and describe() summary.
for var in num_vars:
    column = df_train[var]
    print('Variable: ', var)
    print('# unique values: ', column.nunique())
    print('Summary Stats:')
    print(column.describe())
    print('\n')

Variable:  Available Extra Rooms in Hospital
# unique values:  18
Summary Stats:
count    318438.000000
mean          3.197627
std           1.168171
min           0.000000
25%           2.000000
50%           3.000000
75%           4.000000
max          24.000000
Name: Available Extra Rooms in Hospital, dtype: float64


Variable:  Visitors with Patient
# unique values:  28
Summary Stats:
count    318438.000000
mean          3.284099
std           1.764061
min           0.000000
25%           2.000000
50%           3.000000
75%           4.000000
max          32.000000
Name: Visitors with Patient, dtype: float64


Variable:  Admission_Deposit
# unique values:  7300
Summary Stats:
count    318438.000000
mean       4880.749392
std        1086.776254
min        1800.000000
25%        4186.000000
50%        4741.000000
75%        5409.000000
max       11008.000000
Name: Admission_Deposit, dtype: float64




In [11]:
# For every categorical feature, print its cardinality and the distribution of
# Stay within each level (normalize='index' makes each row sum to 1).
target = df_train['Stay']
for var in cat_vars:
    feature = df_train[var]
    print('Variable: ', var)
    print('# unique values: ', feature.nunique())
    print('DV crosstab: ')
    tab = pd.crosstab(feature, target, normalize='index')
    print(tab)
    print('\n')

Variable:  Hospital_code
# unique values:  32
DV crosstab: 
Stay               0-10     11-20     21-30     31-40     41-50     51-60  \
Hospital_code                                                               
1              0.070299  0.202324  0.254334  0.178510  0.040960  0.136407   
2              0.053704  0.190122  0.190514  0.187181  0.029988  0.184829   
3              0.111720  0.275998  0.280073  0.156970  0.032040  0.080663   
4              0.161290  0.327419  0.298387  0.110484  0.025000  0.050806   
5              0.063106  0.270481  0.296141  0.176582  0.028702  0.104353   
6              0.055080  0.275496  0.294443  0.165288  0.026536  0.108152   
7              0.154671  0.359877  0.249617  0.128637  0.022971  0.055896   
8              0.059241  0.247611  0.346710  0.165711  0.035217  0.082446   
9              0.066464  0.239791  0.265421  0.176195  0.028584  0.137098   
10             0.077160  0.282353  0.290832  0.156969  0.033492  0.089136   
11             0

Stay                    0-10     11-20     21-30     31-40     41-50  \
Ward_Facility_Code                                                     
A                   0.085358  0.219845  0.267899  0.179280  0.042930   
B                   0.074155  0.223973  0.245449  0.182643  0.031005   
C                   0.110566  0.340298  0.221837  0.157657  0.016778   
D                   0.075643  0.190006  0.291436  0.179544  0.064680   
E                   0.062980  0.272461  0.295514  0.166483  0.030785   
F                   0.064646  0.240677  0.284365  0.174071  0.033746   

Stay                   51-60     61-70     71-80     81-90    91-100  \
Ward_Facility_Code                                                     
A                   0.113058  0.009138  0.034652  0.014621  0.010428   
B                   0.143901  0.007111  0.039282  0.021845  0.009643   
C                   0.090940  0.003891  0.020331  0.011279  0.004399   
D                   0.091471  0.015364  0.037966  0.014901  0.0

Stay        0-10     11-20     21-30     31-40     41-50     51-60     61-70  \
Age                                                                            
0-10    0.098337  0.313240  0.238088  0.162136  0.029901  0.093060  0.004157   
11-20   0.092557  0.318643  0.257156  0.159888  0.030415  0.085222  0.005308   
21-30   0.084886  0.275984  0.278971  0.169233  0.034229  0.092868  0.006439   
31-40   0.077248  0.248150  0.291488  0.171467  0.037288  0.102406  0.007998   
41-50   0.074150  0.234655  0.280883  0.172285  0.039326  0.112770  0.008816   
51-60   0.070639  0.233871  0.269159  0.176629  0.035763  0.118296  0.009234   
61-70   0.065129  0.233621  0.268145  0.176032  0.035770  0.121145  0.009648   
71-80   0.061494  0.222340  0.266372  0.179370  0.038640  0.123854  0.010561   
81-90   0.053485  0.176426  0.243346  0.190621  0.048035  0.137136  0.014575   
91-100  0.063748  0.190476  0.226575  0.179724  0.050691  0.132873  0.022273   

Stay       71-80     81-90    91-100  M

### patientid has a very large number of distinct values, so it needs to be encoded. Candidate encodings:
1. number of cases per patient
2. number of times a patient stayed in each of the Stay buckets
3. mean number of times by stay buckets

### Ideas for the baseline model
1. Ignore patientid as there's no intersection between train and test
2. OHE each of the cat_vars
3. Impute missing values in Bed Grade and City_Code_Patient with separate category
4. Outlier treatment and scaling for num_vars
5. encode Stay with LabelEncoder