In [1]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from sklearn.svm import LinearSVC
pd.set_option('display.max_columns', 100)
import os
import seaborn as sns
sns.set()
import numpy as np
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = 8, 5
plt.style.use("fivethirtyeight")
for dirname, _, filenames in os.walk('HealthAnalytics2'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the training split, the test split and the sample submission file,
# in that order, into their three notebook-level DataFrames.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Data/train.csv',
        'Data/test.csv',
        'Data/sample_submission_lfbv3c3.csv',
    )
)

In [3]:
# Training data: shape and a preview of the first rows.
# Only the LAST bare expression of a cell is rendered, so the training preview
# must go through display() explicitly; as a bare mid-cell expression it was
# silently discarded. display() is provided by the IPython/Jupyter kernel
# without an import.
print('Training data shape: ', train_df.shape)
display(train_df.head(5))

# Test data: shape and preview (last expression, rendered automatically).
print('Test data shape: ', test_df.shape)
test_df.head(5)

Training data shape:  (318438, 18)
Test data shape:  (137057, 17)


Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0


In [4]:
# Null values and data types.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line after
# each report.
print('Train Set')
train_df.info()
print('-------------')
print('Test Set')
test_df.info()

Train Set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 318438 entries, 0 to 318437
Data columns (total 18 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   case_id                            318438 non-null  int64  
 1   Hospital_code                      318438 non-null  int64  
 2   Hospital_type_code                 318438 non-null  object 
 3   City_Code_Hospital                 318438 non-null  int64  
 4   Hospital_region_code               318438 non-null  object 
 5   Available Extra Rooms in Hospital  318438 non-null  int64  
 6   Department                         318438 non-null  object 
 7   Ward_Type                          318438 non-null  object 
 8   Ward_Facility_Code                 318438 non-null  object 
 9   Bed Grade                          318325 non-null  float64
 10  patientid                          318438 non-null  int64  
 11  City_Code_Patient            

In [5]:
# Missing-value counts per column before imputation.
print(train_df.isnull().sum())
print(test_df.isnull().sum())

# Fill the two incomplete columns with their most frequent value.
# * The result is assigned back to the column instead of calling
#   `df[col].fillna(..., inplace=True)`: that form operates on an intermediate
#   Series, is deprecated in recent pandas, and does not modify the frame
#   at all once Copy-on-Write is enabled.
# * The fill value is taken from the training split and reused for the test
#   split, so both splits are imputed with the same value and nothing is
#   estimated from the test data.
for col in ('Bed Grade', 'City_Code_Patient'):
    fill_value = train_df[col].mode()[0]
    train_df[col] = train_df[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)

# Missing-value counts per column after imputation.
print(train_df.isnull().sum())
print(test_df.isnull().sum())

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64
case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms

In [6]:
# Number of distinct patient ids in each split.
n_train_patients = train_df['patientid'].nunique()
n_test_patients = test_df['patientid'].nunique()
print("Total Patients in Train set: ", n_train_patients)
print("Total Patients in Test set: ", n_test_patients)

Total Patients in Train set:  92017
Total Patients in Test set:  39607


In [7]:
# Column labels of both splits, training split first.
for frame in (train_df, test_df):
    print(frame.columns)

Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit', 'Stay'],
      dtype='object')
Index(['case_id', 'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital',
       'Hospital_region_code', 'Available Extra Rooms in Hospital',
       'Department', 'Ward_Type', 'Ward_Facility_Code', 'Bed Grade',
       'patientid', 'City_Code_Patient', 'Type of Admission',
       'Severity of Illness', 'Visitors with Patient', 'Age',
       'Admission_Deposit'],
      dtype='object')


In [8]:
def _print_unique_values(df, dataset_name):
    """Print the distinct-value count and the distinct values of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.
    dataset_name : str
        Label inserted into each printed line to identify the frame.
    """
    for col in df.columns:
        print('Number of unique values of {col} column in {name} dataset are {n} '.format(
            col=col, name=dataset_name, n=df[col].nunique()))
        print('The unique values of {col} column in {name} dataset are {values} '.format(
            col=col, name=dataset_name, values=df[col].unique()))


_print_unique_values(train_df, 'train_df')
# The test-split lines are labelled 'test_df'; the original loop printed the
# test values under the 'train_df' label.
_print_unique_values(test_df, 'test_df')

# Relative frequency of each category for three training columns.
print(train_df['Type of Admission'].value_counts(normalize=True))
print(train_df['Severity of Illness'].value_counts(normalize=True))
print(train_df['Department'].value_counts(normalize=True))

Number of unique values of case_id column in train_df dataset are 318438 
The unique values of case_id column in train_df dataset are [     1      2      3 ... 318436 318437 318438] 
Number of unique values of Hospital_code column in train_df dataset are 32 
The unique values of Hospital_code column in train_df dataset are [ 8  2 10 26 23 32  1 22 16  9  6 29 12  3 21 28 27 19  5 14 13 31 24 17
 25 15 11 30 18  4  7 20] 
Number of unique values of Hospital_type_code column in train_df dataset are 7 
The unique values of Hospital_type_code column in train_df dataset are ['c' 'e' 'b' 'a' 'f' 'd' 'g'] 
Number of unique values of City_Code_Hospital column in train_df dataset are 11 
The unique values of City_Code_Hospital column in train_df dataset are [ 3  5  1  2  6  9 10  4 11  7 13] 
Number of unique values of Hospital_region_code column in train_df dataset are 3 
The unique values of Hospital_region_code column in train_df dataset are ['Z' 'X' 'Y'] 
Number of unique values of Availabl

In [9]:
# Column dtypes of both splits, training split first.
for frame in (train_df, test_df):
    print(frame.dtypes)


case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                      int64
Stay                                  object
dtype: object
case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     in

In [10]:
# Target class distribution: absolute counts first, then relative
# frequencies, followed by the distinct labels and how many there are.
stay_labels = train_df['Stay']
for as_fraction in (False, True):
    print(stay_labels.value_counts(normalize=as_fraction))
print(stay_labels.unique())
print(stay_labels.nunique())

# Replace the string labels of the target with integer codes. The fitted
# encoder stays available as `le`.
le = LabelEncoder()
le.fit(train_df['Stay'])
train_df['Stay'] = le.transform(train_df['Stay'])

21-30                 87491
20-Nov                78139
31-40                 55159
51-60                 35018
0-10                  23604
41-50                 11743
71-80                 10254
More than 100 Days     6683
81-90                  4838
91-100                 2765
61-70                  2744
Name: Stay, dtype: int64
21-30                 0.274751
20-Nov                0.245382
31-40                 0.173217
51-60                 0.109968
0-10                  0.074124
41-50                 0.036877
71-80                 0.032201
More than 100 Days    0.020987
81-90                 0.015193
91-100                0.008683
61-70                 0.008617
Name: Stay, dtype: float64
['0-10' '41-50' '31-40' '20-Nov' '51-60' '21-30' '71-80'
 'More than 100 Days' '81-90' '61-70' '91-100']
11


In [11]:
# Hand-written category -> code lookup tables. None of them is applied in
# this cell (the columns are encoded with LabelEncoder below); they are kept
# as notebook-level names.
hospital_type_map={
 'a': 0,
 'b': 1,
 'c': 2,
 'e': 3,
 'd': 4,
 'f': 5,
 'g': 6}

hospital_region_map = {'X': 0, 'Y': 1, 'Z': 2}

dep_map={'gynecology': 0,
 'anesthesia': 1,
 'radiotherapy': 2,
 'TB & Chest disease': 3,
 'surgery': 4}

ward_type_map ={'R': 0, 'Q': 1, 'S': 2, 'P': 3, 'T': 4, 'U': 5}
ward_fac_map ={'F':0, 'E': 1, 'D':2, 'C': 3, 'B': 4, 'A': 5}
admiss_map = {'Trauma': 0, 'Emergency': 1, 'Urgent': 2}
# Key is 'Moderate' without the trailing space the original had: with the
# space the key could never match the data value 'Moderate'.
Severity_of_Illness={'Minor': 1,'Moderate': 2,'Extreme': 3}

# Age bracket -> bracket midpoint.
Age = {'0-10': 5,
       '11-20': 15,
       '21-30': 25,
       '31-40': 35,
       '41-50': 45,
       '51-60': 55,
       '61-70': 65,
       '71-80': 75,
       '81-90': 85,
       '91-100': 95
        }

# NOTE(review): the printed target labels contain '20-Nov' rather than
# '11-20' -- confirm the raw label before applying this map.
stay_map = {'21-30': 2,
 '11-20': 1,
 '31-40': 3,
 '51-60': 5,
 '0-10': 0,
 '41-50': 4,
 '71-80': 7,
 'More than 100 Days': 10,
 '81-90': 8,
 '91-100': 9,
 '61-70': 6}

# String-valued feature columns to convert to integer codes.
cols = ['Hospital_type_code',
       'Hospital_region_code',
       'Department', 'Ward_Type', 'Ward_Facility_Code',
        'Type of Admission',
       'Severity of Illness','Age'
       ]

# One encoder per column, fitted on the values of BOTH splits and then applied
# to each split. Fitting a separate encoder on train and on test (as before)
# yields different codes for the same category whenever the two splits do not
# contain exactly the same set of values.
for col in cols:
    # Columns that are no longer strings were encoded by a previous run of
    # this cell; skip them so the cell can be re-executed.
    if train_df[col].dtype != object:
        continue
    print(col)
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train_df[col], test_df[col]], ignore_index=True))
    train_df[col] = lbl.transform(train_df[col])
    test_df[col] = lbl.transform(test_df[col])

Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Type of Admission
Severity of Illness
Age
Hospital_type_code
Hospital_region_code
Department
Ward_Type
Ward_Facility_Code
Type of Admission
Severity of Illness
Age


In [12]:
# Feature columns under consideration:
#   'Hospital_code', 'Hospital_type_code', 'City_Code_Hospital', 'Hospital_region_code',
#   'Available Extra Rooms in Hospital', 'Department', 'Ward_Type', 'Ward_Facility_Code',
#   'Bed Grade', 'City_Code_Patient', 'Type of Admission', 'Severity of Illness',
#   'Visitors with Patient', 'Age'.

# Number of case rows recorded for each patient id in the training split.
Encoding = train_df['case_id'].groupby(train_df['patientid']).count()
print(Encoding)

# Per-patient features sketched here but currently disabled:
# train_df['total_visits'] = train_df['patientid'].map(Encoding)
# Encoding = train_df.groupby('patientid')['Visitors with Patient'].sum()
# train_df['total_visitors'] = train_df['patientid'].map(Encoding)
# Encoding = train_df.groupby('patientid')['Visitors with Patient'].mean()
# train_df['avg_visitors'] = train_df['patientid'].map(Encoding)
# print(train_df['total_visits'])



patientid
1         4
2         2
4         2
6         1
7         4
         ..
131620    9
131621    3
131622    4
131623    2
131624    3
Name: case_id, Length: 92017, dtype: int64


In [13]:
# Features / target. The target and the two identifier columns are dropped
# from the training features; the identifiers are dropped from the test set.
X = train_df.drop(['Stay','case_id', 'patientid'],axis = 1)
Y = train_df['Stay']
# Unscaled test features. Kept under their own name so that every fold scales
# from the raw values (see the loop below).
test_X_raw = test_df.drop([ 'case_id', 'patientid'],axis = 1)

N_SPLITS = 10
fold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2020)
# Per-fold validation ACCURACY (the historical name `err` is kept).
err = []

for train_index, test_index in fold.split(X, Y):
    x_train, x_val = X.iloc[train_index], X.iloc[test_index]
    y_train, y_val = Y.iloc[train_index], Y.iloc[test_index]

    # The scaler is fitted on the training part of the fold only and then
    # applied to the validation part and to the test features.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_val = scaler.transform(x_val)
    # Always transform the RAW test features. The original reassigned
    # `test_X = scaler.transform(test_X)`, so from the second fold on it
    # re-scaled already-scaled data. After the loop `test_X` holds the test
    # features scaled with the last fold's scaler, matching the last model `m`.
    test_X = scaler.transform(test_X_raw)

    m = LGBMClassifier()
    m.fit(x_train, y_train)
    pred_y = m.predict(x_val)
    err.append(accuracy_score(y_val,pred_y))

print(err)
# Mean accuracy over however many folds actually ran (no hard-coded 10).
print(sum(err)/len(err))

[0.4265481723401583, 0.42218314282125363, 0.42281120462253485, 0.42582590126868486, 0.42623414143951766, 0.4278671021228489, 0.4227798015324708, 0.42840095465393796, 0.4260591024715008, 0.4239236252865622]
0.425263314855947
