In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics

## Loading Data

In [2]:
#Load train data
train_data = pd.read_csv('data/train_data.csv')
train_data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


In [4]:
train_data['Age'].value_counts()

41-50     63749
31-40     63639
51-60     48514
21-30     40843
71-80     35792
61-70     33687
11-20     16768
81-90      7890
0-10       6254
91-100     1302
Name: Age, dtype: int64

## Basic Data Cleaning

In [3]:
#Data Types of Different Columns
train_data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [4]:
#Searching for null values
train_data.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [5]:
#Fill null values with 0's
train_data['Bed Grade'] = train_data['Bed Grade'].fillna(0)
train_data['City_Code_Patient'] = train_data['City_Code_Patient'].fillna(0)

In [6]:
#Confirmation that there are no more null values
train_data.isna().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [7]:
#Dropping irrelevant columns
train_data.drop(['case_id', 'patientid'], axis=1, inplace=True)

In [8]:
#Exploring the Stay Column
train_data['Stay'].value_counts()

21-30                 87491
11-20                 78139
31-40                 55159
51-60                 35018
0-10                  23604
41-50                 11743
71-80                 10254
More than 100 Days     6683
81-90                  4838
91-100                 2765
61-70                  2744
Name: Stay, dtype: int64

In [9]:
#Replace the more than 100 days category to something more uniform
train_data['Stay'].replace('More than 100 Days', '100+', inplace=True)

In [10]:
train_data['Stay'].value_counts()

21-30     87491
11-20     78139
31-40     55159
51-60     35018
0-10      23604
41-50     11743
71-80     10254
100+       6683
81-90      4838
91-100     2765
61-70      2744
Name: Stay, dtype: int64

## Feature Engineering

In [11]:
#Divide columns into categorical and numerical
x_categorical_columns = []
y_column = []
x_numerical_columns = []

for column in train_data.columns:
    if train_data[column].dtypes == 'object':
        x_categorical_columns.append(column)
        
for column in train_data.columns:
    if train_data[column].dtypes != 'object':
        x_numerical_columns.append(column)
        
print(x_categorical_columns)
print(x_numerical_columns)

['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
['Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'City_Code_Patient', 'Visitors with Patient', 'Admission_Deposit']


In [12]:
#Some columns in the numerical columns are list are in reality more like categorical variables
x_categorical_columns.append('Bed Grade')
x_categorical_columns.append('Hospital_code')
x_categorical_columns.append('City_Code_Hospital')
x_categorical_columns.append('City_Code_Patient')

In [13]:
#remove "Stay" from x categorical because it is a y value
x_categorical_columns.remove('Stay')
x_categorical_columns

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Bed Grade',
 'Hospital_code',
 'City_Code_Hospital',
 'City_Code_Patient']

In [14]:
#Remove the above columns from the numerical columns list
x_numerical_columns.remove('Bed Grade')
x_numerical_columns.remove('Hospital_code')
x_numerical_columns.remove('City_Code_Hospital')
x_numerical_columns.remove("City_Code_Patient")
x_numerical_columns

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [15]:
#set y_column
y_column.append('Stay')
y_column

['Stay']

In [16]:
x_categorical_columns

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Bed Grade',
 'Hospital_code',
 'City_Code_Hospital',
 'City_Code_Patient']

In [17]:
from sklearn.preprocessing import LabelEncoder

x_label_encoder = {}
y_label_encoder = LabelEncoder()



for column in x_categorical_columns:
    x_label_encoder[column] = LabelEncoder()
    train_data[column] = x_label_encoder[column].fit_transform(train_data[column])
    
for column in y_column:
    train_data[column] = y_label_encoder.fit_transform(train_data[column])
    


In [None]:
#y_label_encoder.inverse_transform(predictions)

In [26]:
train_data[x_categorical_columns]

Unnamed: 0,Hospital_type_code,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Severity of Illness,Age,Bed Grade,Hospital_code,City_Code_Hospital,City_Code_Patient
0,2,2,3,2,5,0,0,5,2,7,2,7
1,2,2,3,3,5,1,0,5,2,1,4,7
2,4,0,1,3,4,1,0,5,2,9,0,7
3,1,1,3,2,3,1,0,5,2,25,1,7
4,1,1,3,3,3,1,0,5,2,25,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...
318433,0,0,3,1,5,0,2,4,4,5,5,22
318434,0,0,1,1,4,2,2,8,4,23,0,8
318435,0,0,2,2,5,0,1,7,4,6,3,10
318436,1,1,1,1,3,1,1,1,3,10,1,8


In [27]:
#Scaling of numerical columns
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

train_data[x_numerical_columns] = scaler.fit_transform(train_data[x_numerical_columns].values)

In [28]:
train_data.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,7,2,2,2,-0.169177,3,2,5,2,7,0,0,-0.727923,5,0.027835,0
1,1,2,4,2,-1.025217,3,3,5,2,7,1,0,-0.727923,5,0.987556,5
2,9,4,0,0,-1.025217,1,3,4,2,7,1,0,-0.727923,5,-0.12491,4
3,25,1,1,1,-1.025217,3,2,3,2,7,1,0,-0.727923,5,2.200319,5
4,25,1,1,1,-1.025217,3,3,3,2,7,1,0,-0.727923,5,0.623175,5


In [29]:
#Declaring x and y variables
y = train_data['Stay']
X = train_data.drop('Stay', axis=1)

In [30]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [55]:
#Attempt to one-hot-encode y
# onehot_y = to_categorical(y)
# onehot_y

In [36]:
#Now that the data is cleaned, label encoded, and scaled we use train/test/split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Decision Tree

In [37]:
#Set up random forest classifier
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=500)

In [38]:
#Fit the random forest classifier to the train data
rfc = rfc.fit(X_train, y_train)

In [39]:
rfc.score(X_test, y_test)

0.3846564501946992

In [None]:
#Set up grid search parameters
# param_grid = {
#     'max_depth': [i for i in range(5, 10)],
#     'n_estimators': [i for i in range(200, 210) if i % 2 == 0]
# }

In [None]:
#Applying grid search
# from sklearn.model_selection import GridSearchCV

# grid = GridSearchCV(rfc, param_grid, verbose=3)
# grid.fit(X_train, y_train)

# Neural Network

In [87]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()

#Inputs and hidden layers
model.add(Dense(units=25, activation='relu', input_dim=15))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=50, activation='relu'))
          
          
#Outputs
model.add((Dense(units=11, activation='softmax')))



In [88]:
model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 25)                400       
_________________________________________________________________
dense_13 (Dense)             (None, 50)                1300      
_________________________________________________________________
dense_14 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_15 (Dense)             (None, 11)                561       
Total params: 4,811
Trainable params: 4,811
Non-trainable params: 0
_________________________________________________________________


In [89]:
#Compile the Model
model.compile(optimizer='adam',
             loss='sparse_categorical_crossentropy',
             metrics=['accuracy'])

In [90]:
#Fit model
model.fit(
    X_train,
    y_train,
    epochs=100,
    shuffle=True,
    verbose=2,
    validation_data = (X_test, y_test)
)

Train on 254750 samples, validate on 63688 samples
Epoch 1/100


KeyboardInterrupt: 

In [86]:
model_loss, model_accuracy = model.evaluate(
    X_test, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

63688/63688 - 3s - loss: 1.6146 - accuracy: 0.3776
Normal Neural Network - Loss: 1.6146164258780333, Accuracy: 0.37759074568748474


# Preparing New Data To Test the Models

In [40]:
#Setting up the test data
new_data = pd.read_csv('data/test_data.csv')
new_data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,318439,21,c,3,Z,3,gynecology,S,A,2.0,17006,2.0,Emergency,Moderate,2,71-80,3095.0
1,318440,29,a,4,X,2,gynecology,S,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4018.0
2,318441,26,b,2,Y,3,gynecology,Q,D,4.0,17006,2.0,Emergency,Moderate,3,71-80,4492.0
3,318442,6,a,6,X,3,gynecology,Q,F,2.0,17006,2.0,Trauma,Moderate,3,71-80,4173.0
4,318443,28,b,11,X,2,gynecology,R,F,2.0,17006,2.0,Trauma,Moderate,4,71-80,4161.0


In [41]:
#Column Types
new_data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
dtype: object

In [42]:
#Dropping of irrelevant columns
new_data.drop(['case_id', 'patientid'], axis=1, inplace=True)

In [43]:
#See which columns have null values
new_data.isna().sum()

Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                              35
City_Code_Patient                    2157
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
dtype: int64

In [44]:
#Fill null columns with 0 value
new_data['Bed Grade'] = new_data['Bed Grade'].fillna(0)
new_data['City_Code_Patient'] = new_data['City_Code_Patient'].fillna(0)

In [45]:
#Confirm
new_data.isna().sum()

Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
dtype: int64

In [46]:
#Divide columns into categorical and numerical
categorical_columns_new = []
numerical_columns_new = []

for column in new_data.columns:
    if new_data[column].dtypes == 'object':
        categorical_columns_new.append(column)
        
for column in new_data.columns:
    if new_data[column].dtypes != 'object':
        numerical_columns_new.append(column)
        
print(categorical_columns_new)
print(numerical_columns_new)


['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age']
['Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'City_Code_Patient', 'Visitors with Patient', 'Admission_Deposit']


In [47]:
#Convert some of the numeric ones to categorical
categorical_columns_new.append('Bed Grade')
categorical_columns_new.append('Hospital_code')
categorical_columns_new.append('City_Code_Hospital')
categorical_columns_new.append('City_Code_Patient')

In [48]:
#Remove the above columns from the numeric list
numerical_columns_new.remove('Bed Grade')
numerical_columns_new.remove('Hospital_code')
numerical_columns_new.remove('City_Code_Hospital')
numerical_columns_new.remove("City_Code_Patient")
numerical_columns_new

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [52]:
#Label Encode categorical columns in test_data

new_x_label_encoder = {}



for column in categorical_columns_new:
    new_x_label_encoder[column] = LabelEncoder()
    new_data[column] =new_x_label_encoder[column].fit_transform(new_data[column])

    


In [53]:
new_data[categorical_columns_new]

Unnamed: 0,Hospital_type_code,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Severity of Illness,Age,Bed Grade,Hospital_code,City_Code_Hospital,City_Code_Patient
0,2,2,2,3,0,0,2,7,2,20,2,2
1,0,0,2,3,5,1,2,7,2,28,3,2
2,1,1,2,1,3,0,2,7,4,25,1,2
3,0,0,2,1,5,1,2,7,2,5,5,2
4,1,0,2,2,5,1,2,7,2,27,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...
137052,1,1,1,1,3,0,1,4,3,10,1,3
137053,4,0,3,2,4,0,2,0,4,24,0,7
137054,2,2,1,2,0,2,1,0,4,29,2,12
137055,0,0,1,2,4,1,1,4,4,4,0,10


In [54]:
#Scale numeric columns in test data
new_data[numerical_columns_new] = scaler.fit_transform(new_data[numerical_columns_new].values)
new_data.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit
0,20,2,2,2,-0.165503,2,3,0,2,2,0,2,-0.722758,7,-1.64211
1,28,0,3,0,-1.024428,2,3,5,2,2,1,2,0.402568,7,-0.788083
2,25,1,1,1,-0.165503,2,1,3,4,2,0,2,-0.160095,7,-0.349504
3,5,0,5,0,-0.165503,2,1,5,2,2,1,2,-0.160095,7,-0.644666
4,27,1,9,0,-1.024428,2,2,5,2,2,1,2,0.402568,7,-0.655769
