In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import statistics

In [None]:
from google.colab import files

In [None]:
uploaded = files.upload()

Saving train_data.csv to train_data.csv


In [None]:
import io

## Loading Data

In [None]:
#Load train data
# The uploaded file's contents are wrapped in an in-memory byte buffer so that
# pandas can parse them the same way it would parse a file on disk.
train_data = pd.read_csv(io.BytesIO(uploaded['train_data.csv']))
# Preview the first rows to confirm the columns were parsed as expected.
train_data.head()

Unnamed: 0,case_id,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,patientid,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,1,8,c,3,Z,3,radiotherapy,R,F,2.0,31397,7.0,Emergency,Extreme,2,51-60,4911.0,0-10
1,2,2,c,5,Z,2,radiotherapy,S,F,2.0,31397,7.0,Trauma,Extreme,2,51-60,5954.0,41-50
2,3,10,e,1,X,2,anesthesia,S,E,2.0,31397,7.0,Trauma,Extreme,2,51-60,4745.0,31-40
3,4,26,b,2,Y,2,radiotherapy,R,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,7272.0,41-50
4,5,26,b,2,Y,2,radiotherapy,S,D,2.0,31397,7.0,Trauma,Extreme,2,51-60,5558.0,41-50


## Basic Data Cleaning

In [None]:
#Data Types of Different Columns
train_data.dtypes

case_id                                int64
Hospital_code                          int64
Hospital_type_code                    object
City_Code_Hospital                     int64
Hospital_region_code                  object
Available Extra Rooms in Hospital      int64
Department                            object
Ward_Type                             object
Ward_Facility_Code                    object
Bed Grade                            float64
patientid                              int64
City_Code_Patient                    float64
Type of Admission                     object
Severity of Illness                   object
Visitors with Patient                  int64
Age                                   object
Admission_Deposit                    float64
Stay                                  object
dtype: object

In [None]:
#Searching for null values
train_data.isna().sum()

case_id                                 0
Hospital_code                           0
Hospital_type_code                      0
City_Code_Hospital                      0
Hospital_region_code                    0
Available Extra Rooms in Hospital       0
Department                              0
Ward_Type                               0
Ward_Facility_Code                      0
Bed Grade                             113
patientid                               0
City_Code_Patient                    4532
Type of Admission                       0
Severity of Illness                     0
Visitors with Patient                   0
Age                                     0
Admission_Deposit                       0
Stay                                    0
dtype: int64

In [None]:
#Fill null values with 0's
# Both columns are later treated as categorical codes, so 0 acts as a
# dedicated "missing" category for them.
for null_column in ('Bed Grade', 'City_Code_Patient'):
    train_data[null_column] = train_data[null_column].fillna(0)

In [None]:
#Confirmation that there are no more null values
train_data.isna().sum()

case_id                              0
Hospital_code                        0
Hospital_type_code                   0
City_Code_Hospital                   0
Hospital_region_code                 0
Available Extra Rooms in Hospital    0
Department                           0
Ward_Type                            0
Ward_Facility_Code                   0
Bed Grade                            0
patientid                            0
City_Code_Patient                    0
Type of Admission                    0
Severity of Illness                  0
Visitors with Patient                0
Age                                  0
Admission_Deposit                    0
Stay                                 0
dtype: int64

In [None]:
#Dropping irrelevant columns
# Row and patient identifiers are removed before any feature processing.
train_data = train_data.drop(columns=['case_id', 'patientid'])

In [None]:
#Exploring the Stay Column
train_data['Stay'].value_counts()

21-30                 87491
11-20                 78139
31-40                 55159
51-60                 35018
0-10                  23604
41-50                 11743
71-80                 10254
More than 100 Days     6683
81-90                  4838
91-100                 2765
61-70                  2744
Name: Stay, dtype: int64

In [None]:
#Replace the more than 100 days category to something more uniform
# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the selected column: the in-place form operates
# on an intermediate Series and is not guaranteed to update train_data (it
# leaves the frame unchanged under pandas copy-on-write).
train_data['Stay'] = train_data['Stay'].replace('More than 100 Days', '100+')

In [None]:
train_data['Stay'].value_counts()

21-30     87491
11-20     78139
31-40     55159
51-60     35018
0-10      23604
41-50     11743
71-80     10254
100+       6683
81-90      4838
91-100     2765
61-70      2744
Name: Stay, dtype: int64

## Feature Engineering

In [None]:
#Divide columns into categorical and numerical
# Columns stored with the object dtype are treated as categorical; every other
# column starts out in the numerical list.
x_categorical_columns = [
    name for name in train_data.columns
    if train_data[name].dtypes == 'object'
]
x_numerical_columns = [
    name for name in train_data.columns
    if train_data[name].dtypes != 'object'
]

# Holder for the target column name; it starts empty here.
y_column = []

print(x_categorical_columns)
print(x_numerical_columns)

['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Stay']
['Hospital_code', 'City_Code_Hospital', 'Available Extra Rooms in Hospital', 'Bed Grade', 'City_Code_Patient', 'Visitors with Patient', 'Admission_Deposit']


In [None]:
#Some columns in the numerical columns list are in reality more like categorical variables
# These hold numeric codes (identifiers / grades), not measured quantities.
x_categorical_columns.extend(
    ['Bed Grade', 'Hospital_code', 'City_Code_Hospital', 'City_Code_Patient']
)

In [None]:
#remove "Stay" from x categorical because it is a y value
x_categorical_columns.remove('Stay')
x_categorical_columns

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Bed Grade',
 'Hospital_code',
 'City_Code_Hospital',
 'City_Code_Patient']

In [None]:
#Remove the above columns from the numerical columns list
for reclassified in ('Bed Grade', 'Hospital_code', 'City_Code_Hospital', 'City_Code_Patient'):
    x_numerical_columns.remove(reclassified)

x_numerical_columns

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [None]:
#set y_column
y_column.append('Stay')
y_column

['Stay']

In [None]:
x_categorical_columns

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Bed Grade',
 'Hospital_code',
 'City_Code_Hospital',
 'City_Code_Patient']

In [None]:
raw_data = train_data.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical feature, keyed by column name, so that each
# fitted mapping stays available after the columns have been overwritten.
x_label_encoder = {column: LabelEncoder() for column in x_categorical_columns}
y_label_encoder = LabelEncoder()

# Replace every categorical feature with its integer codes.
for column, encoder in x_label_encoder.items():
    train_data[column] = encoder.fit_transform(train_data[column])

# Encode the target column(s) with the dedicated target encoder.
for column in y_column:
    train_data[column] = y_label_encoder.fit_transform(train_data[column])
    


In [None]:
x_label_encoder.keys()

dict_keys(['Hospital_type_code', 'Hospital_region_code', 'Department', 'Ward_Type', 'Ward_Facility_Code', 'Type of Admission', 'Severity of Illness', 'Age', 'Bed Grade', 'Hospital_code', 'City_Code_Hospital', 'City_Code_Patient'])

In [None]:
train_data[x_categorical_columns]

Unnamed: 0,Hospital_type_code,Hospital_region_code,Department,Ward_Type,Ward_Facility_Code,Type of Admission,Severity of Illness,Age,Bed Grade,Hospital_code,City_Code_Hospital,City_Code_Patient
0,2,2,3,2,5,0,0,5,2,7,2,7
1,2,2,3,3,5,1,0,5,2,1,4,7
2,4,0,1,3,4,1,0,5,2,9,0,7
3,1,1,3,2,3,1,0,5,2,25,1,7
4,1,1,3,3,3,1,0,5,2,25,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...
318433,0,0,3,1,5,0,2,4,4,5,5,22
318434,0,0,1,1,4,2,2,8,4,23,0,8
318435,0,0,2,2,5,0,1,7,4,6,3,10
318436,1,1,1,1,3,1,1,1,3,10,1,8


In [None]:
#Scaling of numerical columns
from sklearn.preprocessing import StandardScaler

# Standardise the numerical feature columns; the fitted scaler is kept so its
# statistics can be reused later in the notebook.
scaler = StandardScaler()

# NOTE(review): the scaler is fit on the whole dataset here, before the
# train/test split performed further down, so the held-out rows contribute to
# the scaling statistics. Consider fitting on the training split only.
train_data[x_numerical_columns] = scaler.fit_transform(train_data[x_numerical_columns].values)

In [None]:
train_data.head()

Unnamed: 0,Hospital_code,Hospital_type_code,City_Code_Hospital,Hospital_region_code,Available Extra Rooms in Hospital,Department,Ward_Type,Ward_Facility_Code,Bed Grade,City_Code_Patient,Type of Admission,Severity of Illness,Visitors with Patient,Age,Admission_Deposit,Stay
0,7,2,2,2,-0.169177,3,2,5,2,7,0,0,-0.727923,5,0.027835,0
1,1,2,4,2,-1.025217,3,3,5,2,7,1,0,-0.727923,5,0.987556,5
2,9,4,0,0,-1.025217,1,3,4,2,7,1,0,-0.727923,5,-0.12491,4
3,25,1,1,1,-1.025217,3,2,3,2,7,1,0,-0.727923,5,2.200319,5
4,25,1,1,1,-1.025217,3,3,3,2,7,1,0,-0.727923,5,0.623175,5


In [None]:
#Declaring x and y variables
# X keeps every column except the encoded target; y is the encoded target.
X = train_data.drop(columns=['Stay'])
y = train_data['Stay']

In [None]:
order = list(X.columns)

In [None]:
#Now that the data is cleaned, label encoded, and scaled we use train/test/split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Neural Network

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import ModelCheckpoint
import os


def create_model(input_dim=15, num_classes=11,
                 hidden_units=(50, 100, 50, 15, 50, 100, 50)):
  """Build and compile the fully connected softmax classifier.

  The defaults reproduce the original architecture exactly (15 inputs, hidden
  layers of 50-100-50-15-50-100-50 ReLU units, 11 output classes), so existing
  calls to ``create_model()`` and weights saved from that architecture keep
  working.

  Args:
    input_dim: Number of input features per sample.
    num_classes: Number of output classes (width of the softmax layer).
    hidden_units: Sizes of the ReLU hidden layers, in order.

  Returns:
    A compiled ``Sequential`` model using SGD (learning rate 0.01, no
    momentum), sparse categorical cross-entropy loss and the accuracy metric.
  """
  model = Sequential()

  #Inputs and hidden layers, followed by the softmax output layer
  layer_specs = [(units, 'relu') for units in hidden_units]
  layer_specs.append((num_classes, 'softmax'))

  for position, (units, activation) in enumerate(layer_specs):
    # Only the very first layer declares the input width.
    extra_args = {'input_dim': input_dim} if position == 0 else {}
    model.add(Dense(units=units, activation=activation, **extra_args))

  optimizer = SGD(learning_rate=0.01, momentum=0)

  model.compile(optimizer=optimizer,
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

  return model

In [None]:
model = create_model()

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 50)                800       
_________________________________________________________________
dense_1 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_2 (Dense)              (None, 50)                5050      
_________________________________________________________________
dense_3 (Dense)              (None, 15)                765       
_________________________________________________________________
dense_4 (Dense)              (None, 50)                800       
_________________________________________________________________
dense_5 (Dense)              (None, 100)               5100      
_________________________________________________________________
dense_6 (Dense)              (None, 50)                5

In [None]:
#Fit model

# File name template for the per-epoch weight checkpoints; the {epoch}
# placeholder is filled in with the epoch number when a checkpoint is written.
checkpoint_path = 'training_1/cp-{epoch:04d}.ckpt'

# Directory part of the template ('training_1').
checkpoint_dir = os.path.dirname(checkpoint_path)

batch_size = 32

# Callback that writes the model weights only (not the full model) to
# checkpoint_path during training.
cp_callback = ModelCheckpoint(
    filepath = checkpoint_path,
    verbose = 0, 
    save_weights_only = True   
)

# Start from a freshly built model rather than reusing an earlier instance.
model = create_model()

# Store the untrained weights as checkpoint number 0.
model.save_weights(checkpoint_path.format(epoch=0))

# NOTE(review): X_test / y_test serve as validation data here, and the same
# split is used again for the final evaluation later in the notebook.
model.fit(
    X_train,
    y_train,
    epochs=50,
    shuffle=True,
    verbose=2,
    validation_data = (X_test, y_test),
    batch_size = batch_size, 
    callbacks = [cp_callback]
)

Epoch 1/50
7961/7961 - 13s - loss: 1.7111 - accuracy: 0.3399 - val_loss: 1.6228 - val_accuracy: 0.3785
Epoch 2/50
7961/7961 - 11s - loss: 1.6163 - accuracy: 0.3798 - val_loss: 1.5956 - val_accuracy: 0.3884
Epoch 3/50
7961/7961 - 11s - loss: 1.5983 - accuracy: 0.3871 - val_loss: 1.5855 - val_accuracy: 0.3934
Epoch 4/50
7961/7961 - 11s - loss: 1.5852 - accuracy: 0.3924 - val_loss: 1.5756 - val_accuracy: 0.3977
Epoch 5/50
7961/7961 - 10s - loss: 1.5766 - accuracy: 0.3969 - val_loss: 1.5749 - val_accuracy: 0.3943
Epoch 6/50
7961/7961 - 10s - loss: 1.5690 - accuracy: 0.4002 - val_loss: 1.5625 - val_accuracy: 0.4024
Epoch 7/50
7961/7961 - 11s - loss: 1.5631 - accuracy: 0.4018 - val_loss: 1.5913 - val_accuracy: 0.3897
Epoch 8/50
7961/7961 - 11s - loss: 1.5582 - accuracy: 0.4039 - val_loss: 1.5491 - val_accuracy: 0.4081
Epoch 9/50
7961/7961 - 11s - loss: 1.5540 - accuracy: 0.4053 - val_loss: 1.5528 - val_accuracy: 0.4073
Epoch 10/50
7961/7961 - 11s - loss: 1.5496 - accuracy: 0.4063 - val_loss:

<tensorflow.python.keras.callbacks.History at 0x7ff81e182e90>

In [None]:
import os 
[a for a in os.listdir(checkpoint_dir) if '47' in a]

['cp-0047.ckpt.data-00000-of-00001', 'cp-0047.ckpt.index']

In [None]:
print(checkpoint_dir)

training_1


In [None]:
# Rebuild the architecture and restore the weights from the checkpoint
# numbered 0047.
# NOTE(review): why this particular checkpoint was chosen is not shown in the
# notebook — presumably it had the best validation score; confirm.
model_1 = create_model()
model_1.load_weights('training_1/cp-0047.ckpt')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff822218090>

In [None]:
# Score the restored model on the held-out split. The result is unpacked as
# the loss followed by the single metric the model was compiled with (accuracy).
model_loss, model_accuracy = model_1.evaluate(
    X_test, y_test, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

1991/1991 - 2s - loss: 1.5243 - accuracy: 0.4172
Normal Neural Network - Loss: 1.52434241771698, Accuracy: 0.4172057509422302


In [None]:
# Map every encoded class index back to its original "Stay" label.
# The mapping is read from the fitted encoder's classes_ (index i <-> classes_[i])
# instead of assuming exactly 11 classes, so it stays correct if the label set
# changes. .tolist() yields plain Python strings, which keeps it JSON-friendly.
y_translator = dict(enumerate(y_label_encoder.classes_.tolist()))
y_translator

{0: '0-10',
 1: '100+',
 2: '11-20',
 3: '21-30',
 4: '31-40',
 5: '41-50',
 6: '51-60',
 7: '61-70',
 8: '71-80',
 9: '81-90',
 10: '91-100'}

In [None]:
# Export the scaling statistics of each numerical column so the same
# standardisation, (value - mean) / standard_deviation, can be reproduced
# outside the notebook.
# scaler.scale_ is the per-column standard deviation that StandardScaler
# actually divides by; scaler.var_ is the variance (its square) and must not be
# stored under the 'standard_deviation' key.
scale_translator = {
    column: {'standard_deviation': float(std), 'mean': float(mean)}
    for (column, std, mean) in zip(x_numerical_columns, scaler.scale_, scaler.mean_)
}
scale_translator

{'Admission_Deposit': {'mean': 4880.749392346391,
  'standard_deviation': 1181078.9164916168},
 'Available Extra Rooms in Hospital': {'mean': 3.1976271676119055,
  'standard_deviation': 1.3646201994983251},
 'Visitors with Patient': {'mean': 3.2840992595104854,
  'standard_deviation': 3.111902813705039}}

In [None]:
# For every categorical feature, build a plain dict from each original class
# value to the integer code its fitted encoder assigns to it. Lists are
# converted with .tolist() so keys and values are native Python objects.
X_translator = {}
for feature_name, encoder in x_label_encoder.items():
  class_values = encoder.classes_.tolist()
  class_codes = encoder.transform(encoder.classes_).tolist()
  X_translator[feature_name] = dict(zip(class_values, class_codes))

X_translator


{'Age': {'0-10': 0,
  '11-20': 1,
  '21-30': 2,
  '31-40': 3,
  '41-50': 4,
  '51-60': 5,
  '61-70': 6,
  '71-80': 7,
  '81-90': 8,
  '91-100': 9},
 'Bed Grade': {0.0: 0, 1.0: 1, 2.0: 2, 3.0: 3, 4.0: 4},
 'City_Code_Hospital': {1: 0,
  2: 1,
  3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  9: 7,
  10: 8,
  11: 9,
  13: 10},
 'City_Code_Patient': {0.0: 0,
  1.0: 1,
  2.0: 2,
  3.0: 3,
  4.0: 4,
  5.0: 5,
  6.0: 6,
  7.0: 7,
  8.0: 8,
  9.0: 9,
  10.0: 10,
  11.0: 11,
  12.0: 12,
  13.0: 13,
  14.0: 14,
  15.0: 15,
  16.0: 16,
  18.0: 17,
  19.0: 18,
  20.0: 19,
  21.0: 20,
  22.0: 21,
  23.0: 22,
  24.0: 23,
  25.0: 24,
  26.0: 25,
  27.0: 26,
  28.0: 27,
  29.0: 28,
  30.0: 29,
  31.0: 30,
  32.0: 31,
  33.0: 32,
  34.0: 33,
  35.0: 34,
  36.0: 35,
  37.0: 36,
  38.0: 37},
 'Department': {'TB & Chest disease': 0,
  'anesthesia': 1,
  'gynecology': 2,
  'radiotherapy': 3,
  'surgery': 4},
 'Hospital_code': {1: 0,
  2: 1,
  3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10

In [None]:
for a in X_translator['City_Code_Patient'].keys():
  print(type(a))
  break
c = 'City_Code_Patient'
[d.item() for d in list(x_label_encoder[c].transform(x_label_encoder[c].classes_))]

<class 'float'>


[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37]

In [None]:
translators = {
    'X_translator': X_translator,
    'y_translator': y_translator,
    'scale_translator': scale_translator,
    'data_order': order
}

In [None]:
import json
# Persist the lookup tables so they can be reloaded without the fitted encoders.
# NOTE: JSON object keys are always strings, so numeric keys in the translators
# (e.g. 2.0 or 8) are written as '2.0' / '8'; any lookup performed on the
# reloaded data has to use string keys.
with open('translators.json', 'w') as f:
  json.dump(translators, f)

In [None]:
translators = None
with open('translators.json', 'r') as f:
  translators=json.load(f)
print(translators)

{'X_translator': {'Hospital_type_code': {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6}, 'Hospital_region_code': {'X': 0, 'Y': 1, 'Z': 2}, 'Department': {'TB & Chest disease': 0, 'anesthesia': 1, 'gynecology': 2, 'radiotherapy': 3, 'surgery': 4}, 'Ward_Type': {'P': 0, 'Q': 1, 'R': 2, 'S': 3, 'T': 4, 'U': 5}, 'Ward_Facility_Code': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}, 'Type of Admission': {'Emergency': 0, 'Trauma': 1, 'Urgent': 2}, 'Severity of Illness': {'Extreme': 0, 'Minor': 1, 'Moderate': 2}, 'Age': {'0-10': 0, '11-20': 1, '21-30': 2, '31-40': 3, '41-50': 4, '51-60': 5, '61-70': 6, '71-80': 7, '81-90': 8, '91-100': 9}, 'Bed Grade': {'0.0': 0, '1.0': 1, '2.0': 2, '3.0': 3, '4.0': 4}, 'Hospital_code': {'1': 0, '2': 1, '3': 2, '4': 3, '5': 4, '6': 5, '7': 6, '8': 7, '9': 8, '10': 9, '11': 10, '12': 11, '13': 12, '14': 13, '15': 14, '16': 15, '17': 16, '18': 17, '19': 18, '20': 19, '21': 20, '22': 21, '23': 22, '24': 23, '25': 24, '26': 25, '27': 26, '28': 27, '29':

In [None]:
x_categorical_columns

['Hospital_type_code',
 'Hospital_region_code',
 'Department',
 'Ward_Type',
 'Ward_Facility_Code',
 'Type of Admission',
 'Severity of Illness',
 'Age',
 'Bed Grade',
 'Hospital_code',
 'City_Code_Hospital',
 'City_Code_Patient']

In [None]:
x_numerical_columns

['Available Extra Rooms in Hospital',
 'Visitors with Patient',
 'Admission_Deposit']

In [None]:
# Build a sample request from the first row of the raw (un-encoded, un-scaled)
# copy of the data, then remove the target so only feature values remain.
# NOTE(review): the name `input` shadows Python's built-in input(); it is left
# unchanged because a later cell passes `input` to predict().
input = raw_data.iloc[0].to_dict()
input.pop('Stay')
input

{'Admission_Deposit': 4911.0,
 'Age': '51-60',
 'Available Extra Rooms in Hospital': 3,
 'Bed Grade': 2.0,
 'City_Code_Hospital': 3,
 'City_Code_Patient': 7.0,
 'Department': 'radiotherapy',
 'Hospital_code': 8,
 'Hospital_region_code': 'Z',
 'Hospital_type_code': 'c',
 'Severity of Illness': 'Extreme',
 'Type of Admission': 'Emergency',
 'Visitors with Patient': 2,
 'Ward_Facility_Code': 'F',
 'Ward_Type': 'R'}

In [None]:
[i for i in raw_data.columns if i not in X_train.columns]

['Stay']

In [None]:
def predict(input):
  """Predict the "Stay" label for one raw (un-encoded, un-scaled) record.

  Everything needed to pre-process the record is read from 'translators.json',
  so the function does not depend on the notebook's column-list variables.

  Args:
    input: dict mapping feature name -> raw value. Categorical values are
      looked up by their string form; numerical values must support arithmetic.

  Returns:
    The predicted "Stay" label as stored in the saved y translator.

  Raises:
    KeyError: if a categorical value has no entry in the saved translator, or
      if a feature listed in the saved column order is missing from `input`.
  """
  with open('translators.json', 'r') as f:
    translators = json.load(f)

  X_translator = translators['X_translator']
  scale_translator = translators['scale_translator']
  order = translators['data_order']
  y_translator = translators['y_translator']

  input_t = {}
  for (category, value) in input.items():
    if category in X_translator:
      # JSON stores every key as a string, hence the str() lookup.
      input_t[category] = X_translator[category][str(value)]
    elif category in scale_translator:
      # NOTE(review): this reproduces the training-time scaling only if the
      # value saved under 'standard_deviation' really is the scaler's standard
      # deviation (StandardScaler.scale_), not its variance.
      stats = scale_translator[category]
      input_t[category] = (value - stats['mean']) / stats['standard_deviation']
    else:
      # Best effort: unknown keys are reported and otherwise ignored.
      print(f'ERROR: Unsupported parameter found! {category}')

  # Single-row matrix with the columns in the order used for training; the
  # width follows the saved column order instead of a hard-coded 15.
  input_t = np.array([input_t[i] for i in order]).reshape(1, -1)

  model_1 = create_model()
  model_1.load_weights('training_1/cp-0047.ckpt')

  # Index of the most probable class, translated back to its label
  # (the reloaded translator has string keys).
  prediction = model_1.predict(input_t).argmax()
  prediction = y_translator[str(prediction)]
  return prediction
  
predict(input)



'21-30'

In [None]:
type(list(X_translator['Hospital_code'].keys())[0])

str