In [1]:
import pandas as pd
import numpy as np
raw = pd.read_csv('TX_2021_monthly.csv')
#raw.head()

In [2]:
# number of data points
print('number of data points:', raw.shape[0])
# number of counties
print('number of counties:', raw['county'].nunique())

number of data points: 3048
number of counties: 254


In [3]:
# replace 'Suppressed' with 5
raw['annual_all_deaths'] = raw['annual_all_deaths'].replace('Suppressed', 5)
raw['annual_hospital_deaths'] = raw['annual_hospital_deaths'].replace('Suppressed', 5)
raw['monthly_hospital_deaths'] = raw['monthly_hospital_deaths'].replace('Suppressed', 5)
raw['monthly_covid_hospital_deaths'] = raw['monthly_covid_hospital_deaths'].replace('Suppressed', 5)
#raw.head()

In [4]:
# convert death number from string to integer
#print('data type of each feature before converting:\n',raw.dtypes)
raw[['annual_all_deaths', 'annual_hospital_deaths', 'monthly_hospital_deaths', 'monthly_covid_hospital_deaths']] = raw[['annual_all_deaths', 'annual_hospital_deaths', 'monthly_hospital_deaths', 'monthly_covid_hospital_deaths']].astype(int)
#print('data type of each feature after converting:\n',raw.dtypes)

In [5]:
# calculate the number of monthly hosptial death without covid
raw['monthly_noncovid_hospital_deaths'] = raw['monthly_hospital_deaths'] - raw['monthly_covid_hospital_deaths']
#raw.head()

In [6]:
# use onehot encoding to encode the feature 'county'
from sklearn.preprocessing import OneHotEncoder
onehot = OneHotEncoder(sparse_output = False)
county_encoded = onehot.fit_transform(raw[['county']])
print(county_encoded.shape)

(3048, 254)


In [7]:
# given the name of a county, find its onehot vector representation
def get_county_encoding(county_name):
    county_list = raw['county'].tolist()
    if county_name in county_list:
        county_index = county_list.index(county_name)
        county_encoding = county_encoded[county_index]
        return county_encoding
    else:
        print('County not found.')
        return None

In [8]:
# scale the monthly_noncovid_hospital_deaths before feeding it into LSTM
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
raw['monthly_noncovid_hospital_deaths_scaled'] = scaler.fit_transform(raw[['monthly_noncovid_hospital_deaths']])
#raw.head()

In [9]:
# split the 'raw' data into training set (Jan to Nov) and testing set (Dec)
raw_train = raw.loc[raw['month'] != 'Dec',:]
raw_test = raw.loc[raw['month'].isin(['Sep', 'Oct', 'Nov', 'Dec']),:]

In [10]:
# convert the 'monthly_noncovid_hospital_deaths_scaled' into [rows, columns] structure
death_train = np.array(raw_train['monthly_noncovid_hospital_deaths_scaled'])
death_train = death_train.reshape(len(death_train),1)
print('death_train.shape', death_train.shape)
death_test = np.array(raw_test['monthly_noncovid_hospital_deaths_scaled'])
death_test = death_test.reshape(len(death_test),1)
print('death_test.shape', death_test.shape)

death_train.shape (2794, 1)
death_test.shape (1016, 1)


In [11]:
# create the county feature for training set and testing set
county_train = raw_train['county']
county_train_encoded = county_train.map(get_county_encoding)
county_train_encoded = np.vstack(county_train_encoded)
print('county_train_encoded.shape:', county_train_encoded.shape)
county_test = raw_test['county']
county_test_encoded = county_test.map(get_county_encoding)
county_test_encoded = np.vstack(county_test_encoded)
print('county_test_encoded.shape:', county_test_encoded.shape)

county_train_encoded.shape: (2794, 254)
county_test_encoded.shape: (1016, 254)


In [12]:
# horizontally stack the county feature and the death feature
data_train = np.hstack((county_train_encoded, death_train))
print('data_train.shape', data_train.shape)
data_test = np.hstack((county_test_encoded, death_test))
print('data_test.shape', data_test.shape)

data_train.shape (2794, 255)
data_test.shape (1016, 255)


In [13]:
# split a multivariate sequence into samples
def split_sequence(sequence, n_timestep): # n_timestep is the window size
    X, y = list(), list()
    for i in range(len(sequence)):
        end_ix = i+ n_timestep # find the end of the pattern
        if end_ix > len(sequence)-1: # check if we are beyond the dataset
            break
        seq_x = sequence[i:end_ix, :]
        seq_y = sequence[end_ix, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

In [14]:
# given a two dimensional array of each county and death number, calcaute a sequence for each county and concatenate them together
def get_input_sequence(array, n_timestep):
    X_list, y_list = list(), list() #empty list to hold the sequence of each county
    for i in range(raw['county'].nunique()):
        n_obs_each_county = int(len(array)/raw['county'].nunique()) # number of observations for each county
        begin_index = n_obs_each_county*i
        end_index = begin_index + n_obs_each_county
        subset_array = array[begin_index:end_index,] # split the array into each county to create sequence for each county
        X_county, y_county = split_sequence(subset_array, n_timestep)
        X_list.append(X_county)
        y_list.append(y_county)
    # vertically stack X_county
    X = np.vstack(X_list)
    # concatenate y_county
    y = np.concatenate(y_list)
    return X, y
        

In [27]:
#If X_train has shape (num_samples, n_timestep, n_features), 
#then X_test should also have shape (num_test_samples, n_timestep, n_features)**
X_train, y_train = get_input_sequence(array = data_train, n_timestep = 3)
print('X_train.shape:',X_train.shape)
print('y_train.shape:',y_train.shape)
X_test, y_test = get_input_sequence(array = data_test, n_timestep = 3)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)

X_train.shape: (2032, 3, 255)
y_train.shape: (2032,)
X_test.shape: (254, 3, 255)
y_test.shape: (254,)


In [22]:
n_timestep = X_train.shape[1]
n_features = X_train.shape[2]

In [23]:
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
import tensorflow as tf
import random

In [24]:
# fix the seed so that we can replicate the results
random.seed(9999)
np.random.seed(9999)
tf.random.set_seed(9999)

In [25]:
# define a model with one LSTM layer and one dense layer
model_lstm = Sequential()
model_lstm.add(LSTM(50, activation = 'relu', input_shape = (n_timestep, n_features)))
model_lstm.add(Dense(1,))
model_lstm.compile(optimizer='adam', loss='mse')

In [26]:
# fit the model
model_lstm.fit(X_train, y_train, epochs = 200, verbose = False)
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 50)                61200     
                                                                 
 dense (Dense)               (None, 1)                 51        
                                                                 
Total params: 61251 (239.26 KB)
Trainable params: 61251 (239.26 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# calcuate the MSE for training set and testing set
mse_train = model_lstm.evaluate(X_train, y_train)
print('mse_train:', mse_train)
mse_test = model_lstm.evaluate(X_test, y_test)
print('mse_test:', mse_test)

mse_train: 3.605739402701147e-05
mse_test: 4.249086123309098e-05


In [52]:
# predict the death rate and convert it back to the orignal scaley
y_predicted = model_lstm.predict(X_test)
y_predicted = scaler.inverse_transform(y_predicted)
y_predicted = y_predicted.reshape(y_predicted.shape[0])
print('y_predicted.shape', y_predicted.shape)

y_predicted.shape (254,)


In [53]:
# extract the true value of the death number
y_true = raw.loc[raw['month'] == 'Dec',:]['monthly_noncovid_hospital_deaths'].values
print('y_ture.shape', y_true.shape)

y_ture.shape (254,)


In [56]:
# calculate the MSE at orignal scale
from sklearn.metrics import mean_squared_error
mse_test_original_scale = mean_squared_error(y_true, y_predicted)
print('MSE of the testing data at orignal scale =', mse_test_original_scale)

MSE of the testing data at orignal scale = 97.14016770202026
