# **Toronto Covid-19 Cases**

#### 1. Load Data from PostgreSQL

In [1]:
# Import dependencies
import pandas as pd
from sqlalchemy import create_engine
import sys
sys.path.append('../')
from config import db_password

In [3]:
# Create Connection Between PostgreSQL DB
db_string = f"postgres://postgres:{db_password}@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
db = create_engine(db_string)

In [4]:
# PostgreSQL Query

q = '''
SELECT episode_date, tc.neighbourhood_name, age_group, gender, outcome, ever_hospitalized, ever_in_icu, ever_intubated, population_density, average_income, commute_public_transit, avg_temperature, avg_relative_humidity
FROM "Toronto_Cases" tc
INNER JOIN "Toronto_Stats" ts ON tc.neighbourhood_name = ts.neighbourhood_name
LEFT JOIN (SELECT neighbourhood_name, (commute_car_driver::NUMERIC + commute_car_passenger::NUMERIC) / commute_total::NUMERIC AS "commute_car",
commute_public_transit::NUMERIC / commute_total::NUMERIC AS "commute_public_transit", commute_walk::NUMERIC / commute_total::NUMERIC AS "commute_walk",
commute_bicycle::NUMERIC / commute_total::NUMERIC AS "commute_bicycle", commute_other::NUMERIC / commute_total::NUMERIC AS "commute_other"
FROM "Toronto_Commute"
) commute ON tc.neighbourhood_name = commute.neighbourhood_name
LEFT JOIN "Toronto_Weather" tw ON tc.episode_date = tw.date
'''

In [6]:
# Execute SQL Query and Load Data into DataFrame
toronto_df = pd.read_sql(sql=q, con=db)

In [7]:
toronto_df

Unnamed: 0,episode_date,neighbourhood_name,age_group,gender,outcome,ever_hospitalized,ever_in_icu,ever_intubated,population_density,average_income,commute_public_transit,avg_temperature,avg_relative_humidity
0,2020-03-25,Malvern,50-59,MALE,RESOLVED,0,0,0,4948,29573,0.334200,5.65,76.5
1,2020-03-20,Malvern,20-29,MALE,RESOLVED,1,0,0,4948,29573,0.334200,7.04,80.5
2,2020-03-04,Malvern,60-69,FEMALE,RESOLVED,1,1,1,4948,29573,0.334200,3.35,71.5
3,2020-05-02,Rouge,50-59,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,10.60,63.0
4,2020-05-31,Rouge,30-39,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,11.45,58.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13069,2020-05-18,West Humber-Clairville,50-59,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,13.20,79.5
13070,2020-04-12,West Humber-Clairville,30-39,MALE,RESOLVED,0,0,0,1117,31771,0.281220,8.79,62.5
13071,2020-05-12,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,2.40,54.5
13072,2020-05-23,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,18.79,68.5


#### Pre-Processing

In [17]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [9]:
toronto_df.dtypes

episode_date              datetime64[ns]
neighbourhood_name                object
age_group                         object
gender                            object
outcome                           object
ever_hospitalized                  int64
ever_in_icu                        int64
ever_intubated                     int64
population_density                 int64
average_income                     int64
commute_public_transit           float64
avg_temperature                  float64
avg_relative_humidity            float64
dtype: object

In [63]:
# Creating DataFrame with Outcome & Dependent Variables Required for ML Models
df = toronto_df[['outcome','age_group','gender','population_density','average_income','commute_public_transit']]
# Drop Active Cases and Unknown Gender
df = df[(df['outcome'] != 'ACTIVE') & (df['gender'] != 'UNKNOWN')]
df

Unnamed: 0,outcome,age_group,gender,population_density,average_income
0,RESOLVED,50-59,MALE,4948,29573
1,RESOLVED,20-29,MALE,4948,29573
2,RESOLVED,60-69,FEMALE,4948,29573
3,RESOLVED,50-59,FEMALE,1260,39556
4,RESOLVED,30-39,FEMALE,1260,39556
...,...,...,...,...,...
13069,RESOLVED,50-59,FEMALE,1117,31771
13070,RESOLVED,30-39,MALE,1117,31771
13071,RESOLVED,20-29,FEMALE,1117,31771
13072,RESOLVED,20-29,FEMALE,1117,31771


In [64]:
# Inspecting for Null Values
for column in df.columns:
    print(f"Column {column} has {df[column].isnull().sum()} null values")

Column outcome has 0 null values
Column age_group has 0 null values
Column gender has 0 null values
Column population_density has 0 null values
Column average_income has 0 null values


In [65]:
# Create Categorical Variable List
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()
df_cat

['outcome', 'age_group', 'gender']

In [66]:
# Create the OneHotEncoder Instance
enc = OneHotEncoder(sparse=False)

# Fit the Encoder and Produce Encoded DataFrame
encoded_df = pd.DataFrame(enc.fit_transform(df[df_cat]))

In [67]:
# Rename Encoded Columns
encoded_df.columns = enc.get_feature_names(df_cat)
encoded_df

Unnamed: 0,outcome_FATAL,outcome_RESOLVED,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER,gender_TRANSGENDER
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12349,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12350,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12351,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12352,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [68]:
df = df.merge(encoded_df, left_index=True, right_index=True)
df = df.drop(df_cat,1)
df

Unnamed: 0,population_density,average_income,outcome_FATAL,outcome_RESOLVED,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER,gender_TRANSGENDER
0,4948,29573,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4948,29573,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,4948,29573,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1260,39556,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,1260,39556,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12349,7291,26548,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12350,7291,26548,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12351,7291,26548,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
12352,7291,26548,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [69]:
# Split our preprocessed data into our features and target arrays
y = df['outcome_RESOLVED'].values
X = df.drop(['outcome_FATAL','outcome_RESOLVED'],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [70]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [71]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 6
hidden_nodes_layer3 = 2

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation="relu"))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 8)                 128       
_________________________________________________________________
dense_11 (Dense)             (None, 6)                 54        
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 14        
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 3         
Total params: 199
Trainable params: 199
Non-trainable params: 0
_________________________________________________________________


In [72]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [73]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=50) #epochs (run through the data)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [74]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2916/2916 - 0s - loss: 0.2762 - acc: 0.9211
Loss: 0.27616893129093656, Accuracy: 0.9211248159408569


In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [60]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=196, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.907
