# **Toronto Covid-19 Cases**

## 1. Load Data from PostgreSQL

In [3]:
# Import Dependencies
import pandas as pd

# SQL 
from sqlalchemy import create_engine
import sys
sys.path.append('../')
from config import db_password

# ML Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Create Connection Between PostgreSQL DB
db_string = f"postgres://postgres:{db_password}@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
db = create_engine(db_string)

In [5]:
# PostgreSQL Query

q = '''
SELECT episode_date, tc.neighbourhood_name, age_group, gender, outcome, ever_hospitalized, ever_in_icu, ever_intubated, population_density, average_income, commute_public_transit, avg_temperature, avg_relative_humidity
FROM "Toronto_Cases" tc
INNER JOIN "Toronto_Stats" ts ON tc.neighbourhood_name = ts.neighbourhood_name
LEFT JOIN (SELECT neighbourhood_name, (commute_car_driver::NUMERIC + commute_car_passenger::NUMERIC) / commute_total::NUMERIC AS "commute_car",
commute_public_transit::NUMERIC / commute_total::NUMERIC AS "commute_public_transit", commute_walk::NUMERIC / commute_total::NUMERIC AS "commute_walk",
commute_bicycle::NUMERIC / commute_total::NUMERIC AS "commute_bicycle", commute_other::NUMERIC / commute_total::NUMERIC AS "commute_other"
FROM "Toronto_Commute"
) commute ON tc.neighbourhood_name = commute.neighbourhood_name
LEFT JOIN "Toronto_Weather" tw ON tc.episode_date = tw.date
'''

In [6]:
# Execute SQL Query and Load Data into DataFrame
toronto_df = pd.read_sql(sql=q, con=db)

## 2. Data Clean Up

In [7]:
# Preview of DataFrame
toronto_df

Unnamed: 0,episode_date,neighbourhood_name,age_group,gender,outcome,ever_hospitalized,ever_in_icu,ever_intubated,population_density,average_income,commute_public_transit,avg_temperature,avg_relative_humidity
0,2020-03-25,Malvern,50-59,MALE,RESOLVED,0,0,0,4948,29573,0.334200,5.65,76.5
1,2020-03-20,Malvern,20-29,MALE,RESOLVED,1,0,0,4948,29573,0.334200,7.04,80.5
2,2020-03-04,Malvern,60-69,FEMALE,RESOLVED,1,1,1,4948,29573,0.334200,3.35,71.5
3,2020-05-02,Rouge,50-59,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,10.60,63.0
4,2020-05-31,Rouge,30-39,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,11.45,58.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13069,2020-05-18,West Humber-Clairville,50-59,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,13.20,79.5
13070,2020-04-12,West Humber-Clairville,30-39,MALE,RESOLVED,0,0,0,1117,31771,0.281220,8.79,62.5
13071,2020-05-12,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,2.40,54.5
13072,2020-05-23,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,18.79,68.5


In [8]:
# Check Data Types
toronto_df.dtypes

episode_date              datetime64[ns]
neighbourhood_name                object
age_group                         object
gender                            object
outcome                           object
ever_hospitalized                  int64
ever_in_icu                        int64
ever_intubated                     int64
population_density                 int64
average_income                     int64
commute_public_transit           float64
avg_temperature                  float64
avg_relative_humidity            float64
dtype: object

In [9]:
# Inspecting for Null Values
for column in toronto_df.columns:
    print(f"Column {column} has {toronto_df[column].isnull().sum()} null values")

Column episode_date has 0 null values
Column neighbourhood_name has 0 null values
Column age_group has 0 null values
Column gender has 0 null values
Column outcome has 0 null values
Column ever_hospitalized has 0 null values
Column ever_in_icu has 0 null values
Column ever_intubated has 0 null values
Column population_density has 0 null values
Column average_income has 0 null values
Column commute_public_transit has 0 null values
Column avg_temperature has 0 null values
Column avg_relative_humidity has 0 null values


In [10]:
# Checking the Correlation between Aggregated Values
toronto_df[['population_density', 'average_income', 'commute_public_transit']].corr()

Unnamed: 0,population_density,average_income,commute_public_transit
population_density,1.0,0.0198,0.501178
average_income,0.0198,1.0,-0.120009
commute_public_transit,0.501178,-0.120009,1.0


In [11]:
# Drop Active Cases
df = toronto_df[toronto_df['outcome'] != 'ACTIVE']
# Replace UNKNOWN and TRANSGENDER by OTHER
Other_Gender = ['UNKNOWN','TRANSGENDER','OTHER']
for gender in Other_Gender:
    df.gender = df.gender.replace(gender, "OTHER")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


## 3 Binary Classification Model for Fatality
#### 3.1 Pre-Processing (Encode, Split & Scaling)

In [12]:
# Creating DataFrame with Outcome & Dependent Variables Required for ML Models
fatality_df = df[['outcome','age_group','gender','population_density','average_income','commute_public_transit']]
fatality_df.head()

Unnamed: 0,outcome,age_group,gender,population_density,average_income,commute_public_transit
0,RESOLVED,50-59,MALE,4948,29573,0.3342
1,RESOLVED,20-29,MALE,4948,29573,0.3342
2,RESOLVED,60-69,FEMALE,4948,29573,0.3342
3,RESOLVED,50-59,FEMALE,1260,39556,0.276047
4,RESOLVED,30-39,FEMALE,1260,39556,0.276047


In [13]:
# Create Categorical Variable List
fatality_df_cat = fatality_df.dtypes[fatality_df.dtypes == "object"].index.tolist()
fatality_df_cat

['outcome', 'age_group', 'gender']

In [14]:
# Create the OneHotEncoder Instance
enc = OneHotEncoder(sparse=False)

# Fit the Encoder and Produce Encoded DataFrame
encoded_df = pd.DataFrame(enc.fit_transform(fatality_df[fatality_df_cat]))

In [15]:
# Rename Encoded Columns
encoded_df.columns = enc.get_feature_names(fatality_df_cat)
encoded_df.head(2)

Unnamed: 0,outcome_FATAL,outcome_RESOLVED,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER
0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [16]:
fatality_df = fatality_df.merge(encoded_df, left_index=True, right_index=True)
fatality_df = fatality_df.drop(fatality_df_cat,1)
fatality_df

Unnamed: 0,population_density,average_income,commute_public_transit,outcome_FATAL,outcome_RESOLVED,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER
0,4948,29573,0.334200,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,4948,29573,0.334200,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,4948,29573,0.334200,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1260,39556,0.276047,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1260,39556,0.276047,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12545,1117,31771,0.281220,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12546,1117,31771,0.281220,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12547,1117,31771,0.281220,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12548,1117,31771,0.281220,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
# Split our preprocessed data into our features and target arrays
y = fatality_df['outcome_RESOLVED'].values
X = fatality_df.drop(['outcome_FATAL','outcome_RESOLVED'],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [18]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#### 3.2. Building ML Models

In [19]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 =  8
hidden_nodes_layer2 = 4

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))
# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))
# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 8)                 128       
_________________________________________________________________
dense_1 (Dense)              (None, 4)                 36        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 169
Trainable params: 169
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [21]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=50) #epochs (run through the data)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [22]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3011/3011 - 0s - loss: 0.7954 - acc: 0.9090
Loss: 0.7954486224354018, Accuracy: 0.9090003371238708


In [23]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=196, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.899


In [24]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.19684362, 0.19826183, 0.19758314, 0.00751852, 0.01748069,
       0.01858758, 0.01819738, 0.01597564, 0.01419355, 0.03761977,
       0.10455556, 0.12748147, 0.01948401, 0.02025344, 0.00596378])

In [25]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, fatality_df.drop(['outcome_FATAL','outcome_RESOLVED'],1).columns), reverse=True)

[(0.19826182680528429, 'average_income'),
 (0.19758313685953122, 'commute_public_transit'),
 (0.19684361979267978, 'population_density'),
 (0.12748147137538093, 'age_group_90+'),
 (0.10455556133970516, 'age_group_80-89'),
 (0.03761976794038232, 'age_group_70-79'),
 (0.020253440749363555, 'gender_MALE'),
 (0.019484007581510327, 'gender_FEMALE'),
 (0.018587584082443376, 'age_group_30-39'),
 (0.018197384662472692, 'age_group_40-49'),
 (0.017480694134339936, 'age_group_20-29'),
 (0.01597564330960557, 'age_group_50-59'),
 (0.014193553531757724, 'age_group_60-69'),
 (0.0075185230849598855, 'age_group_19 and younger'),
 (0.005963784750583088, 'gender_OTHER')]

In [26]:
fatality_feature_importance = rf_model.feature_importances_

## 4. Binary Classification Model for Hospitalization
#### 4.1 Pre-Processing (Encode, Split & Scaling)

In [27]:
# Creating DataFrame with Outcome & Dependent Variables Required for ML Models
hospitalized_df = df[['ever_hospitalized','age_group','gender','population_density','average_income','commute_public_transit']]
hospitalized_df.head()

Unnamed: 0,ever_hospitalized,age_group,gender,population_density,average_income,commute_public_transit
0,0,50-59,MALE,4948,29573,0.3342
1,1,20-29,MALE,4948,29573,0.3342
2,1,60-69,FEMALE,4948,29573,0.3342
3,0,50-59,FEMALE,1260,39556,0.276047
4,0,30-39,FEMALE,1260,39556,0.276047


In [28]:
# Create Categorical Variable List
hospitalized_df_cat = hospitalized_df.dtypes[hospitalized_df.dtypes == "object"].index.tolist()
hospitalized_df_cat

['age_group', 'gender']

In [29]:
# Create the OneHotEncoder Instance
enc = OneHotEncoder(sparse=False)

# Fit the Encoder and Produce Encoded DataFrame
encoded_df = pd.DataFrame(enc.fit_transform(hospitalized_df[hospitalized_df_cat]))

In [30]:
# Rename Encoded Columns
encoded_df.columns = enc.get_feature_names(hospitalized_df_cat)
encoded_df.head(2)

Unnamed: 0,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
hospitalized_df = hospitalized_df.merge(encoded_df, left_index=True, right_index=True)
hospitalized_df = hospitalized_df.drop(hospitalized_df_cat,1)
hospitalized_df

Unnamed: 0,ever_hospitalized,population_density,average_income,commute_public_transit,age_group_19 and younger,age_group_20-29,age_group_30-39,age_group_40-49,age_group_50-59,age_group_60-69,age_group_70-79,age_group_80-89,age_group_90+,gender_FEMALE,gender_MALE,gender_OTHER
0,0,4948,29573,0.334200,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1,4948,29573,0.334200,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,4948,29573,0.334200,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,1260,39556,0.276047,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0,1260,39556,0.276047,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12545,0,1117,31771,0.281220,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12546,0,1117,31771,0.281220,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
12547,0,1117,31771,0.281220,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12548,0,1117,31771,0.281220,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [32]:
# Split our preprocessed data into our features and target arrays
y = hospitalized_df['ever_hospitalized'].values
X = hospitalized_df.drop(['ever_hospitalized'],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [33]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

#### 4.2. Building ML Models

In [34]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=196, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.860


In [35]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.24310792, 0.24722154, 0.24373812, 0.01548854, 0.02316191,
       0.02541611, 0.02695675, 0.02457687, 0.02494462, 0.02331429,
       0.02218504, 0.01684818, 0.028726  , 0.0283054 , 0.00600871])

In [36]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, hospitalized_df.drop(['ever_hospitalized'],1).columns), reverse=True)

[(0.24722154413677472, 'average_income'),
 (0.2437381206440662, 'commute_public_transit'),
 (0.24310792132474104, 'population_density'),
 (0.028725995402821867, 'gender_FEMALE'),
 (0.02830539991879378, 'gender_MALE'),
 (0.02695675398088839, 'age_group_40-49'),
 (0.025416107574907757, 'age_group_30-39'),
 (0.024944622031149395, 'age_group_60-69'),
 (0.024576866952763083, 'age_group_50-59'),
 (0.023314293399240307, 'age_group_70-79'),
 (0.02316190599941027, 'age_group_20-29'),
 (0.02218503556938772, 'age_group_80-89'),
 (0.016848182893926044, 'age_group_90+'),
 (0.015488535236256989, 'age_group_19 and younger'),
 (0.006008714934872302, 'gender_OTHER')]

In [37]:
hospitalized_feature_importance = rf_model.feature_importances_

## 5. Findings

In [38]:
result = {'Variables': hospitalized_df.drop(['ever_hospitalized'],1).columns, 
          'Fatality': fatality_feature_importance, 
          'Hospitalized': hospitalized_feature_importance}

In [39]:
pd.DataFrame(result)

Unnamed: 0,Variables,Fatality,Hospitalized
0,population_density,0.196844,0.243108
1,average_income,0.198262,0.247222
2,commute_public_transit,0.197583,0.243738
3,age_group_19 and younger,0.007519,0.015489
4,age_group_20-29,0.017481,0.023162
5,age_group_30-39,0.018588,0.025416
6,age_group_40-49,0.018197,0.026957
7,age_group_50-59,0.015976,0.024577
8,age_group_60-69,0.014194,0.024945
9,age_group_70-79,0.03762,0.023314
