# **Toronto Covid-19 Cases**

## 1. Load Data from PostgreSQL

In [2]:
# Import Dependencies
import sys
from urllib.parse import quote

import pandas as pd

# SQL 
from sqlalchemy import create_engine
sys.path.append('../')
from config import db_password

# ML Models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [3]:
# Create Connection Between PostgreSQL DB
# SQLAlchemy 1.4+ only recognises the "postgresql://" scheme (the legacy
# "postgres://" alias was removed). The password is percent-encoded so that
# characters such as "@", ":" or "/" cannot corrupt the connection URL.
db_string = (
    f"postgresql://postgres:{quote(db_password, safe='')}"
    "@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
)
db = create_engine(db_string)

In [4]:
# PostgreSQL Query
# Starts from the rows of "Toronto_Cases" and joins in:
#   * "Toronto_Stats"   - INNER JOIN on neighbourhood_name
#   * "Toronto_Commute" - LEFT JOIN on neighbourhood_name, through a sub-query
#                         that divides each commute mode by commute_total
#   * "Toronto_Weather" - LEFT JOIN on episode_date = date
# Of the shares computed in the commute sub-query, only
# commute_public_transit appears in the outer SELECT list.

q = '''
SELECT episode_date, tc.neighbourhood_name, age_group, gender, outcome, ever_hospitalized, ever_in_icu, ever_intubated, population_density, average_income, commute_public_transit, avg_temperature, avg_relative_humidity
FROM "Toronto_Cases" tc
INNER JOIN "Toronto_Stats" ts ON tc.neighbourhood_name = ts.neighbourhood_name
LEFT JOIN (SELECT neighbourhood_name, (commute_car_driver::NUMERIC + commute_car_passenger::NUMERIC) / commute_total::NUMERIC AS "commute_car",
commute_public_transit::NUMERIC / commute_total::NUMERIC AS "commute_public_transit", commute_walk::NUMERIC / commute_total::NUMERIC AS "commute_walk",
commute_bicycle::NUMERIC / commute_total::NUMERIC AS "commute_bicycle", commute_other::NUMERIC / commute_total::NUMERIC AS "commute_other"
FROM "Toronto_Commute"
) commute ON tc.neighbourhood_name = commute.neighbourhood_name
LEFT JOIN "Toronto_Weather" tw ON tc.episode_date = tw.date
'''

In [5]:
# Run the query through the SQLAlchemy engine and collect the rows in a DataFrame
toronto_df = pd.read_sql(q, con=db)

## 2. Data Clean Up

In [6]:
# Preview of DataFrame (the rendered output shows the leading and trailing
# rows with the middle elided)
toronto_df

Unnamed: 0,episode_date,neighbourhood_name,age_group,gender,outcome,ever_hospitalized,ever_in_icu,ever_intubated,population_density,average_income,commute_public_transit,avg_temperature,avg_relative_humidity
0,2020-03-25,Malvern,50-59,MALE,RESOLVED,0,0,0,4948,29573,0.334200,5.65,76.5
1,2020-03-20,Malvern,20-29,MALE,RESOLVED,1,0,0,4948,29573,0.334200,7.04,80.5
2,2020-03-04,Malvern,60-69,FEMALE,RESOLVED,1,1,1,4948,29573,0.334200,3.35,71.5
3,2020-05-02,Rouge,50-59,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,10.60,63.0
4,2020-05-31,Rouge,30-39,FEMALE,RESOLVED,0,0,0,1260,39556,0.276047,11.45,58.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13069,2020-05-18,West Humber-Clairville,50-59,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,13.20,79.5
13070,2020-04-12,West Humber-Clairville,30-39,MALE,RESOLVED,0,0,0,1117,31771,0.281220,8.79,62.5
13071,2020-05-12,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,2.40,54.5
13072,2020-05-23,West Humber-Clairville,20-29,FEMALE,RESOLVED,0,0,0,1117,31771,0.281220,18.79,68.5


In [7]:
# Check Data Types of every column loaded from the query
toronto_df.dtypes

episode_date              datetime64[ns]
neighbourhood_name                object
age_group                         object
gender                            object
outcome                           object
ever_hospitalized                  int64
ever_in_icu                        int64
ever_intubated                     int64
population_density                 int64
average_income                     int64
commute_public_transit           float64
avg_temperature                  float64
avg_relative_humidity            float64
dtype: object

In [8]:
# Count the missing values of every column once, then report them one per line
null_counts = toronto_df.isnull().sum()
for column, n_missing in null_counts.items():
    print(f"Column {column} has {n_missing} null values")

SyntaxError: invalid syntax (<ipython-input-8-3dd7069b3965>, line 3)

In [None]:
# Pairwise correlation of the neighbourhood-level (aggregated) variables
aggregated_columns = ['population_density', 'average_income', 'commute_public_transit']
toronto_df[aggregated_columns].corr()

In [None]:
# Drop Active Cases
# .copy() makes `df` an independent frame, so the assignment below no longer
# writes into a slice of toronto_df (SettingWithCopyWarning).
# reset_index() restores a contiguous 0..n-1 index; the index-based merges
# with the one-hot encoded frames in sections 3 and 4 depend on it to pair
# each case with its own encoded row.
df = toronto_df[toronto_df['outcome'] != 'ACTIVE'].copy().reset_index(drop=True)

# Replace UNKNOWN and TRANSGENDER by OTHER
# A single vectorised replace covers the whole list (mapping 'OTHER' to
# itself is a no-op, kept only so the list documents the merged categories).
Other_Gender = ['UNKNOWN','TRANSGENDER','OTHER']
df['gender'] = df['gender'].replace(Other_Gender, "OTHER")

## 3. Binary Classification Model for Fatality
#### 3.1 Pre-Processing (Encode, Split & Scaling)

In [None]:
# Keep only the target (outcome) and the predictor columns used by the fatality models
fatality_columns = [
    'outcome',
    'age_group',
    'gender',
    'population_density',
    'average_income',
    'commute_public_transit',
]
fatality_df = df[fatality_columns]
fatality_df.head()

In [None]:
# Names of the object-typed (categorical) columns, in frame order
fatality_df_cat = [
    column_name
    for column_name, column_dtype in fatality_df.dtypes.items()
    if column_dtype == "object"
]
fatality_df_cat

In [None]:
# Create the OneHotEncoder Instance
# The encoder's default sparse output is used and densified below: the
# keyword that requests dense output was renamed from `sparse` to
# `sparse_output` (scikit-learn 1.2) and the old spelling was later removed,
# so neither keyword works on every release.
enc = OneHotEncoder()

# Fit the Encoder and Produce Encoded DataFrame
# Reusing fatality_df's index keeps every encoded row attached to the case it
# came from when the two frames are merged on their indexes afterwards.
encoded_df = pd.DataFrame(
    enc.fit_transform(fatality_df[fatality_df_cat]).toarray(),
    index=fatality_df.index,
)

In [None]:
# Rename Encoded Columns
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back to the old method
# only on releases that do not provide it.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encoded_df.columns = feature_namer(fatality_df_cat)
encoded_df.head(2)

In [None]:
# Attach the one-hot columns to the remaining columns (rows are paired by
# index label), then drop the original categorical columns they replace.
fatality_df = fatality_df.merge(encoded_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
fatality_df = fatality_df.drop(columns=fatality_df_cat)
fatality_df

In [None]:
# Split our preprocessed data into our features and target arrays
# The target is the one-hot outcome_RESOLVED column, i.e. the positive class
# (1) is a resolved case. Both outcome_* columns are removed from the
# features so the label cannot leak into X.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
y = fatality_df['outcome_RESOLVED'].values
X = fatality_df.drop(columns=['outcome_FATAL','outcome_RESOLVED']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

#### 3.2. Building ML Models

In [None]:
# Define the model - deep neural net
number_input_features = X_train.shape[1]
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 4

# Two ReLU hidden layers followed by a single sigmoid output unit
nn = tf.keras.models.Sequential([
    tf.keras.layers.Dense(
        units=hidden_nodes_layer1,
        input_dim=number_input_features,
        activation="relu",
    ),
    tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"),
    tf.keras.layers.Dense(units=1, activation="sigmoid"),
])

# Check the structure of the model
nn.summary()

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train the model for 50 epochs (one epoch = one pass over the training data)
fit_model = nn.fit(
    X_train_scaled,
    y_train,
    epochs=50,
)

In [None]:
# Score the trained network on the held-out test split
evaluation = nn.evaluate(X_test_scaled, y_test, verbose=2)
model_loss, model_accuracy = evaluation
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

In [None]:
# Build and fit a 196-tree random forest (seeded) on the scaled training split
rf_model = RandomForestClassifier(random_state=78, n_estimators=196).fit(
    X_train_scaled, y_train
)

# Predict the test split and report the share of correct predictions
y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

In [None]:
# Calculate feature importance in the Random Forest model.
# The values follow the column order of X (the fatality_df columns left after
# the two outcome_* columns are dropped).
importances = rf_model.feature_importances_
importances

In [None]:
# We can sort the features by their importance.
# Each importance is paired with the name of the feature column it belongs to.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
sorted(
    zip(
        rf_model.feature_importances_,
        fatality_df.drop(columns=['outcome_FATAL','outcome_RESOLVED']).columns,
    ),
    reverse=True,
)

In [None]:
# Keep the fatality model's importances under their own name: rf_model is
# re-assigned when the hospitalization model is trained in section 4.
fatality_feature_importance = rf_model.feature_importances_

## 4. Binary Classification Model for Hospitalization
#### 4.1 Pre-Processing (Encode, Split & Scaling)

In [None]:
# Keep only the target (ever_hospitalized) and the predictor columns used by the model
hospitalized_columns = [
    'ever_hospitalized',
    'age_group',
    'gender',
    'population_density',
    'average_income',
    'commute_public_transit',
]
hospitalized_df = df[hospitalized_columns]
hospitalized_df.head()

In [None]:
# Names of the object-typed (categorical) columns, in frame order
hospitalized_df_cat = [
    column_name
    for column_name, column_dtype in hospitalized_df.dtypes.items()
    if column_dtype == "object"
]
hospitalized_df_cat

In [None]:
# Create the OneHotEncoder Instance
# The encoder's default sparse output is used and densified below: the
# keyword that requests dense output was renamed from `sparse` to
# `sparse_output` (scikit-learn 1.2) and the old spelling was later removed,
# so neither keyword works on every release.
enc = OneHotEncoder()

# Fit the Encoder and Produce Encoded DataFrame
# Reusing hospitalized_df's index keeps every encoded row attached to the
# case it came from when the two frames are merged on their indexes afterwards.
encoded_df = pd.DataFrame(
    enc.fit_transform(hospitalized_df[hospitalized_df_cat]).toarray(),
    index=hospitalized_df.index,
)

In [None]:
# Rename Encoded Columns
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back to the old method
# only on releases that do not provide it.
feature_namer = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encoded_df.columns = feature_namer(hospitalized_df_cat)
encoded_df.head(2)

In [None]:
# Attach the one-hot columns to the remaining columns (rows are paired by
# index label), then drop the original categorical columns they replace.
hospitalized_df = hospitalized_df.merge(encoded_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
hospitalized_df = hospitalized_df.drop(columns=hospitalized_df_cat)
hospitalized_df

In [None]:
# Split our preprocessed data into our features and target arrays
# The target is the ever_hospitalized flag; it is removed from the features
# so the label cannot leak into X.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
y = hospitalized_df['ever_hospitalized'].values
X = hospitalized_df.drop(columns=['ever_hospitalized']).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

#### 4.2. Building ML Models

In [None]:
# Build and fit a 196-tree random forest (seeded) on the scaled training split
rf_model = RandomForestClassifier(random_state=78, n_estimators=196).fit(
    X_train_scaled, y_train
)

# Predict the test split and report the share of correct predictions
y_pred = rf_model.predict(X_test_scaled)
rf_accuracy = accuracy_score(y_test, y_pred)
print(f" Random forest predictive accuracy: {rf_accuracy:.3f}")

In [None]:
# Calculate feature importance in the Random Forest model.
# The values follow the column order of X (the hospitalized_df columns left
# after ever_hospitalized is dropped).
importances = rf_model.feature_importances_
importances

In [None]:
# We can sort the features by their importance.
# Each importance is paired with the name of the feature column it belongs to.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
sorted(
    zip(
        rf_model.feature_importances_,
        hospitalized_df.drop(columns=['ever_hospitalized']).columns,
    ),
    reverse=True,
)

In [None]:
# Keep the hospitalization model's importances under their own name so they
# can be compared with fatality_feature_importance in the findings.
hospitalized_feature_importance = rf_model.feature_importances_

## 5. Findings

In [None]:
# Side-by-side feature importances of the two random forest models.
# The feature names are taken from hospitalized_df; this assumes the fatality
# model's features are in the same order (both frames are built from the same
# predictor columns) - verify if either feature set changes.
# `columns=` replaces the positional axis argument, which pandas 2.0 removed.
result = {'Variables': hospitalized_df.drop(columns=['ever_hospitalized']).columns, 
          'Fatality': fatality_feature_importance, 
          'Hospitalized': hospitalized_feature_importance}

In [None]:
# Show the comparison as a table: one row per feature, one column per model
pd.DataFrame(result)