In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import datetime as dt
import statsmodels.api as sm
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.kernel_approximation import RBFSampler
ord_enc = OrdinalEncoder()
%matplotlib inline

# Data Preprocessing 

In [25]:
data = pd.read_csv("C:\\Users\\nidhi19\\OneDrive - York University\\Documents\\Varun\\CIND 820 Project\\COVID19 cases - June 30th.csv")
data = data.dropna().reset_index()
data = data.drop(columns=['index', '_id', 'Assigned_ID'])

In [26]:
data['Episode Date'] = pd.to_datetime(data['Episode Date'])
data['Episode Date'] = (data['Episode Date'] - data['Episode Date'].min()).dt.days

data['Reported Date'] = pd.to_datetime(data['Reported Date'])
data['Reported Date'] = (data['Reported Date'] - data['Reported Date'].min()).dt.days

In [27]:
data.head(5)

Unnamed: 0,Outbreak Associated,Age Group,Neighbourhood Name,FSA,Source of Infection,Classification,Episode Date,Reported Date,Client Gender,Outcome,Currently Hospitalized,Currently in ICU,Currently Intubated,Ever Hospitalized,Ever in ICU,Ever Intubated
0,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,1,0,FEMALE,RESOLVED,No,No,No,No,No,No
1,Sporadic,50 to 59 Years,Willowdale East,M2N,Travel,CONFIRMED,0,0,MALE,RESOLVED,No,No,No,Yes,No,No
2,Sporadic,20 to 29 Years,Parkwoods-Donalda,M3A,Travel,CONFIRMED,15,29,FEMALE,RESOLVED,No,No,No,No,No,No
3,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,26,33,FEMALE,RESOLVED,No,No,No,No,No,No
4,Sporadic,60 to 69 Years,Church-Yonge Corridor,M4W,Travel,CONFIRMED,30,34,MALE,RESOLVED,No,No,No,No,No,No


In [28]:
# we have to convert categorical data into numerical values
data_encoded = data.copy()
for col in data.columns:
    data_encoded["Encoded_"+col] = ord_enc.fit_transform(data[[col]])

In [29]:
data_encoded.columns

Index(['Outbreak Associated', 'Age Group', 'Neighbourhood Name', 'FSA',
       'Source of Infection', 'Classification', 'Episode Date',
       'Reported Date', 'Client Gender', 'Outcome', 'Currently Hospitalized',
       'Currently in ICU', 'Currently Intubated', 'Ever Hospitalized',
       'Ever in ICU', 'Ever Intubated', 'Encoded_Outbreak Associated',
       'Encoded_Age Group', 'Encoded_Neighbourhood Name', 'Encoded_FSA',
       'Encoded_Source of Infection', 'Encoded_Classification',
       'Encoded_Episode Date', 'Encoded_Reported Date',
       'Encoded_Client Gender', 'Encoded_Outcome',
       'Encoded_Currently Hospitalized', 'Encoded_Currently in ICU',
       'Encoded_Currently Intubated', 'Encoded_Ever Hospitalized',
       'Encoded_Ever in ICU', 'Encoded_Ever Intubated'],
      dtype='object')

In [30]:
feats = ['Encoded_Outbreak Associated',
       'Encoded_Age Group', 'Encoded_Neighbourhood Name', 'Encoded_FSA',
       'Encoded_Source of Infection', 'Encoded_Classification',
       'Encoded_Episode Date', 'Encoded_Reported Date',
       'Encoded_Client Gender', 'Encoded_Outcome',
       'Encoded_Currently Hospitalized', 'Encoded_Currently in ICU',
       'Encoded_Currently Intubated']

dependent_variable = 'Encoded_Ever Hospitalized'

In [32]:
X = data_encoded[feats]
Y = data_encoded[dependent_variable]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Logistic Regression

In [34]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [35]:
y_pred = model.predict(X_test)

In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97     51502
         1.0       0.61      0.16      0.25      3596

    accuracy                           0.94     55098
   macro avg       0.78      0.57      0.61     55098
weighted avg       0.92      0.94      0.92     55098



# Random Forest 

In [37]:
model = RandomForestClassifier()
model.fit(X_train, y_train)

RandomForestClassifier()

In [38]:
y_pred = model.predict(X_test)

In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.95      0.99      0.97     51502
         1.0       0.64      0.26      0.36      3596

    accuracy                           0.94     55098
   macro avg       0.80      0.62      0.67     55098
weighted avg       0.93      0.94      0.93     55098



# Stochastic Gradient Descent Classifier

In [40]:
model = SGDClassifier()
model.fit(X_train, y_train)

SGDClassifier()

In [41]:
y_pred = model.predict(X_test)

In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.94      0.99      0.97     51502
         1.0       0.47      0.07      0.12      3596

    accuracy                           0.93     55098
   macro avg       0.71      0.53      0.54     55098
weighted avg       0.91      0.93      0.91     55098



# SGD Classifier with kernel approximation


In [43]:
rbf_feature = RBFSampler(gamma=1, random_state=42)
X_features = rbf_feature.fit_transform(X_train)
model = SGDClassifier(max_iter=5)
model.fit(X_features, y_train)



SGDClassifier(max_iter=5)

In [44]:
y_pred = model.predict(rbf_feature.fit_transform(X_test))

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.97     51502
         1.0       0.00      0.00      0.00      3596

    accuracy                           0.93     55098
   macro avg       0.47      0.50      0.48     55098
weighted avg       0.87      0.93      0.90     55098



  _warn_prf(average, modifier, msg_start, len(result))
