In [1]:
# Dependencies and Setup
import requests
import time
import pandas as pd
import numpy as np
from pprint import pprint
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
import sklearn.datasets as dta
import scipy.stats as st
from scipy.stats import sem
from scipy.stats import linregress
import seaborn as sns

In [22]:
# Set the base URL
covid_url = 'https://api.opencovid.ca/timeseries?geo=pt&fill=false&version=true&pt_names=canonical&hr_names=hruid&fmt=json'
covid_confirmed_cases_response = requests.get(covid_url).json()
# Define an empty list to fetch the covid confirmed cases
covid_cases_data = []
# Print to logger
print("Beginning Data Retrieval     ")
for case in covid_confirmed_cases_response['data']['cases']:
    covid_cases_data.append({"Province": case['region'], "Date" : case['date'], "Cumulative confirmed cases": case['value'], "Confirmed cases per day": case['value_daily']})            
# Indicate that Data Loading is complete 
print("Data Retrieval Complete      ")

Beginning Data Retrieval     
Data Retrieval Complete      


In [23]:
covid_confirmed_cases_data_df = pd.DataFrame(covid_cases_data)
# Show Record Count
covid_confirmed_cases_data_df

Unnamed: 0,Province,Date,Cumulative confirmed cases,Confirmed cases per day
0,Alberta,2020-03-05,1,1
1,Alberta,2020-03-06,1,0
2,Alberta,2020-03-07,2,1
3,Alberta,2020-03-08,3,1
4,Alberta,2020-03-09,7,4
...,...,...,...,...
14390,Yukon,2022-11-06,5578,5
14391,Yukon,2022-11-07,5581,3
14392,Yukon,2022-11-08,5582,1
14393,Yukon,2022-11-09,5582,0


In [24]:
# Set the API base URL
covid_url = 'https://api.opencovid.ca/timeseries?geo=pt&fill=false&version=true&pt_names=canonical&hr_names=hruid&fmt=json'
covid_deaths_response = requests.get(covid_url).json()
# Define an empty list to fetch the death data
covid_death_data = []
# Print to logger
print("Beginning Data Retrieval     ")
for case in covid_deaths_response['data']['deaths']:
    covid_death_data.append({"Province": case['region'], "Date" : case['date'], "Cumulative death cases": case['value'], "Deaths per day": case['value_daily']})            
# Indicate that Data Loading is complete 
print("Data Retrieval Complete      ")

Beginning Data Retrieval     
Data Retrieval Complete      


In [25]:
death_data_df = pd.DataFrame(covid_death_data)
death_data_df

Unnamed: 0,Province,Date,Cumulative death cases,Deaths per day
0,Alberta,2020-03-08,0,0
1,Alberta,2020-03-09,0,0
2,Alberta,2020-03-10,0,0
3,Alberta,2020-03-11,0,0
4,Alberta,2020-03-12,0,0
...,...,...,...,...
14292,Yukon,2022-11-01,32,0
14293,Yukon,2022-11-02,32,0
14294,Yukon,2022-11-03,32,0
14295,Yukon,2022-11-04,32,0


In [26]:
# Set the API base URL
covid_url = 'https://api.opencovid.ca/timeseries?geo=pt&fill=false&version=true&pt_names=canonical&hr_names=hruid&fmt=json'
covid_hospitalization_response = requests.get(covid_url).json()
# Define an empty list to fetch the death data
hospitalization_data = []
# Print to logger
print("Beginning Data Retrieval     ")
for case in covid_hospitalization_response['data']['hospitalizations']:
    hospitalization_data.append({"Province": case['region'], "Date" : case['date'], "Cumulative hospitalizations cases": case['value'], "hospitalizations per day": case['value_daily']})            
# Indicate that Data Loading is complete 
print("Data Retrieval Complete      ")

Beginning Data Retrieval     
Data Retrieval Complete      


In [27]:
hospitalization_data_df = pd.DataFrame(hospitalization_data)
hospitalization_data_df

Unnamed: 0,Province,Date,Cumulative hospitalizations cases,hospitalizations per day
0,Alberta,2020-01-25,0,0
1,Alberta,2020-01-26,0,0
2,Alberta,2020-01-27,0,0
3,Alberta,2020-01-28,0,0
4,Alberta,2020-01-29,0,0
...,...,...,...,...
13999,Yukon,2022-11-13,0,0
14000,Yukon,2022-11-14,0,0
14001,Yukon,2022-11-15,0,0
14002,Yukon,2022-11-16,0,0


In [28]:
# Set the base URL
covid_url = 'https://api.opencovid.ca/timeseries?geo=pt&fill=false&version=true&pt_names=canonical&hr_names=hruid&fmt=json'
covid_vaccinationdose1_coverage_response = requests.get(covid_url).json()
# Define an empty list to fetch the covid confirmed cases
covid_vaccinationdose1_coverage_data = []
# Print to logger
print("Beginning Data Retrieval     ")
for case in covid_vaccinationdose1_coverage_response['data']['vaccine_coverage_dose_1']:
    covid_vaccinationdose1_coverage_data.append({"Province": case['region'], "Date" : case['date'], "Cumulative vaccine coverage dose1": case['value'], "Vaccine coverage dose1 per day": case['value_daily']})            
# Indicate that Data Loading is complete 
print("Data Retrieval Complete      ")

Beginning Data Retrieval     
Data Retrieval Complete      


In [30]:
covid_vaccinationdose1_coverage_data_df = pd.DataFrame(covid_vaccinationdose1_coverage_data)
# Show Record Count
covid_vaccinationdose1_coverage_data_df.head(3)

Unnamed: 0,Province,Date,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day
0,Alberta,2020-12-19,0.0,0.0
1,Alberta,2020-12-20,0.0,0.0
2,Alberta,2020-12-21,0.0,0.0


In [31]:
# Mering the three DataFrames named as covid_confirmed_cases_data_df, death_data_df and hospitalization_data_df on columns Province and Date as they are common columns in all of the dataframes
# covid_combined_df = pd.merge(covid_confirmed_cases_data_df, pd.merge(death_data_df, hospitalization_data_df, on=['Province', 'Date']), on=['Province', 'Date'])
# covid_combined_df
covid_combined_df = covid_confirmed_cases_data_df.merge(death_data_df, on=['Province', 'Date']).merge(hospitalization_data_df, on=['Province', 'Date']).merge(covid_vaccinationdose1_coverage_data_df, on=['Province', 'Date'])
covid_combined_df.describe()

Unnamed: 0,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day
count,9266.0,9266.0,9266.0,9266.0,9266.0,9266.0,9266.0,9266.0
mean,220652.9,422.826139,2750.898878,3.817721,306.92316,-0.005936,65.462174,0.119264
std,354936.0,1155.429091,4287.182875,9.558755,504.427551,22.991297,28.684811,0.684302
min,24.0,-4.0,0.0,-12.0,0.0,-314.0,0.0,-9.1
25%,2435.25,0.0,21.0,0.0,0.0,0.0,60.275,0.0
50%,55812.0,58.0,571.5,0.0,69.0,0.0,79.0,0.0
75%,309262.8,409.0,3300.0,3.0,362.75,0.0,83.9,0.0
max,1614845.0,19509.0,16485.0,112.0,4190.0,272.0,95.7,17.5


In [32]:
covid_combined_final_df = covid_combined_df[(covid_combined_df['Cumulative confirmed cases'] >= 0) & (covid_combined_df['Confirmed cases per day'] >= 0) & (covid_combined_df['Cumulative death cases'] >= 0) & (covid_combined_df['Deaths per day'] >= 0) & (covid_combined_df['Cumulative hospitalizations cases'] >= 0)& (covid_combined_df['hospitalizations per day'] >= 0)& (covid_combined_df['Cumulative vaccine coverage dose1'] >= 0) & (covid_combined_df['Vaccine coverage dose1 per day'] >= 0)]
covid_combined_final_df = covid_combined_final_df.dropna()
covid_combined_final_df.describe()

Unnamed: 0,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day
count,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0
mean,172665.1,358.907855,2037.549606,2.78717,241.35458,4.938298,66.93153,0.115072
std,301340.4,1188.076589,3579.853194,8.635528,432.91207,18.411284,27.925885,0.6486
min,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1649.0,0.0,12.0,0.0,0.0,0.0,63.3,0.0
50%,47123.0,16.0,258.0,0.0,33.0,0.0,79.5,0.0
75%,155106.2,232.75,2280.25,1.0,292.0,0.0,84.0,0.0
max,1614711.0,19509.0,16483.0,112.0,4190.0,272.0,95.7,17.5


In [33]:
covid_combined_final_df['Mortality rate'] = round(((covid_combined_final_df['Cumulative death cases'] / covid_combined_final_df['Cumulative confirmed cases']) * 100), 3)
covid_combined_final_df['Daily mortality rate'] = round(((covid_combined_final_df['Deaths per day'] / covid_combined_final_df['Confirmed cases per day']) * 100), 3)
covid_combined_final_df.describe()

Unnamed: 0,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day,Mortality rate,Daily mortality rate
count,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,7358.0,4882.0
mean,172665.1,358.907855,2037.549606,2.78717,241.35458,4.938298,66.93153,0.115072,0.981959,inf
std,301340.4,1188.076589,3579.853194,8.635528,432.91207,18.411284,27.925885,0.6486,0.755054,
min,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1649.0,0.0,12.0,0.0,0.0,0.0,63.3,0.0,0.512,0.0
50%,47123.0,16.0,258.0,0.0,33.0,0.0,79.5,0.0,0.8945,0.0
75%,155106.2,232.75,2280.25,1.0,292.0,0.0,84.0,0.0,1.312,0.73
max,1614711.0,19509.0,16483.0,112.0,4190.0,272.0,95.7,17.5,4.505,inf


In [34]:
# sort the DataFrame by date in ascending order as covid_combined_final_sorted
covid_combined_final_df['Date'] = pd.to_datetime(covid_combined_final_df['Date'])
covid_combined_final_sorted = covid_combined_final_df.sort_values(by='Date')

In [35]:
# fill missing values with zero
covid_combined_final_sorted_filled = covid_combined_final_sorted.fillna(value=0)
# remove inf and replace with zero in column 'A'
covid_combined_final_sorted_filled['Daily mortality rate'] = covid_combined_final_sorted_filled['Daily mortality rate'].replace([np.inf, -np.inf], 0)

In [36]:
covid_combined_final_sorted_filled.head()

Unnamed: 0,Province,Date,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day,Mortality rate,Daily mortality rate
0,Alberta,2020-12-19,89993,1306,851,10,777,18,0.0,0.0,0.946,0.766
3981,Northwest Territories,2020-12-19,24,0,0,0,0,0,0.0,0.0,0.0,0.0
2560,New Brunswick,2020-12-19,574,1,8,0,3,0,0.0,0.0,1.394,0.0
7880,Saskatchewan,2020-12-19,13329,252,115,8,127,6,0.1,0.1,0.863,3.175
8579,Yukon,2020-12-19,69,0,1,0,0,0,0.0,0.0,1.449,0.0


In [37]:
covid_combined_final_sorted_filled.to_csv("Covid19_data_project.csv", index=False)

In [38]:
covid_dataframe_df = covid_combined_final_sorted_filled
covid_dataframe_df.head(5)

Unnamed: 0,Province,Date,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day,Mortality rate,Daily mortality rate
0,Alberta,2020-12-19,89993,1306,851,10,777,18,0.0,0.0,0.946,0.766
3981,Northwest Territories,2020-12-19,24,0,0,0,0,0,0.0,0.0,0.0,0.0
2560,New Brunswick,2020-12-19,574,1,8,0,3,0,0.0,0.0,1.394,0.0
7880,Saskatchewan,2020-12-19,13329,252,115,8,127,6,0.1,0.1,0.863,3.175
8579,Yukon,2020-12-19,69,0,1,0,0,0,0.0,0.0,1.449,0.0


In [39]:
# Assumptions made recovered data/Cured data was not available in the API so we assumed to use the cumulative hospitalizations cases to calculate the cumulative active cases
covid_dataframe_df['Cumulative Active Cases'] = covid_dataframe_df['Cumulative confirmed cases'] - (covid_dataframe_df['Cumulative death cases'] + covid_dataframe_df['Cumulative hospitalizations cases'])
covid_dataframe_df['Active Cases per day'] = covid_dataframe_df['Confirmed cases per day'] - (covid_dataframe_df['Deaths per day'] + covid_dataframe_df['hospitalizations per day'])
covid_dataframe_df.sample(5)

Unnamed: 0,Province,Date,Cumulative confirmed cases,Confirmed cases per day,Cumulative death cases,Deaths per day,Cumulative hospitalizations cases,hospitalizations per day,Cumulative vaccine coverage dose1,Vaccine coverage dose1 per day,Mortality rate,Daily mortality rate,Cumulative Active Cases,Active Cases per day
6842,Prince Edward Island,2021-10-08,306,0,0,0,0,0,80.7,0.0,0.0,0.0,306,0
6094,Ontario,2022-01-24,1027637,4653,11089,49,3924,23,81.8,0.0,1.079,1.053,1012624,4581
3891,Newfoundland and Labrador,2022-08-20,50742,53,221,0,3,0,95.3,0.0,0.436,0.0,50518,53
8657,Yukon,2021-03-07,86,0,1,0,0,0,33.8,0.0,1.163,0.0,85,0
3715,Newfoundland and Labrador,2022-02-25,23147,679,64,0,20,2,94.6,0.0,0.276,0.0,23063,677


In [40]:
Active_cases_per_province = covid_dataframe_df.groupby('Province').max()[['Cumulative Active Cases', 'Date']].sort_values(['Cumulative Active Cases'], ascending = False).reset_index()
# Active_cases_per_province = Active_cases_per_province.iloc[0:5]
Active_cases_per_province_df = Active_cases_per_province[['Province', 'Cumulative Active Cases']]
Active_cases_per_province_df = Active_cases_per_province_df.set_index('Province')
Active_cases_per_province_df.head(13)

Unnamed: 0_level_0,Cumulative Active Cases
Province,Unnamed: 1_level_1
Ontario,1597547
Quebec,1161158
Alberta,625662
British Columbia,393391
Manitoba,153062
Saskatchewan,145617
New Brunswick,83576
Nova Scotia,73053
Prince Edward Island,53893
Newfoundland and Labrador,52952
