## Load Toronto Data

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import requests

In [2]:
# Get the dataset metadata by passing the package id to the CKAN package_show endpoint.
# The id below identifies the dataset whose metadata is retrieved.

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = {"id": "64b54586-6180-4485-83eb-81e8fae3b8fe"}

# A timeout keeps the cell from hanging indefinitely, and raise_for_status()
# surfaces an HTTP failure here rather than as a KeyError on "result" below.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
package = response.json()
print(package["result"])

{'license_title': 'Open Government Licence – Toronto', 'owner_unit': 'Communicable Disease Surveillance Unit', 'relationships_as_object': [], 'topics': 'Health', 'owner_email': 'cdsu@toronto.ca', 'excerpt': 'Line-listed report of COVID-19 cases among Toronto residents, including demographic, severity, geographical, and epidemiological variables.', 'private': False, 'owner_division': 'Toronto Public Health', 'num_tags': 7, 'id': '64b54586-6180-4485-83eb-81e8fae3b8fe', 'metadata_created': '2020-07-10T14:00:54.343339', 'refresh_rate': 'Weekly', 'title': 'COVID-19 Cases in Toronto', 'license_url': 'https://open.toronto.ca/open-data-license/', 'state': 'active', 'information_url': 'https://www.toronto.ca/home/covid-19/covid-19-latest-city-of-toronto-news/covid-19-status-of-cases-in-toronto/', 'license_id': 'open-government-licence-toronto', 'type': 'dataset', 'resources': [{'cache_last_updated': None, 'package_id': '64b54586-6180-4485-83eb-81e8fae3b8fe', 'datastore_active': True, 'id': 'e5b

In [3]:
# Get the data by passing the resource_id to the datastore_search endpoint
# See https://docs.ckan.org/en/latest/maintaining/datastore.html for detailed parameters options
# Retrieve the records of the first datastore-active resource of the package.

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search"

for resource in package["result"]["resources"]:
    if not resource["datastore_active"]:
        continue
    p = {"id": resource["id"], "limit": 32000}
    response = requests.get(url, params=p, timeout=60)
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data["result"]["records"])

    # The request is capped by "limit"; report when the datastore holds more
    # rows than were returned so a truncated load does not go unnoticed.
    total = data["result"].get("total")
    if total is not None and total > len(df):
        print(f"Warning: loaded {len(df)} of {total} records; raise the limit or paginate.")

    # Stop at the first match so later resources cannot overwrite df.
    break
else:
    # The loop finished without finding any datastore-active resource.
    raise RuntimeError("No datastore-active resource found in the package metadata.")

In [4]:
# Show the DataFrame built from the datastore records as this cell's output.
df

Unnamed: 0,_id,Outbreak Associated,Age Group,Neighbourhood Name,FSA,Source of Infection,Classification,Episode Date,Reported Date,Client Gender,Outcome,Currently Hospitalized,Currently in ICU,Currently Intubated,Ever Hospitalized,Ever in ICU,Ever Intubated
0,44294,Sporadic,50-59,Malvern,M1B,Institutional,CONFIRMED,2020-03-25,2020-03-27,MALE,RESOLVED,No,No,No,No,No,No
1,44295,Sporadic,20-29,Malvern,M1B,Community,CONFIRMED,2020-03-20,2020-03-28,MALE,RESOLVED,No,No,No,Yes,No,No
2,44296,Sporadic,60-69,Malvern,M1B,Travel,CONFIRMED,2020-03-04,2020-03-08,FEMALE,RESOLVED,No,No,No,Yes,Yes,Yes
3,44297,Outbreak Associated,50-59,Rouge,M1B,N/A - Outbreak associated,CONFIRMED,2020-05-02,2020-05-04,FEMALE,RESOLVED,No,No,No,No,No,No
4,44298,Sporadic,30-39,Rouge,M1B,Close contact,CONFIRMED,2020-05-31,2020-06-06,FEMALE,RESOLVED,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14906,59200,Outbreak Associated,50-59,,,N/A - Outbreak associated,CONFIRMED,2020-06-11,2020-06-15,FEMALE,RESOLVED,No,No,No,No,No,No
14907,59201,Outbreak Associated,20-29,,,N/A - Outbreak associated,CONFIRMED,2020-05-09,2020-05-23,FEMALE,RESOLVED,No,No,No,No,No,No
14908,59202,Outbreak Associated,40-49,,,N/A - Outbreak associated,CONFIRMED,2020-06-18,2020-06-19,FEMALE,RESOLVED,No,No,No,No,No,No
14909,59203,Outbreak Associated,19 and younger,,,N/A - Outbreak associated,PROBABLE,2020-06-13,2020-06-13,MALE,RESOLVED,No,No,No,No,No,No


## Clean Up of Dataset

In [5]:
# Rename columns to lowercase snake_case names suitable for PostgreSQL.
# The mapping is keyed by the source column name, so a change in the order of
# the fields returned by the API cannot silently attach a label to the wrong column.
column_renames = {
    '_id': 'id',
    'Outbreak Associated': 'outbreak_associated',
    'Age Group': 'age_group',
    'Neighbourhood Name': 'neighbourhood_name',
    'FSA': 'fsa',
    'Source of Infection': 'infection_source',
    'Classification': 'classification',
    'Episode Date': 'episode_date',
    'Reported Date': 'reported_date',
    'Client Gender': 'gender',
    'Outcome': 'outcome',
    'Currently Hospitalized': 'currently_hospitalized',
    'Currently in ICU': 'currently_in_icu',
    'Currently Intubated': 'currently_intubated',
    'Ever Hospitalized': 'ever_hospitalized',
    'Ever in ICU': 'ever_in_icu',
    'Ever Intubated': 'ever_intubated',
}
new_column_name = list(column_renames.values())

df = df.rename(columns=column_renames)

# rename() ignores source names that are absent, so validate the result explicitly.
# Checking the target names (not the source names) keeps this cell safe to re-run.
missing_columns = [name for name in new_column_name if name not in df.columns]
unexpected_columns = [name for name in df.columns if name not in new_column_name]
if missing_columns or unexpected_columns:
    raise ValueError(
        f"Unexpected dataset schema: missing={missing_columns}, unexpected={unexpected_columns}"
    )

# Fix the column order so the resulting table layout does not depend on the API.
df = df[new_column_name]
df.head(2)

Unnamed: 0,id,outbreak_associated,age_group,neighbourhood_name,fsa,infection_source,classification,episode_date,reported_date,gender,outcome,currently_hospitalized,currently_in_icu,currently_intubated,ever_hospitalized,ever_in_icu,ever_intubated
0,44294,Sporadic,50-59,Malvern,M1B,Institutional,CONFIRMED,2020-03-25,2020-03-27,MALE,RESOLVED,No,No,No,No,No,No
1,44295,Sporadic,20-29,Malvern,M1B,Community,CONFIRMED,2020-03-20,2020-03-28,MALE,RESOLVED,No,No,No,Yes,No,No


In [6]:
# Parse the two date columns into pandas datetime values.
for date_column in ('episode_date', 'reported_date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [7]:
# Drop the rows whose age group is an empty string.
has_age_group = df['age_group'].ne('')
df = df.loc[has_age_group]

In [8]:
# Remove rows whose gender is recorded as OTHER or UNKNOWN.
# A row must be dropped when it matches either label, so membership is tested
# with isin() and negated. Joining two `!=` comparisons with `|` would be true
# for every row (no value equals both labels at once) and would filter nothing.
df = df[~df['gender'].isin(['OTHER', 'UNKNOWN'])]

In [9]:
# Keep only the rows that are not classified as PROBABLE.
not_probable = df['classification'].ne('PROBABLE')
df = df.loc[not_probable]

In [10]:
# Show the DataFrame after the filters above as this cell's output.
df

Unnamed: 0,id,outbreak_associated,age_group,neighbourhood_name,fsa,infection_source,classification,episode_date,reported_date,gender,outcome,currently_hospitalized,currently_in_icu,currently_intubated,ever_hospitalized,ever_in_icu,ever_intubated
0,44294,Sporadic,50-59,Malvern,M1B,Institutional,CONFIRMED,2020-03-25,2020-03-27,MALE,RESOLVED,No,No,No,No,No,No
1,44295,Sporadic,20-29,Malvern,M1B,Community,CONFIRMED,2020-03-20,2020-03-28,MALE,RESOLVED,No,No,No,Yes,No,No
2,44296,Sporadic,60-69,Malvern,M1B,Travel,CONFIRMED,2020-03-04,2020-03-08,FEMALE,RESOLVED,No,No,No,Yes,Yes,Yes
3,44297,Outbreak Associated,50-59,Rouge,M1B,N/A - Outbreak associated,CONFIRMED,2020-05-02,2020-05-04,FEMALE,RESOLVED,No,No,No,No,No,No
4,44298,Sporadic,30-39,Rouge,M1B,Close contact,CONFIRMED,2020-05-31,2020-06-06,FEMALE,RESOLVED,No,No,No,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14905,59199,Outbreak Associated,50-59,,,N/A - Outbreak associated,CONFIRMED,2020-06-02,2020-06-03,FEMALE,RESOLVED,No,No,No,No,No,No
14906,59200,Outbreak Associated,50-59,,,N/A - Outbreak associated,CONFIRMED,2020-06-11,2020-06-15,FEMALE,RESOLVED,No,No,No,No,No,No
14907,59201,Outbreak Associated,20-29,,,N/A - Outbreak associated,CONFIRMED,2020-05-09,2020-05-23,FEMALE,RESOLVED,No,No,No,No,No,No
14908,59202,Outbreak Associated,40-49,,,N/A - Outbreak associated,CONFIRMED,2020-06-18,2020-06-19,FEMALE,RESOLVED,No,No,No,No,No,No


## Write Data to PostgreSQL DB

In [24]:
from sqlalchemy import create_engine
import sys
# Add the parent directory to the module search path so that `config` can be imported
# (presumably config.py lives one level above this notebook - confirm).
sys.path.append('../')
# The database password is read from the local config module rather than written in the notebook.
from config import db_password

In [12]:
# Build the connection URL and create the SQLAlchemy engine.
# The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts the
# "postgres" alias and fails with NoSuchModuleError when it is used.
db_string = f"postgresql://postgres:{db_password}@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
db = create_engine(db_string)

In [13]:
# Write the DataFrame to the Toronto_Cases table, replacing the table if it already exists.
df.to_sql(
    name='Toronto_Cases',
    con=db,
    if_exists='replace',
)