## Load Toronto Data

In [1]:
# Import dependencies
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests

In [2]:
# Get the dataset metadata by passing the package id to the package_show endpoint.
# For example, to retrieve the metadata for this dataset:

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/package_show"
params = {"id": "64b54586-6180-4485-83eb-81e8fae3b8fe"}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# KeyError/JSON decode error further down.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
package = response.json()
print(package["result"])

{'license_title': 'Open Government Licence – Toronto', 'owner_unit': 'Communicable Disease Surveillance Unit', 'relationships_as_object': [], 'topics': 'Health', 'owner_email': 'cdsu@toronto.ca', 'excerpt': 'Line-listed report of COVID-19 cases among Toronto residents, including demographic, severity, geographical, and epidemiological variables.', 'private': False, 'owner_division': 'Toronto Public Health', 'num_tags': 7, 'id': '64b54586-6180-4485-83eb-81e8fae3b8fe', 'metadata_created': '2020-07-10T14:00:54.343339', 'refresh_rate': 'Weekly', 'title': 'COVID-19 Cases in Toronto', 'license_url': 'https://open.toronto.ca/open-data-license/', 'state': 'active', 'information_url': 'https://www.toronto.ca/home/covid-19/covid-19-latest-city-of-toronto-news/covid-19-status-of-cases-in-toronto/', 'license_id': 'open-government-licence-toronto', 'type': 'dataset', 'resources': [{'cache_last_updated': None, 'package_id': '64b54586-6180-4485-83eb-81e8fae3b8fe', 'datastore_active': True, 'id': 'e5b

In [3]:
# Get the data by passing the resource_id to the datastore_search endpoint
# See https://docs.ckan.org/en/latest/maintaining/datastore.html for detailed parameters options
# For example, to retrieve the data content for the first resource in the datastore:

url = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/datastore_search"

for resource in package["result"]["resources"]:
    if not resource["datastore_active"]:
        continue

    p = {"id": resource["id"], "limit": 32000}
    response = requests.get(url, params=p, timeout=60)
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data["result"]["records"])

    # The request is capped by "limit"; when the API reports a larger total,
    # say so instead of silently analysing a truncated frame.
    total = data["result"].get("total")
    if total is not None and total > len(df):
        print(f"Warning: loaded {len(df)} of {total} records; raise 'limit' or page with 'offset'.")

    # Stop at the first datastore-active resource so later resources neither
    # trigger extra downloads nor overwrite df.
    break
else:
    # Fail here rather than with a NameError on df in a later cell.
    raise RuntimeError("No datastore-active resource found in the package metadata.")

In [4]:
# Preview the first five records of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,_id,Outbreak Associated,Age Group,Neighbourhood Name,FSA,Source of Infection,Classification,Episode Date,Reported Date,Client Gender,Outcome,Currently Hospitalized,Currently in ICU,Currently Intubated,Ever Hospitalized,Ever in ICU,Ever Intubated
0,23613,Sporadic,40-49,Dovercourt-Wallace Emerson-Junction,M6H,Community,CONFIRMED,2020-05-26,2020-06-03,MALE,RESOLVED,No,No,No,No,No,No
1,23614,Sporadic,20-29,Dufferin Grove,M6H,Community,CONFIRMED,2020-04-24,2020-04-27,MALE,RESOLVED,No,No,No,No,No,No
2,23615,Sporadic,30-39,Dufferin Grove,M6H,Close contact,CONFIRMED,2020-04-27,2020-04-28,MALE,RESOLVED,No,No,No,No,No,No
3,23616,Sporadic,30-39,Dufferin Grove,M6H,Close contact,CONFIRMED,2020-04-21,2020-05-01,MALE,RESOLVED,No,No,No,No,No,No
4,23617,Sporadic,60-69,Dovercourt-Wallace Emerson-Junction,M6H,Pending,CONFIRMED,2020-04-15,2020-04-15,FEMALE,RESOLVED,No,No,No,Yes,Yes,No


In [14]:
# Renaming Column Name (Name suitable for PostgreSQL)
# Each source column is mapped to its new name explicitly, so the result does
# not depend on the order in which the API happens to return the fields.
column_renames = {
    '_id': 'ID',
    'Outbreak Associated': 'OutbreakAssociated',
    'Age Group': 'AgeGroup',
    'Neighbourhood Name': 'NeighbourhoodName',
    'FSA': 'FSA',
    'Source of Infection': 'InfectionSource',
    'Classification': 'Classification',
    'Episode Date': 'EpisodeDate',
    'Reported Date': 'ReportedDate',
    'Client Gender': 'Gender',
    'Outcome': 'Outcome',
    'Currently Hospitalized': 'CurrentlyHospitalized',
    'Currently in ICU': 'CurrentlyICU',
    'Currently Intubated': 'CurrentlyIntubated',
    'Ever Hospitalized': 'EverHospitalized',
    'Ever in ICU': 'EverICU',
    'Ever Intubated': 'EverIntubated',
}
new_column_name = list(column_renames.values())

# Accept either the original or the already-renamed name for every column, so
# re-running this cell is a no-op, but fail loudly on any schema drift.
missing = [src for src, dst in column_renames.items()
           if src not in df.columns and dst not in df.columns]
unexpected = [col for col in df.columns
              if col not in column_renames and col not in new_column_name]
if missing or unexpected:
    raise ValueError(f"Unexpected dataset schema: missing={missing}, unexpected={unexpected}")

df = df.rename(columns=column_renames)
df.head()

Unnamed: 0,ID,OutbreakAssociated,AgeGroup,NeighbourhoodName,FSA,InfectionSource,Classification,EpisodeDate,ReportedDate,Gender,Outcome,CurrentlyHospitalized,CurrentlyICU,CurrentlyIntubated,EverHospitalized,EverICU,EverIntubated
0,23613,Sporadic,40-49,Dovercourt-Wallace Emerson-Junction,M6H,Community,CONFIRMED,2020-05-26,2020-06-03,MALE,RESOLVED,No,No,No,No,No,No
1,23614,Sporadic,20-29,Dufferin Grove,M6H,Community,CONFIRMED,2020-04-24,2020-04-27,MALE,RESOLVED,No,No,No,No,No,No
2,23615,Sporadic,30-39,Dufferin Grove,M6H,Close contact,CONFIRMED,2020-04-27,2020-04-28,MALE,RESOLVED,No,No,No,No,No,No
3,23616,Sporadic,30-39,Dufferin Grove,M6H,Close contact,CONFIRMED,2020-04-21,2020-05-01,MALE,RESOLVED,No,No,No,No,No,No
4,23617,Sporadic,60-69,Dovercourt-Wallace Emerson-Junction,M6H,Pending,CONFIRMED,2020-04-15,2020-04-15,FEMALE,RESOLVED,No,No,No,Yes,Yes,No


In [15]:
# Changing Data Type
# Parse both date columns from their string form into pandas datetimes.
for date_column in ('EpisodeDate', 'ReportedDate'):
    df[date_column] = pd.to_datetime(df[date_column])

## Write Data to PostgreSQL DB

In [9]:
from sqlalchemy import create_engine
from config import db_password

In [10]:
# Build the connection URL for the PostgreSQL instance.
# - The dialect name must be "postgresql": SQLAlchemy 1.4+ no longer accepts
#   the legacy "postgres://" scheme.
# - The password is URL-escaped so characters such as "@", "/" or ":" cannot
#   corrupt the URL.
db_string = (
    f"postgresql://postgres:{quote_plus(str(db_password))}"
    "@module20covid.cgcfmenzscpu.us-east-2.rds.amazonaws.com:5432/postgres"
)
db = create_engine(db_string)

In [13]:
# Write the case records to the "Toronto_Cases" table through the engine
# created above; if_exists='replace' overwrites any existing table of that name.
df.to_sql(
    'Toronto_Cases',
    db,
    if_exists='replace',
)