# Open Data Toronto

Snippet taken from the ["For Developers" tab](https://open.toronto.ca/dataset/311-service-requests-customer-initiated/)

In [52]:
import requests
import time

# Toronto Open Data is stored in a CKAN instance. It's APIs are documented here:
# https://docs.ckan.org/en/latest/api/

# To hit our API, you'll be making requests to:
base_url = "https://ckan0.cf.opendata.inter.prod-toronto.ca"

# Datasets are called "packages". Each package can contain many "resources"
# To retrieve the metadata for this package and its resources, use the package name in this page's URL:
url = base_url + "/api/3/action/package_show"
params = {"id": "311-service-requests-customer-initiated"}
package = requests.get(url, params=params).json()

# To get resource data:
for idx, resource in enumerate(package["result"]["resources"]):
    if idx > 3:
        break
    if idx == 0:
        print(resource.keys())
    print("resource name: ", resource["name"])
    print("datastore active: ", resource["datastore_active"])
    print(resource["url"])
    # To get metadata for non datastore_active resources:
    if not resource["datastore_active"]:
        url = base_url + "/api/3/action/resource_show?id=" + resource["id"]
        resource_metadata = requests.get(url).json()
        # print(resource_metadata)
        # From here, you can use the "url" attribute to download this file
        print("name: ", resource_metadata["result"]["name"])
        print("url: ", resource_metadata["result"]["url"])
        time.sleep(0.5)

dict_keys(['cache_last_updated', 'cache_url', 'created', 'datastore_active', 'format', 'hash', 'id', 'is_datastore_cache_file', 'is_preview', 'last_modified', 'metadata_modified', 'mimetype', 'mimetype_inner', 'name', 'package_id', 'position', 'resource_type', 'revision_id', 'size', 'state', 'url', 'url_type'])
resource name:  311-service-requests-readme
datastore active:  False
https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/bb11ece8-a2b3-401a-92c0-fa115f867fff/download/311-service-requests-readme.xlsx
name:  311-service-requests-readme
url:  https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/bb11ece8-a2b3-401a-92c0-fa115f867fff/download/311-service-requests-readme.xlsx
resource name:  311 Service Requests 2023
datastore active:  False
https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/079766f3-815d-4257-8731-5ff6b0c84c13/download/

In [53]:
foo = "311 Service Requests 2023"
faa = "311-service-requests-2023"
print("2023" in foo)
print("2023" in faa)

True
True


In [55]:
BASE_URL = "https://ckan0.cf.opendata.inter.prod-toronto.ca/api/3/action/"


def get_package_metadata(
    action: str = "package_show",
    resource_id: str = "311-service-requests-customer-initiated",
):
    params = {"id": resource_id}
    package = requests.get(BASE_URL + action, params=params, timeout=5).json()
    return package


def parse_url(year: str = "2020") -> str:
    """Retrieve URL of 311 service call data given the year"""
    resource_metadata = get_package_metadata()["result"]["resources"]
    url = [
        resource["url"] for resource in resource_metadata if year in resource["name"]
    ][0]
    return url

In [56]:
parse_url()

'https://www.toronto.ca/data/311/opendata/servicerequest/SR2020.zip'

In [None]:
from pathlib import Path

In [57]:
foo = "https://www.toronto.ca/data/311/opendata/servicerequest/SR2020.zip"

In [20]:
foo.split("/")[-1]

['https:',
 '',
 'www.toronto.ca',
 'data',
 '311',
 'opendata',
 'servicerequest',
 'SR2020.zip']

In [59]:
fn = foo.split("/")[-1].replace("zip", "csv")
fn

'SR2020.csv'

In [21]:
bucket = Path("../data/notebooks")
if not bucket.exists():
    bucket.mkdir(parents=True)
with requests.get(url=foo, stream=True, timeout=4) as dump:
    dump.raise_for_status()
    fpath = bucket / foo.split("/")[-1]
    with open(fpath, "wb") as f:
        for chunk in dump.iter_content(chunk_size=512 * 1024):
            f.write(chunk)

`zipfile` is a low-level library to manipulate compressed files

In [22]:
import zipfile

In [24]:
fpath = bucket / foo.split("/")[-1]
with zipfile.ZipFile(file=fpath, mode="r") as zip_ref:
    zip_ref.extractall(bucket)

`shutil` has higher-level functions in `make_archive` and `unpack_archive` to handle most general cases

In [25]:
import shutil

In [26]:
shutil.unpack_archive(filename=fpath, extract_dir=Path.cwd())

In [None]:
def fetch_from_url(url: str, fname: str):
    with requests.get(url=url, stream=True, timeout=5) as resource_dump_data:
        resource_dump_data.raise_for_status()

In [1]:
import tempfile

In [2]:
with tempfile.TemporaryDirectory() as tmpdir:
    print("temp dir:", tmpdir)
    print("type: ", type(tmpdir))

temp dir: /tmp/tmpqo1warik
type:  <class 'str'>


In [3]:
import pandas as pd

In [41]:
df = pd.read_csv("../tests/resources/SR2020.csv", nrows=100)

In [5]:
df.dtypes

Creation Date                   object
Status                          object
First 3 Chars of Postal Code    object
Intersection Street 1           object
Intersection Street 2           object
Ward                            object
Service Request Type            object
Division                        object
Section                         object
dtype: object

In [6]:
df.head()

Unnamed: 0,Creation Date,Status,First 3 Chars of Postal Code,Intersection Street 1,Intersection Street 2,Ward,Service Request Type,Division,Section
0,2020-01-01 00:01:46.0000000,Closed,Intersection,Chaplin Cres,Davisville Ave,Toronto-St. Paul's (12),INJUR/DIST DOMESTIC,Municipal Licensing & Standards,Toronto Animal Services
1,2020-01-01 01:36:56.0000000,Closed,M4C,,,Beaches-East York (19),Sewer Service Line-Blocked,Toronto Water,District Ops
2,2020-01-01 02:42:16.0000000,Closed,M9A,,,Etobicoke-Lakeshore (03),ENF/INVEST MUZZLE,Municipal Licensing & Standards,Toronto Animal Services
3,2020-01-01 02:52:29.0000000,Closed,M6C,,,Toronto-St. Paul's (12),Hydrant-Damage,Toronto Water,District Ops
4,2020-01-01 02:56:34.0000000,Closed,Intersection,Don Mills Rd,Sheppard Ave E,Don Valley North (17),Traffic Signal Maintenance,Transportation Services,TMC


In [42]:
df["creation_datetime"] = pd.to_datetime(df["Creation Date"])
df["creation_datetime"].dtype

dtype('<M8[ns]')

In [9]:
import re

In [8]:
df["Ward"].sample(10)

23        Don Valley East (16)
21       Don Valley North (17)
66     Parkdale-High Park (04)
68      Beaches-East York (19)
74        Don Valley West (15)
55     Toronto-St. Paul's (12)
77        Don Valley West (15)
98     Toronto-St. Paul's (12)
71    Etobicoke-Lakeshore (03)
83    Etobicoke-Lakeshore (03)
Name: Ward, dtype: object

In [13]:
wardname = "Valley East (16)"
idx = wardname.index("(")
ward_id = wardname[idx + 1 : idx + 3]
ward_id

'16'

In [15]:
p = re.compile(".*\(([0-9]{2})\)")
m = p.match(wardname)
m.group(1)

'16'

In [17]:
re.match(pattern=".*\(([0-9]{2})\)", string=wardname).group(1)

'16'

In [19]:
def extract_ward_id(ward: str):
    return int(re.match(pattern=".*\(([0-9]{2})\)", string=ward).group(1))

In [20]:
%%timeit
df["Ward"].apply(extract_ward_id)

282 µs ± 2.78 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [21]:
def extract_idx(ward: str):
    idx = ward.index("(")
    return int(ward[idx + 1 : idx + 3])

In [22]:
%%timeit
df["Ward"].apply(extract_idx)

161 µs ± 972 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [35]:
def extract_name_id(ward: str):
    idx = ward.index("(")
    ward_name = ward[: idx - 1]
    ward_id = int(ward[idx + 1 : idx + 3])
    return pd.Series([ward_name, ward_id], ["ward_name", "ward_id"])

In [33]:
df["Ward"].apply(extract_name_id)

TypeError: Field elements must be 2- or 3-tuples, got ''string''

In [30]:
df.apply(lambda row: extract_name_id(row["Ward"]), result_type="expand", axis=1)

Unnamed: 0,ward_name,ward_id
0,Toronto-St. Paul's,12
1,Beaches-East York,19
2,Etobicoke-Lakeshore,3
3,Toronto-St. Paul's,12
4,Don Valley North,17
...,...,...
95,Davenport,9
96,Parkdale-High Park,4
97,Toronto-Danforth,14
98,Toronto-St. Paul's,12


In [43]:
ward_ids = (
    df["Ward"].apply(extract_name_id).astype({"ward_name": "string", "ward_id": "Int8"})
)
df_drop = df.drop(columns=["Creation Date", "Ward"]).astype("string")

In [46]:
df_union = pd.concat([df_drop, ward_ids], axis=1)
df_union.head()

Unnamed: 0,Status,First 3 Chars of Postal Code,Intersection Street 1,Intersection Street 2,Service Request Type,Division,Section,creation_datetime,ward_name,ward_id
0,Closed,Intersection,Chaplin Cres,Davisville Ave,INJUR/DIST DOMESTIC,Municipal Licensing & Standards,Toronto Animal Services,2020-01-01 00:01:46,Toronto-St. Paul's,12
1,Closed,M4C,,,Sewer Service Line-Blocked,Toronto Water,District Ops,2020-01-01 01:36:56,Beaches-East York,19
2,Closed,M9A,,,ENF/INVEST MUZZLE,Municipal Licensing & Standards,Toronto Animal Services,2020-01-01 02:42:16,Etobicoke-Lakeshore,3
3,Closed,M6C,,,Hydrant-Damage,Toronto Water,District Ops,2020-01-01 02:52:29,Toronto-St. Paul's,12
4,Closed,Intersection,Don Mills Rd,Sheppard Ave E,Traffic Signal Maintenance,Transportation Services,TMC,2020-01-01 02:56:34,Don Valley North,17


In [47]:
df_union.dtypes

Status                          string
First 3 Chars of Postal Code    string
Intersection Street 1           string
Intersection Street 2           string
Service Request Type            string
Division                        string
Section                         string
creation_datetime               string
ward_name                       string
ward_id                           Int8
dtype: object

In [39]:
df_union["ward_id"] = df_union["ward_id"].astype("Int8")
df_union.dtypes

Status                          string
First 3 Chars of Postal Code    string
Intersection Street 1           string
Intersection Street 2           string
Service Request Type            string
Division                        string
Section                         string
ward_name                       object
ward_id                           Int8
dtype: object

In [62]:
df_read = pd.read_parquet("../data/notebooks/SR2020.parquet")
df_read.head()

Unnamed: 0,Status,First 3 Chars of Postal Code,Intersection Street 1,Intersection Street 2,Service Request Type,Division,Section,creation_datetime,ward_name,ward_id
0,Closed,Intersection,Chaplin Cres,Davisville Ave,INJUR/DIST DOMESTIC,Municipal Licensing & Standards,Toronto Animal Services,2020-01-01 00:01:46,Toronto-St. Paul's,12
1,Closed,M4C,,,Sewer Service Line-Blocked,Toronto Water,District Ops,2020-01-01 01:36:56,Beaches-East York,19
2,Closed,M9A,,,ENF/INVEST MUZZLE,Municipal Licensing & Standards,Toronto Animal Services,2020-01-01 02:42:16,Etobicoke-Lakeshore,3
3,Closed,M6C,,,Hydrant-Damage,Toronto Water,District Ops,2020-01-01 02:52:29,Toronto-St. Paul's,12
4,Closed,Intersection,Don Mills Rd,Sheppard Ave E,Traffic Signal Maintenance,Transportation Services,TMC,2020-01-01 02:56:34,Don Valley North,17


In [64]:
df_read.dtypes

Status                          string
First 3 Chars of Postal Code    string
Intersection Street 1           string
Intersection Street 2           string
Service Request Type            string
Division                        string
Section                         string
creation_datetime               string
ward_name                       string
ward_id                           Int8
dtype: object

In [65]:
df_read["ward_id"].dtype

Int8Dtype()