# 01 · Data Extraction & Transformation — Toronto 311 (Customer-Initiated)
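Pull the yearly 2021–2025 YTD archives from a fixed list of Toronto Open Data (CKAN) download URLs, clean and aggregate them in chunks, and export tidy CSVs.
**Defaults:** `Canceled` statuses are *not* excluded yet — the status column is detected but no filter is applied, so the exported counts include canceled requests and will overstate demand until that exclusion is added.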

In [28]:
import pandas as pd
from collections import defaultdict

from pathlib import Path
# Output directory for the exported CSVs; created when this cell runs if it
# does not exist yet.
DATA_DIR = Path("data")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# One ZIP archive per year, 2021 through 2025. The 2021 file is served from
# toronto.ca; the later years come from the CKAN open-data host.
URLS = [
    "https://www.toronto.ca/data/311/opendata/servicerequest/SR2021.zip",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f00a3313-f074-463e-89a7-26563084fbef/download/sr2022.zip",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/079766f3-815d-4257-8731-5ff6b0c84c13/download/sr2023.zip",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f46b640d-d465-4f8b-9db5-5000a08295cd/download/sr2024.zip",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f3db05ab-2588-4159-89f7-56c74d1d8201/download/sr2025.zip"
]

# Some years use slightly different column labels — handle that here.
# Each list is ordered by preference: pick_col() returns the first candidate
# that appears in a file's header (compared lowercased and whitespace-stripped).
DATE_CANDS   = ["Service Request Creation Date and Time", "Creation Date", "SR Creation Date and Time"]
TYPE_CANDS   = ["Original Service Request Type", "Service Request Type", "Original Problem Type"]
STATUS_CANDS = ["Service Request Status", "Status"]
DIV_CANDS    = ["Service Request Division", "Division"]
WARD_CANDS   = ["Service Request Ward", "Ward"]

# Rows per chunk when streaming each CSV (passed to read_csv as chunksize).
CHUNK = 100_000  # adjust if you like

def pick_col(cols, candidates):
    """Return the actual column label matching the first usable candidate.

    Labels are compared after lowercasing and stripping surrounding
    whitespace. Candidates are tried in the order given, and the label is
    returned exactly as it appears in *cols*. When two labels in *cols*
    normalise to the same key, the later one is the one returned.

    Returns ``None`` when no candidate matches.
    """
    # normalised label -> original label (later duplicates overwrite earlier)
    lookup = {}
    for label in cols:
        lookup[label.lower().strip()] = label

    # Lazily walk the candidates so only as many as needed are normalised.
    wanted_keys = (wanted.lower().strip() for wanted in candidates)
    matches = (lookup[key] for key in wanted_keys if key in lookup)
    return next(matches, None)

def stream_zip_one(url: str, encoding: str = "cp1252", exclude_statuses=()):
    """
    Stream one single-CSV ZIP in chunks and aggregate service-request counts.

    The archive is read twice with pandas (compression='zip'): a 5-row peek to
    resolve the real column labels, then a chunked pass (CHUNK rows at a time)
    that only loads the columns needed for the aggregates.

    Parameters
    ----------
    url : str
        Location of the ZIP archive; anything ``pd.read_csv`` accepts.
    encoding : str, default "cp1252"
        Text encoding used for BOTH reads, so the labels detected in the peek
        are the same labels seen by the chunked pass.
    exclude_statuses : iterable of str (or a single str), default ()
        Status values to drop before counting, compared case-insensitively
        after stripping whitespace. With the default, nothing is filtered and
        canceled requests are counted. Ignored when the file has no status
        column.

    Returns
    -------
    Three small DataFrames:
      - counts_y_t         (year x type)                    columns: year, type, n
      - counts_y_t_d_w     (year x type x division x ward)  [empty if cols missing]
      - daily              (day totals)                     columns: day, n

    Rows with a missing or unparseable creation date, or a missing or blank
    type, are skipped. Files with no usable rows yield empty frames that still
    carry the expected columns.

    Raises
    ------
    ValueError
        If no date column or no type column can be found in the header.
    """
    agg_year_type = defaultdict(int)
    agg_year_type_div_ward = defaultdict(int)
    agg_daily = defaultdict(int)

    # First tiny peek to detect actual column names quickly.
    head = pd.read_csv(
        url,
        compression="zip",
        nrows=5,
        low_memory=False,
        on_bad_lines="skip",
        encoding=encoding,
    )
    # Work with whitespace-stripped labels everywhere below.
    header = [str(c).strip() for c in head.columns]
    date_col   = pick_col(header, DATE_CANDS)
    type_col   = pick_col(header, TYPE_CANDS)
    status_col = pick_col(header, STATUS_CANDS)
    div_col    = pick_col(header, DIV_CANDS)
    ward_col   = pick_col(header, WARD_CANDS)

    if not date_col or not type_col:
        raise ValueError(f"Required columns not found in {url}\nFound: {header}")

    # A bare string would otherwise be iterated character by character.
    if isinstance(exclude_statuses, str):
        exclude_statuses = [exclude_statuses]
    drop_statuses = {str(s).strip().casefold() for s in exclude_statuses}
    filter_status = bool(drop_statuses and status_col)

    # The detailed breakdown needs both division and ward.
    has_detail = bool(div_col and ward_col)

    wanted = {date_col, type_col}
    wanted.update(c for c in (div_col, ward_col) if c)
    if filter_status:
        wanted.add(status_col)  # only load the status column when it is used

    # Everything is read as text: dtype inference is per chunk, so a numeric
    # looking ward/division could otherwise become "5" in one chunk and "5.0"
    # in another and split the same group across two keys. Dates are parsed
    # explicitly below instead of via parse_dates, which would need the raw
    # (possibly whitespace-padded) label and leaves the column unparsed when
    # any value is malformed.
    with pd.read_csv(
        url,
        compression="zip",
        usecols=lambda c: isinstance(c, str) and c.strip() in wanted,
        dtype=str,
        on_bad_lines="skip",
        low_memory=False,
        chunksize=CHUNK,
        encoding=encoding,
    ) as reader:  # context manager closes the underlying handle on any exit
        for chunk in reader:
            chunk = chunk.rename(columns=str.strip)

            dates = pd.to_datetime(chunk[date_col], errors="coerce")
            types = chunk[type_col].str.strip()

            # Keep rows that have a parseable date and a non-blank type.
            keep = dates.notna() & types.notna() & types.ne("")
            if filter_status:
                status = chunk[status_col].fillna("").str.strip().str.casefold()
                keep &= ~status.isin(drop_statuses)

            tidy = pd.DataFrame({
                "year": dates[keep].dt.year,
                "type": types[keep],
                "day": dates[keep].dt.date,
            })

            for (yr, typ), n in tidy.groupby(["year", "type"]).size().items():
                agg_year_type[(int(yr), str(typ))] += int(n)

            # Optional detailed grouping if division/ward exist.
            if has_detail:
                # Blank out missing values so those rows are still counted
                # (groupby drops NaN keys by default).
                tidy["division"] = chunk.loc[keep, div_col].fillna("")
                tidy["ward"] = chunk.loc[keep, ward_col].fillna("")
                detail = tidy.groupby(["year", "type", "division", "ward"]).size()
                for (yr, typ, dv, wd), n in detail.items():
                    agg_year_type_div_ward[(int(yr), str(typ), str(dv), str(wd))] += int(n)

            # Daily totals (for forecasting later).
            for d, n in tidy.groupby("day").size().items():
                agg_daily[d] += int(n)

    # Build the frames from plain rows so an empty aggregate still produces a
    # frame with the right columns instead of failing on a non-MultiIndex.
    counts_y_t = pd.DataFrame(
        [(yr, typ, n) for (yr, typ), n in agg_year_type.items()],
        columns=["year", "type", "n"],
    ).sort_values(["year", "n"], ascending=[True, False])

    counts_y_t_d_w = pd.DataFrame(
        [(yr, typ, dv, wd, n) for (yr, typ, dv, wd), n in agg_year_type_div_ward.items()],
        columns=["year", "type", "division", "ward", "n"],
    ).sort_values(["year", "n"], ascending=[True, False])

    daily = pd.DataFrame(
        list(agg_daily.items()),
        columns=["day", "n"],
    ).sort_values("day")

    return counts_y_t, counts_y_t_d_w, daily

# ---- Process every yearly archive, then merge the per-file aggregates ----
all_counts, all_counts_detailed, all_daily = [], [], []
for u in URLS:
    print("Reading:", u)
    c1, c2, dly = stream_zip_one(u)
    all_counts.append(c1)
    all_daily.append(dly)
    # Files without division/ward columns contribute no detailed rows.
    if not c2.empty:
        all_counts_detailed.append(c2)

# Shared ordering: by year, then busiest request types first.
_year_then_volume = {"by": ["year", "n"], "ascending": [True, False]}

# A calendar year can show up in more than one file, so re-sum after stacking.
df_counts = pd.concat(all_counts, ignore_index=True)
df_counts = df_counts.groupby(["year", "type"], as_index=False)["n"].sum()
df_counts = df_counts.sort_values(**_year_then_volume)

daily = pd.concat(all_daily, ignore_index=True)
daily = daily.groupby("day", as_index=False)["n"].sum()
daily = daily.sort_values("day")

if not all_counts_detailed:
    df_counts_detailed = pd.DataFrame(columns=["year","type","division","ward","n"])
else:
    df_counts_detailed = pd.concat(all_counts_detailed, ignore_index=True)
    df_counts_detailed = df_counts_detailed.groupby(
        ["year", "type", "division", "ward"], as_index=False
    )["n"].sum()
    df_counts_detailed = df_counts_detailed.sort_values(**_year_then_volume)

# Top 15 per year — relies on df_counts already being sorted by volume
# within each year, so head() takes the largest rows.
TOP_N = 15
top_by_year = df_counts.groupby("year").head(TOP_N).reset_index(drop=True)

# Save outputs (small tidy files): (file name, frame, note shown in the summary)
_outputs = [
    ("311_counts_year_type_2021_2025.csv", df_counts, ""),
    ("311_top_types_by_year_2021_2025.csv", top_by_year, ""),
    ("311_daily_totals_2021_2025.csv", daily, ""),
    ("311_counts_year_type_division_ward_2021_2025.csv", df_counts_detailed,
     " (may be empty if cols absent)"),
]
for _fname, _frame, _note in _outputs:
    _frame.to_csv(DATA_DIR / _fname, index=False)


print("Wrote:")
for _fname, _frame, _note in _outputs:
    print(f" - {_fname}{_note}")

Reading: https://www.toronto.ca/data/311/opendata/servicerequest/SR2021.zip
Reading: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f00a3313-f074-463e-89a7-26563084fbef/download/sr2022.zip
Reading: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/079766f3-815d-4257-8731-5ff6b0c84c13/download/sr2023.zip
Reading: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f46b640d-d465-4f8b-9db5-5000a08295cd/download/sr2024.zip
Reading: https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2e54bc0e-4399-4076-b717-351df5918ae7/resource/f3db05ab-2588-4159-89f7-56c74d1d8201/download/sr2025.zip
Wrote:
 - 311_counts_year_type_2021_2025.csv
 - 311_top_types_by_year_2021_2025.csv
 - 311_daily_totals_2021_2025.csv
 - 311_counts_year_type_division_ward_2021_2025.csv (may be empty if cols absent)
