In [28]:
import os
import random
import json
import time

In [29]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [30]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get Province and Territory Boundary Data

In [31]:
# Read the boundary shapefile into a GeoDataFrame.
file_path = "data/lecu000e21a_e/lpr_000e21a_e.shp"
states_df = gpd.read_file(filename=file_path)

In [32]:
# Keep only the name and geometry columns, and expose the name as "Geography".
# NOTE: this cell rebinds states_df, so re-running it without re-running the
# load cell fails because "PRENAME" no longer exists.
states_df = states_df[["PRENAME", "geometry"]].rename(
    columns={"PRENAME": "Geography"}
)

## Get Population Data

In [33]:
# Population table read from a local CSV export.
population_csv = "data/1710000901-noSymbol.csv"
state_populations = pd.read_csv(population_csv)

In [34]:
# Attach the population table to the boundaries (left join keeps every
# boundary row) and expose the "Q1 2025" column as POPULATION.
states_population_gdf = states_df.merge(
    state_populations, how="left", on="Geography"
).rename(columns={"Q1 2025": "POPULATION"})

# Strip the thousands separators from the string values, then cast to int.
population_digits = states_population_gdf["POPULATION"].str.replace(",", "")
states_population_gdf["POPULATION"] = population_digits.astype(int)

## Scrape Store Locations

In [35]:
# Shared cloudscraper client; get_store_point() and get_urls() both fetch pages through it.
scraper = cloudscraper.create_scraper()

In [36]:
def get_store_point(store_url: str) -> Point:
    """Fetch one store page and return its location as a shapely ``Point``.

    The coordinates are read from the first
    ``<script type="application/ld+json">`` block on the page, using its
    ``geo.longitude`` and ``geo.latitude`` entries.

    Args:
        store_url: URL of a single store page.

    Returns:
        ``Point(longitude, latitude)``, i.e. x is longitude and y is latitude.

    Raises:
        ValueError: If the page contains no JSON-LD script block, or a
            coordinate cannot be converted to ``float``.
        json.JSONDecodeError: If the JSON-LD block is not valid JSON.
        KeyError: If the JSON-LD object has no ``geo`` coordinates.
    """
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")

    # soup.find() returns None when the tag is absent (e.g. an error or
    # challenge page); fail with a message that names the URL instead of an
    # opaque AttributeError on None.
    ld_json_tag = soup.find("script", {"type": "application/ld+json"})
    if ld_json_tag is None:
        raise ValueError(
            f"No application/ld+json block found (HTTP {r.status_code}): {store_url}"
        )

    store_dict = json.loads(ld_json_tag.text)
    point = Point(
        float(store_dict["geo"]["longitude"]),
        float(store_dict["geo"]["latitude"]),
    )
    return point

In [37]:
def get_urls(url: str) -> list:
    """Return absolute URLs for every directory link on a listing page.

    Fetches ``url``, collects each ``<a class="directory_links">`` element and
    prefixes its ``href`` with ``https://locations.timhortons.ca``.

    Args:
        url: URL of a directory page.

    Returns:
        A list of URL strings in document order; empty when the page has no
        matching links.
    """
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    return [
        "https://locations.timhortons.ca" + a.attrs["href"]
        for a in soup.find_all("a", {"class": "directory_links"})
        # Skip anchors with a missing or empty href instead of raising
        # KeyError and aborting the whole crawl.
        if a.get("href")
    ]

In [38]:
state_urls = get_urls("https://locations.timhortons.ca/en/")

# Create the per-state cache directory up front; without it the first
# to_file() call fails only after an entire state has been scraped.
os.makedirs("data/states", exist_ok=True)

# Parse all locations in a state
for state_url in tqdm(state_urls, desc="Parsing States"):
    state_store_list = []
    # The second-to-last "/"-separated piece of the URL is used as the code
    # (assumes the URL ends with a trailing slash - TODO confirm).
    state_code = state_url.split("/")[-2].replace("-", " ").upper()

    # If File exists continue (lets an interrupted run resume where it stopped)
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    city_urls = get_urls(state_url)

    if not city_urls:
        print(state_code)
        continue

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_code}"):
        store_urls = get_urls(city_url)
        if not store_urls:
            print("Error getting city url", city_url)
            continue

        for store_url in store_urls:
            try:
                point = get_store_point(store_url)
            except Exception as e:
                # Best effort: report the failing store and move on.
                print("Error getting store url", store_url, str(e))
            else:
                store_dict = {"geometry": point, "STUSPS": state_code, "url": store_url}
                state_store_list.append(store_dict)
            # Throttle after every request, including failed ones, so a run of
            # errors does not hit the site back-to-back.
            time.sleep(random.uniform(0.01, 0.5))

    # Only write a file when at least one store was collected for this state.
    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/13 [00:00<?, ?it/s]

In [39]:
# Read every cached per-state GeoPackage back into memory.
states_path = "data/states"
store_gdfs = [
    gpd.read_file(os.path.join(states_path, entry))
    for entry in os.listdir("data/states")
    if os.path.join(states_path, entry).endswith(".gpkg")
]

In [40]:
# Stack the per-file frames into a single frame; ignore_index=True gives it a fresh 0..n-1 index.
gdf = pd.concat(store_gdfs, ignore_index=True)

In [41]:
# Reproject the store points to EPSG:3348 and save the combined layer.
gdf = gdf.to_crs(epsg=3348)
gdf.to_file("data/stores.gpkg")

In [50]:
# Lower-case the codes so they match the lower-case values in province_ref_dict ("on", "qc", ...).
gdf["STUSPS"] = gdf["STUSPS"].str.lower()

In [51]:
# Number of store rows per STUSPS code; the resulting Series is indexed by the code.
th_by_state_series = gdf.groupby("STUSPS").size()

In [52]:
# Turn the per-code counts into a two-column frame: STUSPS (the code) and
# COUNT (number of stores).
tm_by_state_df = th_by_state_series.rename_axis("STUSPS").reset_index(name="COUNT")

## Merge Data

In [54]:
# Lookup from the full province/territory name (the "Geography" value) to the
# lower-case two-letter code stored in the STUSPS column.
province_ref_dict = dict(
    [
        ("Newfoundland and Labrador", "nl"),
        ("Prince Edward Island", "pe"),
        ("Nova Scotia", "ns"),
        ("New Brunswick", "nb"),
        ("Quebec", "qc"),
        ("Ontario", "on"),
        ("Manitoba", "mb"),
        ("Saskatchewan", "sk"),
        ("Alberta", "ab"),
        ("British Columbia", "bc"),
        ("Yukon", "yt"),
        ("Northwest Territories", "nt"),
        ("Nunavut", "nu"),
    ]
)

In [55]:
# Two-column lookup frame: full name ("Geography") and its code ("STUSPS").
ref_df = pd.DataFrame(
    {
        "Geography": list(province_ref_dict.keys()),
        "STUSPS": list(province_ref_dict.values()),
    }
)

In [56]:
# Attach the two-letter code to every province first, then the store counts.
# Joining in this order keeps STUSPS populated even for a province with no
# scraped stores; only COUNT is left missing for such rows. Both joins are
# left joins, so every boundary row is kept and the resulting columns are the
# boundary/population columns followed by STUSPS and COUNT.
count_gdf = states_population_gdf.merge(
    ref_df,
    on="Geography",
    how="left",
).merge(
    tm_by_state_df,
    on="STUSPS",
    how="left",
)

In [57]:
# Replace missing values with 0 so rows without a store count get COUNT == 0.
# NOTE(review): this fills every column of the frame, not only COUNT.
count_gdf = count_gdf.fillna(0)

In [59]:
# Stores per 100,000 and per 1,000,000 residents, rounded to 2 and 1 decimals.
store_counts = count_gdf["COUNT"]
residents = count_gdf["POPULATION"]
count_gdf["per_100k"] = store_counts.div(residents.div(100000)).round(2)
count_gdf["per_1m"] = store_counts.div(residents.div(1_000_000)).round(1)

In [60]:
# Reproject the province polygons to EPSG:3348.
count_gdf = count_gdf.to_crs(epsg=3348)

In [61]:
# Persist the per-province result as a GeoPackage.
count_gdf.to_file("data/Tim_Hortons_Per_Province.gpkg", driver="GPKG")

In [63]:
# Summary table, highest stores-per-100k first.
summary_columns = ["COUNT", "per_100k", "per_1m", "STUSPS"]
count_gdf[summary_columns].sort_values(by="per_100k", ascending=False)

Unnamed: 0,COUNT,per_100k,per_1m,STUSPS
12,12,28.98,289.8,nu
2,163,15.1,151.0,ns
3,112,13.04,130.4,nb
1,23,12.83,128.3,pe
0,68,12.46,124.6,nl
5,1979,12.23,122.3,on
8,434,8.75,87.5,ab
6,124,8.24,82.4,mb
7,100,7.99,79.9,sk
4,619,6.79,67.9,qc
