In [1]:
import os
import random
import json
import time

In [2]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [5]:
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Data

In [6]:
scraper = cloudscraper.create_scraper()

In [7]:
def get_store_point(store_url: str) -> dict:
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")
    store_dict = json.loads(soup.find("script", {"type": "application/ld+json"}).text)
    point = Point(
        float(store_dict["geo"]["longitude"]),
        float(store_dict["geo"]["latitude"]),
    )
    return point

In [8]:
def get_urls(url: str) -> list:
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    return [
        "https://locations.timhortons.com" + a.attrs["href"]
        for a in soup.find_all("a", {"class": "directory_links"})
    ]

In [10]:
state_urls = get_urls("https://locations.timhortons.com/en/")

# Parse all locations in a state
for state_url in tqdm(state_urls, desc="Parsing States"):
    state_store_list = []
    state_code = state_url.split("/")[-2].replace("-", " ").upper()

    # If File exists continue
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    city_urls = get_urls(state_url)

    if not city_urls:
        print(state_code)
        continue

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_code}"):
        store_urls = get_urls(city_url)
        if not store_urls:
            print("Error getting city url", city_url)
            continue

        for store_url in store_urls:
            try:
                point = get_store_point(store_url)
                store_dict = {"geometry": point, "STUSPS": state_code, "url": store_url}
            except Exception as e:
                print("Error getting store url", store_url, str(e))
                continue
            state_store_list.append(store_dict)
            time.sleep(random.uniform(0.01, 0.5))

    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in GA:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in IN:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in KY:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in ME:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in MI:   0%|          | 0/110 [00:00<?, ?it/s]

Error getting store url https://locations.timhortons.com/en/mi/holly/4042-grange-hall-rd/ 'NoneType' object has no attribute 'text'
Error getting store url https://locations.timhortons.com/en/mi/monroe/404-s-monroe-street/ 'NoneType' object has no attribute 'text'


Parsing Locations in MO:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in NJ:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations in NY:   0%|          | 0/113 [00:00<?, ?it/s]

Parsing Locations in OH:   0%|          | 0/72 [00:00<?, ?it/s]

Parsing Locations in PA:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations in TN:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in TX:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations in VA:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in WV:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
store_gdfs = []
states_path = "data/states"
for file in os.listdir("data/states"):
    constructed_path = os.path.join(states_path, file)
    if constructed_path.endswith(".gpkg"):
        gdf = gpd.read_file(constructed_path)
        store_gdfs.append(gdf)

## Manually add the two it didn't find

In [13]:
extra_stores = [
    {
        "geometry": Point(-83.62539826489383, 42.80598630683016),
        "STUSPS": "MI",
        "url": "https://www.timhortons.com/store-locator/store/919251/4042-grange-hall-rd--holly--michigan--48442",
    },
    {
        "geometry": Point(-83.40114549202464, 41.913053485805534),
        "STUSPS": "MI",
        "url": "https://www.timhortons.com/store-locator/store/restaurant_71313",
    },
]

In [15]:
extra_stores_df = gpd.GeoDataFrame(extra_stores, crs=4326)
store_gdfs.append(extra_stores_df)

In [34]:
gdf = pd.concat(store_gdfs, ignore_index=True)

In [35]:
gdf = gdf.to_crs(32615)
gdf.to_file(f"data/stores.gpkg")

## Get Population Data

In [36]:
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [37]:
state_populations_df = state_populations["NST-EST2024-POP"][
    [
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)",
        "Unnamed: 5",
    ]
]
state_populations_df = state_populations_df.rename(
    columns={
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

In [38]:
states_with_population_df = states_df.merge(state_populations_df, on="NAME", how="left")
states_with_population_df = states_with_population_df[
    ["STUSPS", "NAME", "POPULATION", "geometry"]
]

In [39]:
states_with_population_df = states_with_population_df.dropna()

In [40]:
states_with_population_df["NAME"] = states_with_population_df["NAME"].str.lower()

## Merge Data

In [41]:
count_df = pd.DataFrame(gdf.groupby("STUSPS").size(), columns=["COUNT"]).reset_index()

In [42]:
count_gdf = states_with_population_df.merge(count_df, on="STUSPS", how="left")
count_gdf = count_gdf.fillna(0)

In [43]:
count_gdf["per_100k"] = (count_gdf["COUNT"] / (count_gdf["POPULATION"] / 100000)).round(
    decimals=2
)
count_gdf["per_1m"] = (
    count_gdf["COUNT"] / (count_gdf["POPULATION"] / 1_000_000)
).round(decimals=1)

In [44]:
count_gdf = count_gdf.to_crs(32615)

In [45]:
count_gdf.to_file("data/Tim_Hortons_Per_State.gpkg")

In [46]:
count_gdf[["COUNT", "per_100k", "per_1m", "NAME"]].sort_values(
    "per_100k", ascending=False
)

Unnamed: 0,COUNT,per_100k,per_1m,NAME
6,204.0,2.02,20.2,michigan
25,266.0,1.35,13.5,new york
44,129.0,1.09,10.9,ohio
4,10.0,0.56,5.6,west virginia
39,7.0,0.5,5.0,maine
19,19.0,0.15,1.5,pennsylvania
37,12.0,0.13,1.3,new jersey
18,6.0,0.05,0.5,georgia
43,2.0,0.04,0.4,kentucky
15,10.0,0.03,0.3,texas
