In [14]:
import os
import random
import re
import requests
import time

In [15]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [16]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [17]:
# Load the state boundary shapefile into a GeoDataFrame.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [18]:
# Keep only the postal code, the full name and the geometry of each state.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [19]:
# sheet_name=None makes pandas return a dict of DataFrames keyed by sheet name;
# the sheet that is actually used is picked out of that dict afterwards.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    sheet_name=None,
    engine="openpyxl",
)

In [20]:
# The workbook is read with its title row as the header, so the two columns of
# interest carry these auto-generated labels instead of meaningful names.
name_header = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
population_header = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2024-POP"][[name_header, population_header]]
    .rename(columns={name_header: "NAME", population_header: "POPULATION"})
    # Remove only the leading dots that mark sub-parts. Slicing with .str[1:]
    # would also chop the first letter off any name that has no leading dot.
    .assign(NAME=lambda df: df["NAME"].str.lstrip("."))
)

In [21]:
# Left join keeps every state geometry, even when no population row matches
# its NAME; the trailing selection fixes the column order.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get Subway Data

In [22]:
def get_coords(url: str) -> Point:
    """Fetch a store page and return its location as ``Point(lon, lat)``.

    The page is downloaded with the module-level ``scraper`` and the
    coordinates are read from the ``content`` attribute of the
    ``<meta itemprop="latitude">`` and ``<meta itemprop="longitude">`` tags.

    Args:
        url: Absolute URL of a single store page.

    Returns:
        A shapely ``Point`` built as (longitude, latitude).

    Raises:
        ValueError: If either meta tag (or its ``content`` attribute) is
            missing, or if the content cannot be parsed as a float.
    """
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    coords = {}
    for axis in ("latitude", "longitude"):
        meta = soup.find("meta", {"itemprop": axis})
        # soup.find returns None when the tag is absent; fail with a message
        # that names the page instead of an AttributeError on None.
        if meta is None or "content" not in meta.attrs:
            raise ValueError(
                f"No {axis} meta tag found at {url} (HTTP {r.status_code})"
            )
        coords[axis] = float(meta.attrs["content"])

    # Point takes (x, y), i.e. longitude first.
    return Point(coords["longitude"], coords["latitude"])

In [23]:
# Base URL that the relative links found in the store directory are resolved against.
url_root = "https://restaurants.subway.com"
# Shared HTTP session used for every request in this notebook.
scraper = cloudscraper.create_scraper()

In [24]:
count_dict = {}  # not populated by this cell; kept so the name stays defined
store_list = []

# Lower-cased STUSPS codes that the scrape skips.
skipped_state_codes = {"pr", "as", "vi", "gu", "mp"}


def _fetch_store(store_url, state_code):
    """Return a ``{"STATE", "geometry"}`` record for one store page, or ``None``.

    Any error raised while fetching or parsing the page is printed together
    with the offending URL, so a single bad page does not abort the scrape.
    """
    try:
        point = get_coords(store_url)
    except Exception as exc:
        print(f"{store_url} ({exc!r})")
        return None
    return {"STATE": state_code, "geometry": point}


# The per-state output directory must exist before the first to_file() call;
# otherwise a whole state is scraped and then lost when the write fails.
os.makedirs("data/states", exist_ok=True)

for state_abbr, state_name in tqdm(
    zip(states_df["STUSPS"], states_df["NAME"]),
    total=len(states_df),
    desc="Parsing States",
):
    state_store_list = []
    state_code = state_abbr.lower()

    # Skip Territories
    if state_code in skipped_state_codes:
        continue

    # A saved file means this state was already scraped; do not fetch it again
    state_file = f"data/states/{state_code}.gpkg"
    if os.path.isfile(state_file):
        continue

    state_url = f"{url_root}/united-states/{state_code}"
    state_r = scraper.get(state_url)

    if state_r.status_code != 200:
        print(state_name)
        print(state_url)
        continue

    soup = BeautifulSoup(state_r.text, "html.parser")
    locations_as = soup.find_all("a", {"class": "Directory-listLink"})

    # Parse all locations in a state
    for locations_a in tqdm(locations_as, desc=f"Parsing Locations In {state_code}"):
        href = locations_a.attrs["href"]
        # data-count holds the number of stores behind this directory link
        count = int(re.findall(r"\d+", locations_a.attrs["data-count"])[0])
        location_url = href.replace("..", url_root)

        if count == 1:
            # The link points straight at the single store page
            store_dict = _fetch_store(location_url, state_code)
            if store_dict is not None:
                store_list.append(store_dict)
                state_store_list.append(store_dict)
            continue

        r_location = scraper.get(location_url)
        if r_location.status_code != 200:
            # Same reporting as for a failed state page: log the URL and move on
            print(location_url)
            continue

        location_soup = BeautifulSoup(r_location.text, "html.parser")
        locations_divs = location_soup.find_all(
            "div", {"class": "Teaser-innerWrapper"}
        )

        # Multiple stores in locations, parse the stores per location
        for location_div in locations_divs:
            location_a = location_div.find("a")
            if location_a is None or "href" not in location_a.attrs:
                print(f"{location_url} (teaser without a store link)")
            else:
                store_url = location_a.attrs["href"].replace("../..", url_root)
                store_dict = _fetch_store(store_url, state_code)
                if store_dict is not None:
                    store_list.append(store_dict)
                    state_store_list.append(store_dict)
            # Randomized pause between store requests
            time.sleep(random.uniform(0.01, 0.5))

    # Only write a file when at least one store was collected, so that an
    # empty result is retried on the next run instead of being cached.
    if state_store_list:
        subway_state_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        subway_state_gdf.to_file(state_file)

    time.sleep(random.uniform(0.01, 0.25))

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Locations In ms:   0%|          | 0/142 [00:00<?, ?it/s]

Parsing Locations In nc:   0%|          | 0/302 [00:00<?, ?it/s]

Parsing Locations In ok:   0%|          | 0/163 [00:00<?, ?it/s]

Parsing Locations In va:   0%|          | 0/238 [00:00<?, ?it/s]

Parsing Locations In wv:   0%|          | 0/108 [00:00<?, ?it/s]

Parsing Locations In la:   0%|          | 0/153 [00:00<?, ?it/s]

Parsing Locations In mi:   0%|          | 0/384 [00:00<?, ?it/s]

Parsing Locations In ma:   0%|          | 0/174 [00:00<?, ?it/s]

Parsing Locations In id:   0%|          | 0/63 [00:00<?, ?it/s]

Parsing Locations In fl:   0%|          | 0/358 [00:00<?, ?it/s]

Parsing Locations In ne:   0%|          | 0/94 [00:00<?, ?it/s]

Parsing Locations In wa:   0%|          | 0/168 [00:00<?, ?it/s]

Parsing Locations In nm:   0%|          | 0/57 [00:00<?, ?it/s]

Parsing Locations In sd:   0%|          | 0/55 [00:00<?, ?it/s]

Parsing Locations In tx:   0%|          | 0/574 [00:00<?, ?it/s]

Parsing Locations In ca:   0%|          | 0/570 [00:00<?, ?it/s]

Parsing Locations In al:   0%|          | 0/206 [00:00<?, ?it/s]

Parsing Locations In ga:   0%|          | 0/280 [00:00<?, ?it/s]

Parsing Locations In pa:   0%|          | 0/373 [00:00<?, ?it/s]

Parsing Locations In mo:   0%|          | 0/249 [00:00<?, ?it/s]

Parsing Locations In co:   0%|          | 0/136 [00:00<?, ?it/s]

Parsing Locations In ut:   0%|          | 0/99 [00:00<?, ?it/s]

Parsing Locations In tn:   0%|          | 0/220 [00:00<?, ?it/s]

Parsing Locations In wy:   0%|          | 0/34 [00:00<?, ?it/s]

Parsing Locations In ny:   0%|          | 0/384 [00:00<?, ?it/s]

Parsing Locations In ks:   0%|          | 0/119 [00:00<?, ?it/s]

Parsing Locations In ak:   0%|          | 0/23 [00:00<?, ?it/s]

Parsing Locations In nv:   0%|          | 0/36 [00:00<?, ?it/s]

Parsing Locations In il:   0%|          | 0/434 [00:00<?, ?it/s]

Parsing Locations In vt:   0%|          | 0/36 [00:00<?, ?it/s]

Parsing Locations In mt:   0%|          | 0/44 [00:00<?, ?it/s]

Parsing Locations In ia:   0%|          | 0/174 [00:00<?, ?it/s]

Parsing Locations In sc:   0%|          | 0/151 [00:00<?, ?it/s]

Parsing Locations In nh:   0%|          | 0/59 [00:00<?, ?it/s]

Parsing Locations In az:   0%|          | 0/98 [00:00<?, ?it/s]

Parsing Locations In dc:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In nj:   0%|          | 0/125 [00:00<?, ?it/s]

Parsing Locations In md:   0%|          | 0/173 [00:00<?, ?it/s]

Parsing Locations In me:   0%|          | 0/81 [00:00<?, ?it/s]

Parsing Locations In hi:   0%|          | 0/35 [00:00<?, ?it/s]

Parsing Locations In de:   0%|          | 0/18 [00:00<?, ?it/s]

Parsing Locations In ri:   0%|          | 0/29 [00:00<?, ?it/s]

Parsing Locations In ky:   0%|          | 0/182 [00:00<?, ?it/s]

Parsing Locations In oh:   0%|          | 0/441 [00:00<?, ?it/s]

Parsing Locations In wi:   0%|          | 0/267 [00:00<?, ?it/s]

Parsing Locations In or:   0%|          | 0/113 [00:00<?, ?it/s]

Parsing Locations In nd:   0%|          | 0/34 [00:00<?, ?it/s]

Parsing Locations In ar:   0%|          | 0/138 [00:00<?, ?it/s]

Parsing Locations In in:   0%|          | 0/253 [00:00<?, ?it/s]

Parsing Locations In mn:   0%|          | 0/241 [00:00<?, ?it/s]

Parsing Locations In ct:   0%|          | 0/128 [00:00<?, ?it/s]

In [46]:
subway_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated result reproducible between runs and machines.
for file in sorted(os.listdir(states_path)):
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    subway_gdfs.append(gpd.read_file(constructed_path))

In [47]:
# pd.concat fails with a generic message on an empty list; say what is missing.
if not subway_gdfs:
    raise ValueError(
        "No per-state .gpkg files were loaded from data/states; "
        "run the scraping cell first."
    )

# Stack all per-state frames, reproject to EPSG:9311 and save the combined layer.
subway_gdf = gpd.GeoDataFrame(pd.concat(subway_gdfs, ignore_index=True))
subway_gdf = subway_gdf.to_crs(9311)
subway_gdf.to_file("data/stores.gpkg")

In [48]:
# Number of store rows per STATE value (the index holds the state codes).
subway_by_state_series = subway_gdf.groupby("STATE").size()

In [49]:
# Turn the per-state counts into a two-column frame: the index becomes STUSPS
# and the counts become COUNT.
subways_by_state_df = subway_by_state_series.rename_axis("STUSPS").reset_index(
    name="COUNT"
)

In [50]:
# Upper-case the state codes so they can be joined on the upper-case STUSPS
# column of the state table.
subways_by_state_df = subways_by_state_df.assign(
    STUSPS=subways_by_state_df["STUSPS"].str.upper()
)

In [51]:
# Inner join: only states that have a store count are kept.
subways_by_states_gdf = states_with_population_df.merge(
    right=subways_by_state_df,
    how="inner",
    on="STUSPS",
)

In [52]:
# Rows without a population cannot yield a rate, and a NaN rate makes the
# integer casts below raise. Drop them before computing anything.
subways_by_states_gdf = subways_by_states_gdf.dropna(subset=["POPULATION"]).copy()

# Output column -> (people per unit, whether to round to a whole-number int).
per_capita_columns = {
    "per_1000": (1000, False),
    "per_10k": (10_000, False),
    "per_100k": (100000, True),
    "per_500k": (500_000, False),
    "per_1m": (1_000_000, True),
}

for column, (people_per_unit, as_whole_number) in per_capita_columns.items():
    # Stores per `people_per_unit` inhabitants.
    rate = subways_by_states_gdf["COUNT"] / (
        subways_by_states_gdf["POPULATION"] / people_per_unit
    )
    if as_whole_number:
        rate = rate.round(decimals=0).astype(int)
    subways_by_states_gdf[column] = rate

In [53]:
# Discard every row that still has a missing value in any column.
subways_by_states_gdf = subways_by_states_gdf.dropna(axis=0, how="any")

In [54]:
# Reproject to EPSG:9311 (the same CRS used for the store layer) and save.
subways_by_states_gdf = subways_by_states_gdf.to_crs(9311)
# Plain string literal: there is nothing to interpolate in this path.
subways_by_states_gdf.to_file("data/stores_by_states.gpkg")