In [30]:
import re

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

## Get State Data

In [20]:
# State boundary shapefile (file name suggests the 2018 Census cartographic
# boundaries at 1:500k scale).
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [21]:
# Keep only the columns used later: STUSPS (presumably the postal abbreviation),
# NAME (the join key for the population and store-count merges) and the shape.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [22]:
# sheet_name=None makes read_excel load every worksheet and return them as a
# dict keyed by sheet name; the sheet of interest is picked out in the next cell.
population_workbook = "data/NST-EST2023-POP.xlsx"
state_populations = pd.read_excel(
    io=population_workbook,
    engine="openpyxl",
    sheet_name=None,
)

In [23]:
# The area column is headed by the sheet's descriptive note and the population
# column has no header at all, so both are given readable names here.
area_header = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): presumably the most recent estimate column - confirm against the workbook.
population_header = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2023-POP"][[area_header, population_header]]
    .rename(columns={area_header: "NAME", population_header: "POPULATION"})
    # Per the header note, sub-part rows are prefixed with dots. Strip only those
    # dots so that rows without a prefix keep their first letter.
    .assign(NAME=lambda frame: frame["NAME"].str.lstrip("."))
)

In [24]:
# Left join keeps every state shape even when no population row matches NAME;
# the trailing selection fixes the column order.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    right=state_populations_df,
    how="left",
    on="NAME",
)[population_columns]

## Get Subway Data

In [39]:
# Scraper object used to request the Subway directory page in a later cell.
scraper = cloudscraper.create_scraper()

In [40]:
# Directory page whose list items are parsed into name -> count pairs below.
url = "https://restaurants.subway.com/united-states"

In [41]:
# Fetch the directory page. The timeout keeps a stalled connection from hanging
# the kernel, and raise_for_status() stops the notebook on an HTTP error rather
# than letting an error page be parsed into an empty list of states.
r = scraper.get(url, timeout=30)
r.raise_for_status()

In [42]:
# Parse the response body and collect every <li> that carries the
# "Directory-listItem" class.
soup = BeautifulSoup(r.text, features="html.parser")
lis = soup.find_all("li", class_="Directory-listItem")

In [44]:
# Build {link text: count} from the directory list items.
count_dict = {}
for li in lis:
    a = li.find("a")
    # A missing anchor or missing data-count attribute is treated like an empty
    # count, so the item is reported below instead of raising.
    raw_count = a.get("data-count", "") if a is not None else ""
    # The count is embedded in text; use the first run of digits, if any.
    count_match = re.search(r"\d+", raw_count)
    if count_match is None:
        # No usable number: show the item so it can be inspected, and skip it.
        print(li)
        continue
    count_dict[a.text] = int(count_match.group())

<li class="Directory-listItem js-unspecified-item is-hidden"><a class="Directory-listLink js-unspecified-link" data-count="" data-ya-track="todirectory" href="united-states/additional-locations"><span class="Directory-listLinkText">Additional Locations</span></a></li>


In [48]:
# Two-column frame of the scraped counts: dict keys become NAME, values COUNT.
count_df = pd.DataFrame(
    {
        "NAME": list(count_dict.keys()),
        "COUNT": list(count_dict.values()),
    }
)

## Combine With States

In [60]:
# Rename the district so its NAME can match the scraped counts in the merge
# below (presumably the directory labels it "Washington DC" - confirm).
# Only the NAME column is rewritten; the other columns, including geometry,
# are left untouched.
states_with_population_df = states_with_population_df.assign(
    NAME=states_with_population_df["NAME"].replace(
        "District of Columbia", "Washington DC"
    )
)

In [81]:
# Inner join on NAME: only states that also appear in the scraped counts remain.
subways_by_states_gdf = states_with_population_df.merge(
    right=count_df,
    how="inner",
    on="NAME",
)

In [82]:
# Stores per N residents, for several values of N.
# Rows without a population (or count) cannot be normalised, and a NaN rate
# would make the integer casts below raise, so those rows are removed first.
rated_states = subways_by_states_gdf.dropna(subset=["COUNT", "POPULATION"])

# (output column, residents per unit, round to a whole number?)
per_capita_spec = [
    ("per_1000", 1_000, False),
    ("per_10k", 10_000, False),
    ("per_100k", 100_000, True),
    ("per_500k", 500_000, False),
    ("per_1m", 1_000_000, True),
]

per_capita_columns = {}
for column_name, residents_per_unit, as_whole_number in per_capita_spec:
    rate = rated_states["COUNT"] / (rated_states["POPULATION"] / residents_per_unit)
    if as_whole_number:
        rate = rate.round(decimals=0).astype(int)
    per_capita_columns[column_name] = rate

# assign() adds the columns in spec order and returns a new frame, so the cell
# can be re-run without stacking changes onto a mutated frame.
subways_by_states_gdf = rated_states.assign(**per_capita_columns)

In [83]:
# Discard every row that has a missing value in any column.
subways_by_states_gdf = subways_by_states_gdf.dropna(axis="index", how="any")

In [84]:
# Reproject to EPSG:9311 (presumably a US equal-area projection - confirm)
# and write the result as a GeoPackage. The driver is named explicitly rather
# than left to be inferred from the file extension.
subways_by_states_gdf = subways_by_states_gdf.to_crs(epsg=9311)
subways_by_states_gdf.to_file("data/stores_by_states.gpkg", driver="GPKG")