In [11]:
import os
import random
import time

In [12]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import numpy as np
import pandas as pd

In [13]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [14]:
# Load the state boundary shapefile into a GeoDataFrame (one row per feature).
# NOTE(review): the file name suggests the 2018 Census cartographic boundary
# file at 1:500k resolution — confirm the data source and vintage.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [15]:
# Keep only the columns used downstream: the state code (STUSPS), the state
# name (NAME, the join key for the population table) and the geometry.
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Population data

In [16]:
# Read the whole population workbook. With sheet_name=None pandas returns a
# dict mapping each sheet name to its own DataFrame instead of a single frame.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    sheet_name=None,
    engine="openpyxl",
)

In [17]:
# Build a two-column (NAME, POPULATION) table from the population sheet.
#
# The sheet has no clean header row, so the first column ends up named after
# the sheet's descriptive title text and the rest are auto-named "Unnamed: N".
population_name_header = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"

# NOTE(review): "Unnamed: 5" is a positional, auto-generated column name —
# confirm it is the estimate year intended for POPULATION.
state_populations_df = state_populations["NST-EST2024-POP"][
    [population_name_header, "Unnamed: 5"]
].rename(
    columns={
        population_name_header: "NAME",
        "Unnamed: 5": "POPULATION",
    }
)

# Per the header text, sub-part rows are prefixed with leading dots. Strip
# only those dots: slicing off the first character unconditionally would also
# truncate any label that has no leading dot, so it could never match on NAME.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [18]:
# Attach the population figure to every state geometry. The left join keeps
# all rows of states_df; rows without a matching NAME get a missing POPULATION.
# The trailing selection fixes the column order of the result.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get Hardees Data

In [19]:
# Single cloudscraper session reused for every request in the scraping cell below.
scraper = cloudscraper.create_scraper()

In [21]:
# Scrape store coordinates from the Hardee's location directory, one state at
# a time. Each state's stores are written to data/states/<STUSPS>.gpkg as soon
# as the state is finished, so an interrupted run can be resumed by re-running
# this cell: states that already have a file are skipped.
#
# store_list collects every store scraped during THIS run only (cached states
# contribute nothing to it).
store_list = []
url_base = "https://locations.hardees.com/"

# Make sure the per-state cache directory exists before the first write.
os.makedirs("data/states", exist_ok=True)

r = scraper.get(url_base)
states_soup = BeautifulSoup(r.text, "html.parser")
states_a = states_soup.find_all("a", {"class": "Directory-listLink"})
for state_a in tqdm(states_a, desc="Parsing States"):
    state_name = state_a.text.strip()
    state_url = state_a.attrs["href"]

    # Get state code: map the directory's state name onto its STUSPS code.
    state_code_matches = states_with_population_df.loc[
        states_with_population_df["NAME"] == state_name, "STUSPS"
    ]
    if state_code_matches.empty:
        # An unmatched name would otherwise abort the whole run (IndexError).
        print(f"No state code found for {state_name!r}")
        continue
    state_code = state_code_matches.iloc[0]

    state_store_list = []

    # If the state's file already exists it was scraped earlier; skip it.
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    # Get all locations in a state
    r_state_locations = scraper.get(state_url)

    if r_state_locations.status_code != 200:
        # Report the failed state; no file is written, so a re-run retries it.
        print(state_code)
        continue

    state_locations_soup = BeautifulSoup(r_state_locations.text, "html.parser")
    locations_as = state_locations_soup.find_all("a", {"class": "Directory-listLink"})

    # Parse all locations in a state
    for locations_a in tqdm(locations_as, desc=f"Parsing Locations In {state_code}"):
        location_url = locations_a.attrs["href"]

        r_location = scraper.get(location_url)
        location_soup = BeautifulSoup(r_location.text, "html.parser")
        store_as = location_soup.find_all("a", {"class": "Teaser-ctaLink"})

        # Parse through all stores
        for store_a in store_as:
            store_url = store_a.attrs["href"]
            r_store = scraper.get(store_url)
            # Small randomized pause after every store request, whether or not
            # the page turns out to be parseable.
            time.sleep(random.uniform(0.01, 0.5))
            store_soup = BeautifulSoup(r_store.text, "html.parser")

            try:
                lat = float(
                    store_soup.find(
                        "meta", {"property": "place:location:latitude"}
                    ).attrs["content"]
                )
                lon = float(
                    store_soup.find(
                        "meta", {"property": "place:location:longitude"}
                    ).attrs["content"]
                )
            except (AttributeError, KeyError, TypeError, ValueError) as e:
                # Missing meta tag, missing "content" attribute, or a value
                # that is not a number. Skip this store: falling through would
                # reuse the previous store's coordinates (or raise NameError
                # on the very first store).
                print(e)
                print(store_url)
                continue

            # shapely points are (x, y), i.e. (longitude, latitude).
            point = Point(lon, lat)
            store_dict = {
                "STATE": state_code,
                "geometry": point,
            }
            store_list.append(store_dict)
            state_store_list.append(store_dict)

    # Persist the state only if at least one store was parsed, so an empty
    # result is retried on the next run instead of being cached.
    if state_store_list:
        hardees_state_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        hardees_state_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/31 [00:00<?, ?it/s]

Parsing Locations In AL:   0%|          | 0/78 [00:00<?, ?it/s]

Parsing Locations In AR:   0%|          | 0/20 [00:00<?, ?it/s]

Parsing Locations In DE:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations In FL:   0%|          | 0/73 [00:00<?, ?it/s]

Parsing Locations In IN:   0%|          | 0/52 [00:00<?, ?it/s]

Parsing Locations In KS:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations In KY:   0%|          | 0/62 [00:00<?, ?it/s]

Parsing Locations In LA:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In MD:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations In MI:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations In MN:   0%|          | 0/32 [00:00<?, ?it/s]

Parsing Locations In MO:   0%|          | 0/55 [00:00<?, ?it/s]

Parsing Locations In MS:   0%|          | 0/34 [00:00<?, ?it/s]

Parsing Locations In MT:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations In NC:   0%|          | 0/170 [00:00<?, ?it/s]

Parsing Locations In ND:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations In NE:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations In NY:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In OH:   0%|          | 0/34 [00:00<?, ?it/s]

Parsing Locations In OK:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In PA:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations In SC:   0%|          | 0/92 [00:00<?, ?it/s]

Parsing Locations In SD:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations In TN:   0%|          | 0/108 [00:00<?, ?it/s]

Parsing Locations In VA:   0%|          | 0/110 [00:00<?, ?it/s]

Parsing Locations In WI:   0%|          | 0/31 [00:00<?, ?it/s]

Parsing Locations In WV:   0%|          | 0/20 [00:00<?, ?it/s]

Parsing Locations In WY:   0%|          | 0/2 [00:00<?, ?it/s]

#### If the cell above breaks, just keep re-running it until it completes. Then run the cells below.

In [22]:
# Load every cached per-state GeoPackage written by the scraping cell.
hardees_state_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined frame reproducible from run to run.
for file in sorted(os.listdir(states_path)):
    # Ignore anything in the directory that is not a GeoPackage.
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_hardees_gdf = gpd.read_file(constructed_path)
    hardees_state_gdfs.append(state_hardees_gdf)

In [23]:
# Stack all per-state frames into one GeoDataFrame with a fresh 0..n-1 index.
# NOTE(review): pd.concat raises if hardees_state_gdfs is empty, i.e. when no
# state file has been written yet — run the scraping cell first.
hardees_state_gdf = gpd.GeoDataFrame(pd.concat(hardees_state_gdfs, ignore_index=True))

In [24]:
# Reproject the store points to EPSG:9311 and write the combined layer to disk.
hardees_state_gdf = hardees_state_gdf.to_crs(epsg=9311)
hardees_state_gdf.to_file("data/hardees.gpkg")

## Combine With States

In [25]:
# Number of stores per state code, first as a Series indexed by STATE ...
hardees_state_counts_series = hardees_state_gdf.groupby("STATE").size()

# ... then as a flat two-column frame (STUSPS, stores) ready to merge.
hardees_state_state_counts_df = (
    hardees_state_counts_series.rename("stores")
    .rename_axis("STUSPS")
    .reset_index()
)

In [26]:
# Upper-case the state codes so they compare equal to the STUSPS values of
# the state table in the merge that follows.
hardees_state_state_counts_df = hardees_state_state_counts_df.assign(
    STUSPS=lambda counts: counts["STUSPS"].str.upper()
)

In [27]:
# Left-join the store counts onto the state table: every state row is kept,
# and states without any scraped store get a missing "stores" value.
hardees_state_state_counts_gdf = pd.merge(
    states_with_population_df,
    hardees_state_state_counts_df,
    how="left",
    on="STUSPS",
)

In [28]:
# Replace missing values with 0, then store the count as an integer.
# NOTE(review): fillna(0) is applied to every column, not only "stores", so a
# missing POPULATION also becomes 0 here — confirm that is intended.
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.fillna(0).astype(
    {"stores": int}
)

In [29]:
# Stores per N residents, one column per denominator.
# The two smallest denominators are stored unrounded ...
for rate_column, residents in (("per_1000", 1000), ("per_10k", 10_000)):
    hardees_state_state_counts_gdf[rate_column] = hardees_state_state_counts_gdf[
        "stores"
    ] / (hardees_state_state_counts_gdf["POPULATION"] / residents)

# ... while the larger ones are rounded to two decimals.
for rate_column, residents in (
    ("per_100k", 100000),
    ("per_500k", 500_000),
    ("per_1m", 1_000_000),
):
    hardees_state_state_counts_gdf[rate_column] = (
        hardees_state_state_counts_gdf["stores"]
        / (hardees_state_state_counts_gdf["POPULATION"] / residents)
    ).round(decimals=2)

# Inverse measure: residents per store, rounded to two decimals. Rows with
# zero stores divide by zero here.
hardees_state_state_counts_gdf["people_per_store"] = (
    hardees_state_state_counts_gdf["POPULATION"]
    / hardees_state_state_counts_gdf["stores"]
).round(decimals=2)

In [30]:
# Division by zero above (e.g. people_per_store for a state with 0 stores)
# yields +/-inf; store those as 0 instead.
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.replace(
    to_replace=[np.inf, -np.inf], value=0
)

In [31]:
# Drop every row that still has a missing value in any column (for instance a
# rate computed as 0 / 0, which is NaN rather than inf).
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.dropna()

In [32]:
# Store residents-per-store as a whole number (the cast drops the fraction).
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.astype(
    {"people_per_store": int}
)

In [33]:
# Second dropna pass. The only step since the previous dropna is an integer
# cast, which cannot introduce missing values, so this is expected to be a no-op.
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.dropna()

In [34]:
# Reproject the per-state polygons to EPSG:9311 (the same CRS used for the
# store points) and write the final per-state layer.
hardees_state_state_counts_gdf = hardees_state_state_counts_gdf.to_crs(epsg=9311)
hardees_state_state_counts_gdf.to_file("data/hardees_per_state.gpkg")