In [1]:
import json
import os

In [29]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import numpy as np
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# The Census cartographic-boundary shapefile lives in a folder that shares
# its base name, so the path is assembled from that single name.
shapefile_name = "cb_2018_us_state_500k"
file_path = f"data/{shapefile_name}/{shapefile_name}.shp"
states_df = gpd.read_file(file_path)

In [5]:
# Keep only the postal code, the state name and the geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [6]:
# sheet_name=None makes read_excel return a dict of {sheet name: DataFrame}
# covering every sheet in the workbook.
population_workbook = "data/NST-EST2023-POP.xlsx"
state_populations = pd.read_excel(
    population_workbook,
    engine="openpyxl",
    sheet_name=None,
)

In [7]:
# The sheet's first row is used as the header, so the area-name column is
# labelled with the sheet's descriptive text and the remaining columns are
# auto-named "Unnamed: N".
# NOTE(review): "Unnamed: 5" is presumably the most recent population
# estimate in the workbook -- confirm against the spreadsheet.
area_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2023-POP"][
    [area_name_column, population_column]
]
state_populations_df = state_populations_df.rename(
    columns={
        area_name_column: "NAME",
        population_column: "POPULATION",
    }
)
# Sub-part rows are prefixed with dots (see the column label above). Strip
# only those dots: slicing off the first character unconditionally would
# also mangle rows that have no dot prefix.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [8]:
# Left join keeps every row of states_df; areas without a matching NAME in
# the population table end up with a missing POPULATION.
merged_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[merged_columns]

## Get Bojangles Data

In [9]:
# Shared HTTP client created through cloudscraper; the scraping cell below
# issues every page request (state, location and store pages) through it.
scraper = cloudscraper.create_scraper()

In [12]:
store_list = []
url_base = "https://locations.bojangles.com/"
states_dir = "data/states"

# Per-state results are cached as GeoPackages in this directory, so it has to
# exist before the first state is written.
os.makedirs(states_dir, exist_ok=True)


def _fetch_soup(page_url):
    """Download ``page_url`` with the shared ``scraper`` and parse it as HTML.

    A non-2xx response raises right away. Stopping the cell keeps the current
    state from being cached with missing stores; re-running the cell retries
    that state because its cache file has not been written yet.
    """
    response = scraper.get(page_url)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


for state in tqdm(
    states_df[["STUSPS", "NAME"]].itertuples(index=False),
    total=len(states_df),
    desc="Parsing States",
):
    state_code = state.STUSPS.lower()
    state_file = f"{states_dir}/{state_code}.gpkg"
    state_store_list = []

    # Already scraped in an earlier run: reuse the cached stores so that the
    # combined file written at the end still covers every state instead of
    # only the ones scraped during this run.
    if os.path.isfile(state_file):
        cached_state_gdf = gpd.read_file(state_file)
        store_list.extend(
            {"STATE": cached.STATE, "geometry": cached.geometry}
            for cached in cached_state_gdf.itertuples(index=False)
        )
        continue

    # Get all locations in a state. URLs are built by plain concatenation;
    # os.path.join would use the platform's path separator.
    state_response = scraper.get(f"{url_base}{state_code}.html")

    # NOTE(review): any non-200 answer is treated as "no page for this state"
    # and the state name is printed -- confirm that transient errors should
    # not be retried here.
    if state_response.status_code != 200:
        print(state.NAME)
        continue

    soup = BeautifulSoup(state_response.text, "html.parser")
    locations_as = soup.find_all(
        "a",
        {
            "class": "Link flex lg:inline-block w-full lg:w-fit items-center justify-between"
        },
    )

    # Parse all locations in a state
    for locations_a in tqdm(locations_as, desc=f"Parsing Locations In {state_code}"):
        location_url = url_base + locations_a.attrs["href"]
        location_soup = _fetch_soup(location_url)
        details_divs = location_soup.find_all(
            "div",
            {
                "class": "flex gap-2 p-4 justify-center items-center flex-1 rounded-br-[10px] border-t border-l border-gray-300 bg-white hover:bg-brand-gray-200"
            },
        )

        # Parse through all stores
        for detail_div in details_divs:
            details_a = detail_div.find(
                "a",
                {
                    "class": "Link link-underline font-normal text-[12px] leading-normal font-secondary text-black"
                },
            )
            # Store links are relative ("../..."); rebase them onto the site root.
            store_url = details_a.attrs["href"].replace("../", url_base)

            store_soup = _fetch_soup(store_url)
            # The first JSON-LD <script> of the store page holds the coordinates.
            store_info_dict = json.loads(
                store_soup.find_all("script", {"type": "application/ld+json"})[0].text
            )
            coords = store_info_dict["@graph"][0]["geo"]
            point = Point(coords["longitude"], coords["latitude"])
            store_dict = {
                "STATE": state_code,
                "geometry": point,
            }
            store_list.append(store_dict)
            state_store_list.append(store_dict)

    # Only a fully parsed state reaches this point, so the cache file is never
    # written for a state whose scrape was interrupted.
    if state_store_list:
        bojangles_state_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        bojangles_state_gdf.to_file(state_file)

# An empty list has no geometry column to attach the CRS to, so skip the
# combined export when no store was collected at all.
if store_list:
    bojangles_state_gdf = gpd.GeoDataFrame(store_list, crs=4326)
    bojangles_state_gdf.to_file("data/stores.gpkg")

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Locations In nc:   0%|          | 0/193 [00:00<?, ?it/s]

Oklahoma


Parsing Locations In va:   0%|          | 0/55 [00:00<?, ?it/s]

Parsing Locations In wv:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In la:   0%|          | 0/4 [00:00<?, ?it/s]

Michigan
Massachusetts
Idaho


Parsing Locations In fl:   0%|          | 0/7 [00:00<?, ?it/s]

Nebraska
Washington
New Mexico
Puerto Rico
South Dakota


Parsing Locations In tx:   0%|          | 0/11 [00:00<?, ?it/s]

California


Parsing Locations In al:   0%|          | 0/29 [00:00<?, ?it/s]

Parsing Locations In ga:   0%|          | 0/84 [00:00<?, ?it/s]

Parsing Locations In pa:   0%|          | 0/2 [00:00<?, ?it/s]

Missouri
Colorado
Utah


Parsing Locations In tn:   0%|          | 0/51 [00:00<?, ?it/s]

Wyoming
New York
Kansas
Alaska


Parsing Locations In nv:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In il:   0%|          | 0/3 [00:00<?, ?it/s]

Vermont
Montana
Iowa


Parsing Locations In sc:   0%|          | 0/92 [00:00<?, ?it/s]

New Hampshire
Arizona
District of Columbia
American Samoa
United States Virgin Islands
New Jersey


Parsing Locations In md:   0%|          | 0/1 [00:00<?, ?it/s]

Maine
Hawaii
Delaware
Guam
Commonwealth of the Northern Mariana Islands
Rhode Island


Parsing Locations In ky:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In oh:   0%|          | 0/4 [00:00<?, ?it/s]

Wisconsin
Oregon
North Dakota


Parsing Locations In ar:   0%|          | 0/2 [00:00<?, ?it/s]

Indiana
Minnesota
Connecticut


#### If the cell above breaks, just keep re-running it until it completes. Then run the cells below.

In [16]:
bojangles_state_gdfs = []
states_path = "data/states"
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the combined frame (and of the file written from it) stable between runs.
for file in sorted(os.listdir(states_path)):
    # Only the per-state GeoPackages are of interest; skip anything else that
    # may sit in the directory.
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_bojangle_gdf = gpd.read_file(constructed_path)
    bojangles_state_gdfs.append(state_bojangle_gdf)

In [17]:
# Stack the per-state frames into one table with a fresh 0..n-1 index.
all_state_stores = pd.concat(bojangles_state_gdfs, ignore_index=True)
bojangle_state_gdf = gpd.GeoDataFrame(all_state_stores)

In [18]:
# Reproject the store points to EPSG:9311 and overwrite the combined file.
bojangle_state_gdf = bojangle_state_gdf.to_crs(epsg=9311)
bojangle_state_gdf.to_file("data/stores.gpkg")

## Combine With States

In [19]:
# Number of stores per (lower-case) state code.
bojangle_state_counts_series = bojangle_state_gdf.groupby("STATE").size()
# Turn the index into a "STUSPS" column and the counts into a "stores" column.
bojangles_state_counts_df = bojangle_state_counts_series.rename_axis(
    "STUSPS"
).reset_index(name="stores")

In [20]:
# The scraped codes are lower-case; the states table uses upper-case codes.
upper_case_codes = bojangles_state_counts_df["STUSPS"].str.upper()
bojangles_state_counts_df["STUSPS"] = upper_case_codes

In [21]:
# Left join from the states table so that states without any store are kept
# (their "stores" value is missing after the merge).
store_counts_by_code = bojangles_state_counts_df
bojangles_state_counts_df = states_with_population_df.merge(
    store_counts_by_code, how="left", on="STUSPS"
)

In [23]:
# Every missing value in the frame becomes 0, then the store count is stored
# as a whole number again.
bojangles_state_counts_df = bojangles_state_counts_df.fillna(value=0)
store_counts_as_int = bojangles_state_counts_df["stores"].astype(int)
bojangles_state_counts_df["stores"] = store_counts_as_int

In [24]:
# Stores per N residents, for several values of N, followed by the inverse
# ratio (residents per store).
stores_per_state = bojangles_state_counts_df["stores"]
population_per_state = bojangles_state_counts_df["POPULATION"]

residents_per_unit = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, residents in residents_per_unit.items():
    bojangles_state_counts_df[rate_column] = stores_per_state / (
        population_per_state / residents
    )

bojangles_state_counts_df["people_per_store"] = (
    population_per_state / stores_per_state
)

In [33]:
# Dividing by a store count of 0 yields +/-inf; store those cells as 0.
infinite_values = [np.inf, -np.inf]
bojangles_state_counts_df.replace(infinite_values, 0, inplace=True)

In [36]:
# Rows whose population and store count are both 0 give 0 / 0 = NaN, and
# astype(int) raises on NaN. Map those to 0 first, the same value used for
# the infinite ratios, so the cast cannot fail.
bojangles_state_counts_df["people_per_store"] = (
    bojangles_state_counts_df["people_per_store"].fillna(0).astype(int)
)

In [37]:
# Discard every row that still has a missing value in any column.
bojangles_state_counts_df = bojangles_state_counts_df.dropna(axis=0, how="any")

In [38]:
# Reproject the state polygons to EPSG:9311 and write the per-state summary.
bojangles_state_counts_df = bojangles_state_counts_df.to_crs(epsg=9311)
bojangles_state_counts_df.to_file("data/stores_by_states.gpkg")