In [102]:
import json
import os
import random
import time
from urllib.parse import urljoin

In [103]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [104]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [105]:
# Location of the state boundary shapefile, relative to the working directory
# (judging by the file name, the Census Bureau 2018 1:500k state boundaries).
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
# Load the shapefile with geopandas.
states_df = gpd.read_file(file_path)

In [43]:
# Keep only the state abbreviation, the state name and the geometry.
# NOTE(review): this cell's saved execution count is out of sequence with its
# neighbours; confirm the notebook still works with Restart Kernel -> Run All.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Krispy Kreme (KK) Data

In [110]:
# Shared scraper object; the requests to the Krispy Kreme site in the next
# cell all go through scraper.get().
scraper = cloudscraper.create_scraper()

In [111]:
# Crawl the Krispy Kreme store directory (state -> city -> store) and cache the
# store points of each state as data/states/<state_code>.gpkg. A state whose
# file already exists is skipped, so an interrupted run can simply be re-run.
url_root = "https://site.krispykreme.com"
states_dir = "data/states"

# Make sure the cache folder exists before the first state file is written.
os.makedirs(states_dir, exist_ok=True)

r = scraper.get(f"{url_root}/index.html")
soup = BeautifulSoup(r.text, "html.parser")
state_lis = soup.find_all("li", {"class": "directory-list-item"})
for state_li in tqdm(state_lis, desc="Parsing Through States"):
    state_code = state_li.find("a")["href"].split("/")[1]
    state_file = f"{states_dir}/{state_code}.gpkg"

    # Already scraped on an earlier run.
    if os.path.isfile(state_file):
        continue

    state_store_list = []

    # URLs are joined with urljoin, not os.path.join: the latter uses the
    # platform's path separator and drops the root when the href starts with "/".
    state_url = urljoin(f"{url_root}/", state_code)
    r_state = scraper.get(state_url)
    state_soup = BeautifulSoup(r_state.text, "html.parser")
    city_divs = state_soup.find_all("div", {"class": "directory-list-item"})

    for city_div in tqdm(city_divs, desc=f"Parsing Locations in {state_code}"):
        # Keep only the first two path segments of the link so we always land
        # on the city listing page.
        city_url_path = "/".join(city_div.find("a")["href"].split("/")[:2])
        city_url = urljoin(f"{url_root}/", city_url_path)
        r_city = scraper.get(city_url)
        city_soup = BeautifulSoup(r_city.text, "html.parser")
        store_divs = city_soup.find_all("div", {"class": "directory-link-wrapper"})
        for store_div in store_divs:
            # Store links are presumably relative ("../..."); resolve them
            # against the city page the way a browser would, instead of
            # text-replacing "../" with the root (which glues host and path
            # together without a "/").
            store_url = urljoin(city_url, store_div.find("a")["href"])
            # The request stays outside the try block on purpose: a network
            # failure should abort the run before an incomplete state file is
            # written and then skipped forever by the cache check above.
            r_store = scraper.get(store_url)
            try:
                store_soup = BeautifulSoup(r_store.text, "html.parser")
                store_dict = json.loads(
                    store_soup.find("script", {"type": "application/ld+json"}).text
                )
                coords = store_dict["geo"]
                # float() accepts coordinates given either as numbers or as
                # numeric strings.
                pt = Point(float(coords["longitude"]), float(coords["latitude"]))
                state_store_list.append({"STUSPS": state_code.upper(), "geometry": pt})
            except Exception as e:
                # Best effort: report the store page that could not be parsed
                # and carry on with the next one.
                print(store_url, e)

            # Random pause between store requests.
            time.sleep(random.uniform(0.01, 0.5))

    # Only write a file when at least one store was parsed for this state.
    if state_store_list:
        state_store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_store_gdf.to_file(state_file)

Parsing Locations in al:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in ak:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in az:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in ar:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in ca:   0%|          | 0/39 [00:00<?, ?it/s]

Parsing Locations in co:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in ct:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in de:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in fl:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Locations in ga:   0%|          | 0/25 [00:00<?, ?it/s]

Parsing Locations in hi:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in id:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in il:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations in ia:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in in:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in ks:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in ky:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in la:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in md:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations in mi:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in ms:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in mo:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in mt:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in ne:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in nv:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in nj:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations in nm:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in ny:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in nc:   0%|          | 0/23 [00:00<?, ?it/s]

Parsing Locations in oh:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in ok:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in or:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in pa:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in sc:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in tn:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in tx:   0%|          | 0/18 [00:00<?, ?it/s]

Parsing Locations in ut:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in va:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in wa:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in dc:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in wv:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in wi:   0%|          | 0/1 [00:00<?, ?it/s]

In [112]:
# Read every cached per-state GeoPackage back into a list of frames.
store_gdfs = []
states_path = "data/states"
# os.listdir returns names in arbitrary order; sorting keeps the row order of
# the combined output identical from run to run.
for file in sorted(os.listdir(states_path)):
    # Ignore anything in the folder that is not a GeoPackage.
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_gdf = gpd.read_file(constructed_path)
    store_gdfs.append(state_gdf)

In [113]:
# Stack the per-state frames into one table with a fresh 0..n-1 index, then
# wrap the result as a GeoDataFrame.
all_stores_df = pd.concat(store_gdfs, ignore_index=True)
kripsy_kreme_locations_gdf = gpd.GeoDataFrame(all_stores_df)

In [115]:
# Save the combined store points to a single file.
kripsy_kreme_locations_gdf.to_file("data/krispy_kreme_locations.gpkg")

In [116]:
# Number of stores per state: one row per STUSPS value, count in column "KKs".
kripsy_kreme_counts_df = (
    kripsy_kreme_locations_gdf.groupby("STUSPS")
    .size()
    .reset_index(name="KKs")
)

## Get Population Data

In [117]:
# sheet_name=None asks pandas for every sheet in the workbook; the result is
# looked up by sheet name in the next cell.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [118]:
# The sheet has no real header row, so pandas uses the sheet's title text as
# the label of the first column and "Unnamed: <position>" for the others.
title_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): "Unnamed: 5" is the sixth column of the sheet; confirm that it
# holds the estimate year this analysis is meant to use.
population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [title_column, population_column]
].rename(columns={title_column: "NAME", population_column: "POPULATION"})

# Strip only the leading dots that mark sub-parts. Slicing off the first
# character unconditionally would also mangle rows that have no leading dot.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [119]:
# Attach the population figure to each state shape by name. The left join
# keeps every shape, including those with no matching population row.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[population_columns]

In [120]:
# Drop every row with a missing value in any column, e.g. shapes for which
# the left merge above found no population figure.
states_with_population_df = states_with_population_df.dropna()

## Merge Data

In [121]:
# Attach the store count to every state. The left join leaves "KKs" empty for
# states without any store, so those become 0. Only "KKs" is filled: a blanket
# fillna(0) would also put 0 into any other column that has a missing value.
kripsy_kreme_counts_gdf = states_with_population_df.merge(
    kripsy_kreme_counts_df, on="STUSPS", how="left"
).fillna({"KKs": 0})

In [122]:
# Stores per 100,000 and per 1,000,000 residents.
store_counts = kripsy_kreme_counts_gdf["KKs"]
population = kripsy_kreme_counts_gdf["POPULATION"]
kripsy_kreme_counts_gdf["per_100k"] = store_counts / (population / 100000)
kripsy_kreme_counts_gdf["per_1m"] = store_counts / (population / 1_000_000)

In [123]:
# Reproject the state geometries to the CRS identified by 9311.
# NOTE(review): presumably EPSG:9311, a US equal-area projection chosen for
# mapping; confirm this is the intended target.
kripsy_kreme_counts_gdf = kripsy_kreme_counts_gdf.to_crs(9311)

In [124]:
# Save the per-state table (shape, population, store count, per-capita rates).
kripsy_kreme_counts_gdf.to_file("data/krispy_kremes_per_state.gpkg")