In [1]:
import os
import re
import random
import json
import time
import html

In [2]:
from urllib.parse import urljoin

In [3]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [4]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [81]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [82]:
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Food Data

In [83]:
scraper = cloudscraper.create_scraper()

In [84]:
r = scraper.get("https://www.huddlehouse.com/locations/in/new-haven/3105-s-doyle-rd/")
soup = BeautifulSoup(r.text, "html.parser")
nuxt_script = soup.find("script", text=re.compile(r"^\s*window\.__NUXT__"))

  nuxt_script = soup.find("script", text=re.compile(r"^\s*window\.__NUXT__"))


In [85]:
js_code = nuxt_script.text
# Extract everything after "allLocations:"
match = re.search(r"allLocations\s*:\s*(.*)", js_code, re.DOTALL)
after_alllocations = match.group(1).strip()

In [86]:
pattern = re.compile(
    r'lat\s*:\s*([-]?\d+\.\d+)\s*,\s*lng\s*:\s*([-]?\d+\.\d+).*?path\s*:\s*"([^"]+)"',
    re.DOTALL,
)
results = pattern.findall(after_alllocations)

In [89]:
locations = [
    {
        "geometry": Point(-84.9801083278705, 41.05931176150077),
        "STUSPS": "IN",
        "URL": "https://www.huddlehouse.com/locations/in/new-haven/3105-s-doyle-rd",
    }
]
for lat, lng, path in results:
    state_code = path.split("/")[0].upper()
    url = os.path.join("https://www.huddlehouse.com/locations", path)
    # Andalusia is incorrect
    if path == "al/andalusia/1209-east-bypass":
        print("Fixed andalusia")
        point = Point(-86.45991768086215, 31.318963558489788)
    elif path == "sc/marion/2424-e-highway-76":
        print("Fixed marion")
        point = Point(-79.36882790814433, 34.1835977478983)
    else:
        point = Point(float(lng), float(lat))
    locations.append({"geometry": point, "STUSPS": state_code, "URL": url})

Fixed andalusia
Fixed marion


In [90]:
huddlehouse_gdf = gpd.GeoDataFrame(locations, crs=4326)

In [91]:
huddlehouse_gdf = huddlehouse_gdf.to_crs(9311)
huddlehouse_gdf.to_file(f"data/stores.gpkg")

## Get Population Data

In [92]:
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [93]:
state_populations_df = state_populations["NST-EST2024-POP"][
    [
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)",
        "Unnamed: 5",
    ]
]
state_populations_df = state_populations_df.rename(
    columns={
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

In [94]:
states_with_population_df = states_df.merge(state_populations_df, on="NAME", how="left")
states_with_population_df = states_with_population_df[
    ["STUSPS", "NAME", "POPULATION", "geometry"]
]

In [95]:
states_with_population_df = states_with_population_df.dropna()

## Merge Data

In [96]:
huddlehouse_count_df = pd.DataFrame(
    huddlehouse_gdf.groupby("STUSPS").size(), columns=["HHs"]
).reset_index()

In [97]:
huddlehouse_count_gdf = states_with_population_df.merge(
    huddlehouse_count_df, on="STUSPS", how="left"
)
huddlehouse_count_gdf = huddlehouse_count_gdf.fillna(0)

In [98]:
huddlehouse_count_gdf["per_100k"] = (
    huddlehouse_count_gdf["HHs"] / (huddlehouse_count_gdf["POPULATION"] / 100000)
).round(decimals=2)
huddlehouse_count_gdf["per_1m"] = (
    huddlehouse_count_gdf["HHs"] / (huddlehouse_count_gdf["POPULATION"] / 1_000_000)
).round(decimals=1)

In [99]:
huddlehouse_count_gdf = huddlehouse_count_gdf.to_crs(9311)

In [100]:
huddlehouse_count_gdf.to_file("data/Huddlehouse_Per_State.gpkg")

In [101]:
huddlehouse_count_gdf[["HHs", "per_100k", "per_1m", "STUSPS"]].sort_values(
    "per_100k", ascending=False
)

Unnamed: 0,HHs,per_100k,per_1m,STUSPS
17,39.0,0.76,7.6,AL
0,17.0,0.58,5.8,MS
18,63.0,0.57,5.7,GA
43,26.0,0.57,5.7,KY
33,17.0,0.32,3.2,SC
48,8.0,0.26,2.6,AR
23,17.0,0.24,2.4,TN
5,9.0,0.2,2.0,LA
20,12.0,0.19,1.9,MO
3,15.0,0.17,1.7,VA
