In [1]:
import os
import re
import random
import json
import time
import html

In [2]:
from urllib.parse import urljoin

In [3]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [4]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [5]:
# Load the US state boundary polygons from the local shapefile.
# NOTE(review): the file name suggests the 2018 Census cartographic boundary
# file at 1:500k resolution — confirm the provenance.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [6]:
# Keep only the columns used later: the two-letter state code (STUSPS, the
# merge key for store counts), the state name (NAME, the merge key for
# population) and the geometry.
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Food Data

In [7]:
# Single HTTP session shared by every request in this notebook.
# NOTE(review): cloudscraper is presumably needed to get past the site's
# anti-bot protection — confirm it is still required.
scraper = cloudscraper.create_scraper()

In [9]:
def get_coords(store_url: str) -> Point:
    """Fetch a store page and return its location as ``Point(lon, lat)``.

    The coordinates are read from the first
    ``<script type="application/ld+json">`` tag on the page, under
    ``geo.latitude`` and ``geo.longitude``, and rounded to six decimals.

    Args:
        store_url: URL of a single store page.

    Returns:
        A shapely ``Point`` built as ``Point(longitude, latitude)``.

    Raises:
        ValueError: if the page contains no ld+json script tag (for example a
            block page or an error page). ``json.JSONDecodeError`` (a
            ``ValueError`` subclass) is raised when the payload cannot be
            parsed even after trimming trailing braces.
        KeyError: if the parsed payload has no ``geo`` coordinates.
    """
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")

    script = soup.find("script", {"type": "application/ld+json"})
    if script is None:
        # Fail with a message that names the page instead of the opaque
        # "'NoneType' object has no attribute 'text'".
        raise ValueError(
            f"no application/ld+json script found at {store_url} "
            f"(HTTP {r.status_code})"
        )

    clean = html.unescape(script.text.strip())
    # json.loads rejects raw control characters inside strings, so collapse
    # every run of them (newlines, tabs, ...) into a single space.
    clean = re.sub(r"[\x00-\x1f]+", " ", clean)

    # Some pages emit surplus closing braces; drop trailing "}" one at a time
    # until the payload parses, and give up as soon as there is none to drop.
    while True:
        try:
            data = json.loads(clean)
            break
        except json.JSONDecodeError:
            if not clean.endswith("}"):
                raise
            clean = clean[:-1].rstrip()

    lat = round(float(data["geo"]["latitude"]), 6)
    lon = round(float(data["geo"]["longitude"]), 6)
    # Point takes (x, y), i.e. longitude first.
    return Point(lon, lat)

In [10]:
def find_cities(url: str) -> list:
    """Collect the city-page URLs linked from a state page.

    Args:
        url: URL of a state page on locations.deltaco.com.

    Returns:
        One absolute URL per ``div`` with class ``city-name``, in page order.
        The list is empty when the page has no such ``div``.
    """
    site_root = "https://locations.deltaco.com"
    response = scraper.get(url)
    page = BeautifulSoup(response.text, "html.parser")

    city_urls = []
    for div in page.find_all("div", {"class": "city-name"}):
        href = div.find("a").attrs["href"]
        # Keep only the first four "/"-separated pieces of the link; the
        # leading empty piece of an href that starts with "/" counts as one.
        city_path = "/".join(href.split("/")[:4])
        city_urls.append(site_root + city_path)
    return city_urls

In [11]:
def find_stores(url: str) -> list:
    """Collect the store-page URLs linked from a city page.

    Args:
        url: URL of a city page on locations.deltaco.com.

    Returns:
        One absolute URL per ``div`` with class ``store-name``, in page order.
        The list is empty when the page has no such ``div``.
    """
    site_root = "https://locations.deltaco.com"
    response = scraper.get(url)
    page = BeautifulSoup(response.text, "html.parser")

    store_urls = []
    for div in page.find_all("div", {"class": "store-name"}):
        href = div.find("a").attrs["href"]
        # Keep only the first five "/"-separated pieces of the link; the
        # leading empty piece of an href that starts with "/" counts as one.
        store_path = "/".join(href.split("/")[:5])
        store_urls.append(site_root + store_path)
    return store_urls

In [14]:
# Debug leftover removed: this cell displayed the loop variable `state_href`,
# which is only defined by the scraping loop in the following cell. It was run
# out of order (stale value '/us/sc') and raises NameError on a fresh kernel
# under Restart & Run All.

In [15]:
# The country landing page links to every state that has locations.
r = scraper.get("https://locations.deltaco.com/us")
soup = BeautifulSoup(r.text, "html.parser")
state_divs = soup.find_all(
    "div", {"class": "city-name col-6 col-sm-4 col-md-3 col-lg-2"}
)
# Keep only the first three "/"-separated pieces of each link, e.g. "/us/sc".
state_hrefs = [
    "/".join(state_div.find("a").attrs["href"].split("/")[:3])
    for state_div in state_divs
]

# The per-state GeoPackages double as a cache; make sure their directory
# exists so the first write does not fail.
os.makedirs("data/states", exist_ok=True)

# Parse all locations in a state
for state_href in tqdm(state_hrefs, desc="Parsing States"):
    state_store_list = []
    state_code = state_href.split("/")[-1].upper()

    # A state that already has a file on disk was scraped by an earlier run.
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    state_url = "https://locations.deltaco.com" + state_href

    city_urls = find_cities(state_url)

    if not city_urls:
        # Report the state so it can be inspected by hand.
        print(state_code, state_url)
        continue

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_code}"):
        store_urls = find_stores(city_url)
        if not store_urls:
            print(city_url)
            continue

        for store_url in store_urls:
            try:
                point = get_coords(store_url)
            except Exception as e:
                # Report and skip this store. Building a row here would use
                # an undefined `point` or the previous store's coordinates.
                print(store_url, e)
            else:
                store_dict = {
                    "STUSPS": state_code,
                    "geometry": point,
                    "url": store_url,
                }
                state_store_list.append(store_dict)
            # Throttle after every request, successful or not.
            time.sleep(random.uniform(0.01, 0.5))

    # Coordinates are longitude/latitude, hence EPSG:4326.
    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/19 [00:00<?, ?it/s]

Parsing Locations in AL:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in AZ:   0%|          | 0/21 [00:00<?, ?it/s]

Parsing Locations in CA:   0%|          | 0/179 [00:00<?, ?it/s]

Parsing Locations in CO:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations in FL:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in GA:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations in ID:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in MI:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations in MS:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in NV:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations in NM:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in NC:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in OH:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in OK:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in OR:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in SC:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in UT:   0%|          | 0/29 [00:00<?, ?it/s]

Parsing Locations in VA:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in WA:   0%|          | 0/7 [00:00<?, ?it/s]

In [16]:
# Read back every per-state GeoPackage found in the states directory.
store_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sort them so the combined
# frame has the same row order on every run.
for file_name in sorted(os.listdir(states_path)):
    if not file_name.endswith(".gpkg"):
        continue
    state_gdf = gpd.read_file(os.path.join(states_path, file_name))
    store_gdfs.append(state_gdf)

In [17]:
# Stack the per-state frames into one frame with a fresh 0..n-1 index.
# pd.concat raises ValueError when store_gdfs is empty (no state files found).
del_taco_gdf = pd.concat(store_gdfs, ignore_index=True)

In [18]:
# Preview the first rows of the combined store table.
del_taco_gdf.head()

Unnamed: 0,STUSPS,url,geometry
0,AL,https://locations.deltaco.com/us/al/huntsville...,POINT (-86.67673 34.70951)
1,AL,https://locations.deltaco.com/us/al/phenix-cit...,POINT (-85.02818 32.47034)
2,AZ,https://locations.deltaco.com/us/az/ash-fork/9...,POINT (-112.49748 35.22275)
3,AZ,https://locations.deltaco.com/us/az/avondale/1...,POINT (-112.34041 33.45439)
4,AZ,https://locations.deltaco.com/us/az/bullhead-c...,POINT (-114.5925 35.12012)


In [20]:
# Reproject the store points to EPSG:9311 and save them as one GeoPackage.
del_taco_gdf = del_taco_gdf.to_crs(9311)
# Plain string: the original f-string had no placeholders.
del_taco_gdf.to_file("data/stores.gpkg")

## Get Population Data

In [21]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet
# name; the sheet that is actually used is selected by name in the next cell.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [22]:
# The first column's header is a long caption rather than a real column name;
# define it once instead of repeating the literal.
area_column = (
    "table with row headers in column A and column headers in rows 3 through 4. "
    "(leading dots indicate sub-parts)"
)
# NOTE(review): "Unnamed: 5" is the sixth column of the sheet — confirm it is
# the estimate year you intend to use.
population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [area_column, population_column]
].rename(columns={area_column: "NAME", population_column: "POPULATION"})

# Sub-part rows are marked with a leading dot (see the caption above). Strip
# the dot only where it is present: the previous `.str[1:]` slice also chopped
# the first letter off names that have no dot.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [23]:
# Attach the population figure to each state polygon by state name. The left
# join keeps every state row; states without a match get a missing POPULATION.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[population_columns]

In [24]:
# Drop every row that has a missing value in any column. After the left merge
# this removes the areas for which no population row was matched.
states_with_population_df = states_with_population_df.dropna()

## Merge Data

In [25]:
# Count the stores per state code; the result has one row per state with the
# columns STUSPS and DTs.
stores_per_state = del_taco_gdf.groupby("STUSPS").size()
del_taco_count_df = stores_per_state.reset_index(name="DTs")

In [26]:
# Left join so that states without any store are kept.
del_taco_count_gdf = states_with_population_df.merge(
    del_taco_count_df, on="STUSPS", how="left"
)
# States with no store have no row in the count table and come out of the
# merge with a missing DTs value. Fill only that column: a blanket fillna(0)
# would also try to write 0 into the geometry and text columns.
del_taco_count_gdf["DTs"] = del_taco_count_gdf["DTs"].fillna(0)

In [29]:
# Store density: the store count divided by the population expressed in units
# of 100,000 residents (two decimals) and 1,000,000 residents (one decimal).
store_counts = del_taco_count_gdf["DTs"]
population = del_taco_count_gdf["POPULATION"]

del_taco_count_gdf["per_100k"] = store_counts.div(population.div(100000)).round(2)
del_taco_count_gdf["per_1m"] = store_counts.div(population.div(1_000_000)).round(1)

In [30]:
# Reproject the state polygons to EPSG:9311, the same CRS the store points
# were saved in.
del_taco_count_gdf = del_taco_count_gdf.to_crs(9311)

In [31]:
# Save the per-state counts and densities, with geometry, as a GeoPackage.
del_taco_count_gdf.to_file("data/Del_Taco_Per_State.gpkg")

In [32]:
# Rank the states by stores per 100,000 residents, densest first.
del_taco_count_gdf[["DTs", "per_100k", "per_1m", "STUSPS"]].sort_values(
    "per_100k", ascending=False
)

Unnamed: 0,DTs,per_100k,per_1m,STUSPS
28,44.0,1.37,13.7,NV
22,37.0,1.07,10.7,UT
16,350.0,0.89,8.9,CA
8,10.0,0.51,5.1,ID
35,38.0,0.51,5.1,AZ
12,10.0,0.47,4.7,NM
21,19.0,0.32,3.2,CO
2,9.0,0.22,2.2,OK
46,8.0,0.19,1.9,OR
18,13.0,0.12,1.2,GA
