In [1]:
import os
import re
import random
import time

In [2]:
from urllib.parse import urljoin

In [3]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [4]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [5]:
# Census cartographic-boundary shapefile with one row per state/territory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [6]:
# Keep only the postal code, the full name and the boundary geometry.
states_df = states_df.loc[:, ["STUSPS", "NAME", "geometry"]]

## Get Chipotle Data

In [7]:
# Single shared scraper object; every HTTP request in this notebook goes
# through `scraper.get(...)`.
scraper = cloudscraper.create_scraper()

In [8]:
def get_coords(store_url: str) -> Point:
    """Fetch a store page and return its location as ``Point(lon, lat)``.

    The coordinates are read from the page's ``<meta itemprop="latitude">``
    and ``<meta itemprop="longitude">`` tags.

    Args:
        store_url: Absolute URL of a single store page.

    Returns:
        A shapely ``Point`` built as ``Point(longitude, latitude)``.

    Raises:
        ValueError: If either meta tag (or its ``content`` attribute) is
            missing, or the content cannot be parsed as a float.
        Exception: Anything raised by ``scraper.get`` propagates unchanged.
    """
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")

    coords = {}
    for axis in ("latitude", "longitude"):
        tag = soup.find("meta", {"itemprop": axis})
        content = tag.get("content") if tag is not None else None
        if content is None:
            # Name the page and the HTTP status instead of failing with an
            # opaque "'NoneType' object has no attribute 'attrs'".
            raise ValueError(
                f"No {axis} <meta> tag found on {store_url} "
                f"(HTTP {r.status_code})"
            )
        coords[axis] = float(content)

    # shapely points are (x, y), i.e. (longitude, latitude).
    return Point(coords["longitude"], coords["latitude"])

In [10]:
url = "https://locations.chipotle.com"

# The per-state output files are written here; create it up front so the
# first `to_file` call cannot fail on a fresh checkout.
os.makedirs("data/states", exist_ok=True)

r = scraper.get(url)
soup = BeautifulSoup(r.text, "html.parser")
state_as = soup.find_all("a", {"class": "Directory-listLink"})

# Parse all locations in a state. One .gpkg file is written per state, so an
# interrupted run can be resumed without re-fetching finished states.
for state_a in tqdm(state_as, desc="Parsing States"):
    state_store_list = []
    state_code = state_a.attrs["href"].split("/")[0]
    # urljoin, not os.path.join: URL separators are always "/", whereas
    # os.path.join uses the platform's filesystem separator.
    state_url = urljoin(url, state_code)

    # If the state's file already exists, it was scraped on an earlier run.
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    r_state = scraper.get(state_url)

    if r_state.status_code != 200:
        print(state_code)
        continue

    state_soup = BeautifulSoup(r_state.text, "html.parser")
    city_as = state_soup.find_all("a", {"class": "Directory-listLink"})
    # Only the first two path segments of each link are kept (state/city).
    city_urls = [
        urljoin(url, "/".join(city_a.attrs["href"].split("/")[:2]))
        for city_a in city_as
    ]

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_code}"):
        r_city = scraper.get(city_url)
        # Same handling as for state pages: report the failure and move on.
        if r_city.status_code != 200:
            print(city_url)
            continue

        city_soup = BeautifulSoup(r_city.text, "html.parser")
        store_as = city_soup.find_all("a", {"class": "Teaser-titleLink"})
        store_urls = [store_a.attrs["href"].replace("..", url) for store_a in store_as]
        for store_url in store_urls:
            try:
                point = get_coords(store_url)
            except Exception as e:
                # Report and skip this store. Appending here would either
                # duplicate the previous store's point or raise NameError
                # when the very first lookup fails.
                print(store_url, e)
            else:
                state_store_list.append({"STUSPS": state_code, "geometry": point})
            # Random pause between store requests, successful or not.
            time.sleep(random.uniform(0.01, 0.5))

    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/49 [00:00<?, ?it/s]

Parsing Locations in nd:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in sd:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in dc:   0%|          | 0/1 [00:00<?, ?it/s]

## Manually add one Florida store that the scrape did not pick up

In [22]:
# Look up the one Florida store separately and wrap it in a single-row
# GeoDataFrame with the same columns and CRS as the per-state files.
point = get_coords(
    "https://locations.chipotle.com/fl/lutz/25442-sierra-center-blvd"
)
store_fl_gdf = gpd.GeoDataFrame(
    data=[{"geometry": point, "STUSPS": "fl"}],
    crs=4326,
)

In [23]:
store_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined frame (and of the files written from it) reproducible.
for file_name in sorted(os.listdir(states_path)):
    # Skip anything in the directory that is not a per-state .gpkg file.
    if file_name.endswith(".gpkg"):
        store_gdfs.append(gpd.read_file(os.path.join(states_path, file_name)))

In [24]:
# Stack every per-state frame plus the manually added Florida store into one
# frame with a fresh 0..n-1 index.
chipotle_gdf = pd.concat(store_gdfs + [store_fl_gdf], ignore_index=True)

In [25]:
# Display the combined store frame (one row per store: state code + point).
chipotle_gdf

Unnamed: 0,STUSPS,geometry
0,al,POINT (-86.94589 34.78536)
1,al,POINT (-85.4452 32.62501)
2,al,POINT (-85.48733 32.60681)
3,al,POINT (-86.80276 33.50972)
4,al,POINT (-86.64744 33.59558)
...,...,...
3821,wv,POINT (-80.55162 40.39881)
3822,wv,POINT (-80.1967 39.42415)
3823,wy,POINT (-104.8057 41.16072)
3824,wy,POINT (-105.55944 41.30939)


In [26]:
# Reproject the store points from EPSG:4326 to EPSG:9311 (the same CRS the
# state polygons are converted to later) and save them.
chipotle_gdf = chipotle_gdf.to_crs(epsg=9311)
chipotle_gdf.to_file("data/stores.gpkg")

## Get Population Data

In [27]:
# sheet_name=None loads every sheet; the result is a dict keyed by sheet name.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    sheet_name=None,
    engine="openpyxl",
)

In [28]:
# Spreadsheet column -> tidy name. The sheet has no clean header row, so the
# name column carries the table description as its label and the population
# column is an auto-generated "Unnamed" one.
population_columns = {
    "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
    "Unnamed: 5": "POPULATION",
}
state_populations_df = (
    state_populations["NST-EST2024-POP"][list(population_columns)]
    .rename(columns=population_columns)
    # Drop the first character of every name (the leading dot on sub-part rows).
    # NOTE(review): this also truncates rows that have no leading dot; those
    # simply fail to match a state name in the later merge — confirm intended.
    .assign(NAME=lambda frame: frame["NAME"].str[1:])
)

In [29]:
# Left-join populations onto the state polygons by full state name, then fix
# the column order.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

In [30]:
# Discard rows with any missing value, i.e. shapes that found no population
# match in the left join above.
states_with_population_df = states_with_population_df.dropna(
    axis="index", how="any"
)

## Merge Data

In [53]:
# Number of stores per state code, as a two-column frame: STUSPS, Chipotles.
chipotle_counts_df = (
    chipotle_gdf.groupby("STUSPS").size().reset_index(name="Chipotles")
)

In [54]:
# The scraped codes are lower-case ("al"); upper-case them so they match the
# STUSPS values used by the state frame in the merge.
chipotle_counts_df = chipotle_counts_df.assign(
    STUSPS=chipotle_counts_df["STUSPS"].str.upper()
)

In [55]:
# Attach store counts to every state; states without any scraped store get
# NaN from the left join, which is then replaced by 0.
chipotle_counts_gdf = states_with_population_df.merge(
    chipotle_counts_df,
    how="left",
    on="STUSPS",
).fillna(0)

In [56]:
# Stores per N residents: (output column, N, decimals kept).
for rate_column, residents, n_decimals in (
    ("per_100k", 100000, 2),
    ("per_1m", 1_000_000, 1),
):
    chipotle_counts_gdf[rate_column] = (
        chipotle_counts_gdf["Chipotles"]
        / (chipotle_counts_gdf["POPULATION"] / residents)
    ).round(decimals=n_decimals)

In [57]:
# Reproject the state polygons to EPSG:9311, the CRS used for the store points.
chipotle_counts_gdf = chipotle_counts_gdf.to_crs(epsg=9311)

In [58]:
# Persist the per-state counts and rates together with the state geometry.
chipotle_counts_gdf.to_file(filename="data/Chipotle_Per_State.gpkg")

In [59]:
# Rank states by stores per 100k residents, densest first.
rate_table = chipotle_counts_gdf[["per_100k", "per_1m", "STUSPS"]]
rate_table.sort_values(by="per_100k", ascending=False)

Unnamed: 0,per_100k,per_1m,STUSPS
36,3.06,30.6,DC
44,2.23,22.3,OH
38,1.95,19.5,MD
3,1.6,16.0,VA
21,1.53,15.3,CO
50,1.51,15.1,MN
29,1.42,14.2,IL
35,1.42,14.2,AZ
28,1.37,13.7,NV
42,1.36,13.6,RI
