In [1]:
import os
import re
import random
import json
import time

In [2]:
from urllib.parse import urljoin, unquote

In [3]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [4]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [5]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [6]:
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Data

In [7]:
scraper = cloudscraper.create_scraper()

In [13]:
def get_store_dict(store_url: str) -> dict:
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")
    text = soup.find("script", {"type": "module"}).text
    match = re.search(r'decodeURIComponent\("([^"]+)"\)', text)
    if match:
        encoded = match.group(1)
    else:
        raise ValueError("Could not find encoded JSON")
    decoded = json.loads(unquote(encoded))
    coordinate = decoded["document"]["yextDisplayCoordinate"]
    point = Point(float(coordinate["longitude"]), float(coordinate["latitude"]))
    site_id = decoded["document"]["siteId"]
    STUSPS = decoded["document"]["address"]["region"]
    return {"geometry": point, "STUSPS": STUSPS, "ID": site_id, "URL": store_url}

In [14]:
def get_stores(url: str) -> list:
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    divs = soup.find_all("div", {"class": "locator-card-actions"})
    return [
        div.find("a").attrs["href"].replace("..", "https://locations.raisingcanes.com/")
        for div in divs
    ]

In [15]:
def get_urls(url: str) -> list:
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    lis = soup.find_all("li", {"class": "p-3"})
    urls = [directory_url + li.find("a").attrs["href"] for li in lis]
    return urls

In [16]:
directory_url = "https://locations.raisingcanes.com/"
state_urls = get_urls(directory_url)

# Parse all locations in a state
for state_url in tqdm(state_urls, desc="Parsing States"):
    state_store_list = []
    state_code = state_url.split("/")[-1].upper()

    # If File exists continue
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    city_urls = get_urls(state_url)

    if not city_urls:
        print(state_code)
        continue

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_code}"):
        store_urls = get_stores(city_url)
        if not store_urls:
            print("Error getting city url", city_url)
            continue

        for store_url in store_urls:
            try:
                store_dict = get_store_dict(store_url)
            except Exception as e:
                print("Error getting store url", store_url, str(e))
                continue
            state_store_list.append(store_dict)
            time.sleep(random.uniform(0.01, 0.5))

    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/46 [00:00<?, ?it/s]

Parsing Locations in AL:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in AK:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in AZ:   0%|          | 0/19 [00:00<?, ?it/s]

Parsing Locations in AR:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in CA:   0%|          | 0/106 [00:00<?, ?it/s]

Parsing Locations in CO:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations in CT:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations in DE:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in DC:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in FL:   0%|          | 0/22 [00:00<?, ?it/s]

Error getting city url https://locations.raisingcanes.com/undefined


Parsing Locations in GA:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations in GU:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in HI:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in ID:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in IL:   0%|          | 0/38 [00:00<?, ?it/s]

Parsing Locations in IN:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations in IA:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations in KS:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations in KY:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations in LA:   0%|          | 0/36 [00:00<?, ?it/s]

Parsing Locations in MD:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations in MA:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations in MI:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in MN:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations in MS:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in MO:   0%|          | 0/21 [00:00<?, ?it/s]

Parsing Locations in NE:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations in NV:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations in NH:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in NJ:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in NM:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in NY:   0%|          | 0/5 [00:00<?, ?it/s]

Error getting city url https://locations.raisingcanes.com/undefined


Parsing Locations in NC:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in OH:   0%|          | 0/49 [00:00<?, ?it/s]

Parsing Locations in OK:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations in OR:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in PA:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in RI:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in SC:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in TN:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations in TX:   0%|          | 0/113 [00:00<?, ?it/s]

Parsing Locations in UT:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations in VA:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in WA:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in WI:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in WV:   0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
store_gdfs = []
states_path = "data/states"
for file in os.listdir("data/states"):
    constructed_path = os.path.join(states_path, file)
    if constructed_path.endswith(".gpkg"):
        rc_gdf = gpd.read_file(constructed_path)
        store_gdfs.append(rc_gdf)

In [37]:
rc_gdf = pd.concat(store_gdfs, ignore_index=True)

In [38]:
rc_gdf.head()

Unnamed: 0,STUSPS,ID,URL,geometry
0,AK,129039,https://locations.raisingcanes.com//ak/anchora...,POINT (-149.8663 61.145)
1,AK,129039,https://locations.raisingcanes.com//ak/anchora...,POINT (-149.74105 61.22994)
2,AK,129039,https://locations.raisingcanes.com//ak/wasilla...,POINT (-149.41559 61.57815)
3,AL,129039,https://locations.raisingcanes.com//al/mobile/...,POINT (-88.12526 30.6779)
4,AL,129039,https://locations.raisingcanes.com//al/tuscalo...,POINT (-87.54641 33.21641)


In [39]:
rc_gdf = rc_gdf.to_crs(9311)
rc_gdf.to_file(f"data/stores.gpkg")

## Get Population Data

In [40]:
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [41]:
state_populations_df = state_populations["NST-EST2024-POP"][
    [
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)",
        "Unnamed: 5",
    ]
]
state_populations_df = state_populations_df.rename(
    columns={
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

#### Manually Add Guam

In [42]:
state_populations_df.loc[-1] = ["Guam", 167777]
state_populations_df.index = state_populations_df.index + 1
state_populations_df = state_populations_df.sort_index()

In [43]:
states_with_population_df = states_df.merge(state_populations_df, on="NAME", how="left")
states_with_population_df = states_with_population_df[
    ["STUSPS", "NAME", "POPULATION", "geometry"]
]

In [44]:
states_with_population_df = states_with_population_df.dropna()

## Merge Data

In [45]:
rc_count_df = pd.DataFrame(
    rc_gdf.groupby("STUSPS").size(), columns=["RCs"]
).reset_index()

In [46]:
rc_count_gdf = states_with_population_df.merge(rc_count_df, on="STUSPS", how="left")
rc_count_gdf = rc_count_gdf.fillna(0)

In [47]:
rc_count_gdf["per_100k"] = (
    rc_count_gdf["RCs"] / (rc_count_gdf["POPULATION"] / 100000)
).round(decimals=2)
rc_count_gdf["per_1m"] = (
    rc_count_gdf["RCs"] / (rc_count_gdf["POPULATION"] / 1_000_000)
).round(decimals=1)

In [48]:
rc_count_gdf = rc_count_gdf.to_crs(9311)

In [49]:
rc_count_gdf.to_file("data/Raising_Canes_Per_Capita.gpkg")

In [50]:
rc_count_gdf[["RCs", "per_100k", "per_1m", "STUSPS"]].sort_values(
    "per_100k", ascending=False
)

Unnamed: 0,RCs,per_100k,per_1m,STUSPS
5,63.0,1.37,13.7,LA
10,16.0,0.8,8.0,NE
28,23.0,0.72,7.2,NV
15,200.0,0.65,6.5,TX
40,9.0,0.62,6.2,HI
42,1.0,0.6,6.0,GU
35,44.0,0.59,5.9,AZ
2,22.0,0.54,5.4,OK
45,64.0,0.54,5.4,OH
0,14.0,0.48,4.8,MS
