In [2]:
import json
import os
import random
import time

In [3]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [4]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [5]:
# State boundary shapefile (the 2018 / 500k figures come from the filename).
file_path = (
    "data/cb_2018_us_state_500k/"
    "cb_2018_us_state_500k.shp"
)
# One row per feature in the shapefile, read into a GeoDataFrame.
states_df = gpd.read_file(file_path)

In [6]:
# Keep only the postal code, the full state name and the boundary geometry;
# the later merges key on STUSPS and NAME.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [7]:
# Population workbook; sheet_name=None makes pandas return a dict that maps
# every sheet name to its own DataFrame.
population_workbook_path = "data/NST-EST2024-POP.xlsx"
state_populations = pd.read_excel(
    population_workbook_path,
    sheet_name=None,
    engine="openpyxl",
)

In [8]:
# Column labels exactly as pandas parsed them from the sheet: the first column's
# label is the table's descriptive title, the rest are auto-named "Unnamed: N".
raw_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): confirm that "Unnamed: 5" is the estimate year you intend to use;
# the position-based label gives no indication of which year it holds.
raw_population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [raw_name_column, raw_population_column]
].rename(
    columns={
        raw_name_column: "NAME",
        raw_population_column: "POPULATION",
    }
)

# Per the sheet title, a leading dot marks a sub-part row (e.g. ".Alabama").
# Strip only those dots: slicing off the first character unconditionally would
# also mangle any name that has no dot, so it could never match on NAME later.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [9]:
# Left join keeps every state geometry even when no population row matches on
# NAME (POPULATION is NaN for those rows), then fixes the column order.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[population_columns]

## Get Baskin Robbins Data

In [43]:
def get_coords(url: str) -> "Point | None":
    """Fetch a store page and return its location as a ``Point(lon, lat)``.

    The page's first ``<script>`` tag is expected to hold a JavaScript
    assignment whose right-hand side is a JSON object with a ``"document"``
    entry. ``document["geocodedCoordinate"]`` is preferred; when it is missing
    or unusable, ``document["yextDisplayCoordinate"]`` is used instead.

    Stores whose URL slug starts with ``123-dunkin-st`` are never returned from
    the fallback coordinate: they are logged and skipped.

    Args:
        url: Absolute URL of a single store page.

    Returns:
        A ``Point`` built as ``(longitude, latitude)``, or ``None`` when the
        request did not return HTTP 200, the page could not be parsed, no usable
        coordinate was found, or the store was skipped. Every failure path
        returns ``None`` (never ``{}``), so callers can test ``if point``.
    """
    r = scraper.get(url)
    if r.status_code != 200:
        print("Status Code:", r.status_code, url)
        return None

    # Parse the embedded JSON separately from the coordinate lookup so that a
    # page without a usable payload is reported and skipped instead of
    # surfacing as an unbound-variable error further down.
    try:
        soup_store = BeautifulSoup(r.text, "html.parser")
        script_text = soup_store.find_all("script")[0].text
        # Everything after the first "=" is the JSON payload.
        store_json = json.loads(script_text.split("=", 1)[1])
        document = store_json["document"]
    except (IndexError, KeyError, TypeError, ValueError) as e:
        print("Unparseable store page:", repr(e), url)
        return None

    # Preferred coordinate.
    try:
        coord = document["geocodedCoordinate"]
        lat = float(coord["latitude"])
        lon = float(coord["longitude"])
        return Point(lon, lat)
    except (KeyError, TypeError, ValueError) as e:
        # Keep a reference: the name bound by `as` is cleared after the handler.
        primary_error = e

    # Fallback coordinate.
    try:
        coord = document["yextDisplayCoordinate"]
        lat = float(coord["latitude"])
        lon = float(coord["longitude"])
    except (KeyError, TypeError, ValueError) as e:
        print("No usable coordinate:", repr(e), url)
        return None

    # The slug is the last URL segment; its first three dash-separated tokens
    # identify the "123-dunkin-st" entries that must be skipped.
    slug_prefix = "-".join(url.split("/")[-1].split("-")[:3])
    if slug_prefix == "123-dunkin-st":
        print(primary_error, url, f"{coord['latitude']},{coord['longitude']}")
        return None

    return Point(lon, lat)

In [35]:
# Shared HTTP client used for every request in this notebook, including the
# ones made inside get_coords (which reads this module-level name at call time).
scraper = cloudscraper.create_scraper()

In [45]:
# Crawl the store locator: root page -> state pages -> location pages -> store
# pages, writing one GeoPackage of store points per state.
root_url = "https://locations.baskinrobbins.com/"

# BeautifulSoup is asked for these exact class-attribute strings, so they must
# match the site's markup verbatim. The same directory-link class is used on
# both the root page and the state pages.
directory_link_class = "Link Link--primary border-white inline-block w-full after:content-[attr(data-count)] after:ml-2 sm:border-brand-secondary sm:w-auto"
store_card_class = "Link Directorycard block bg-white p-6 border-2 border-brand-gray-700 hover:border-brand-secondary h-full"

# Make sure the output directory exists before the first state is written.
os.makedirs("data/states", exist_ok=True)

r_all_states = scraper.get(root_url)
soup = BeautifulSoup(r_all_states.text, "html.parser")
state_as = soup.find_all("a", {"class": directory_link_class})

for state_a in tqdm(state_as, desc="Parsing States"):
    state_code = state_a.attrs["href"].split("/")[0]
    state_store_list = []

    # Resume support: a state whose file already exists is not scraped again.
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    # root_url already ends with "/", so plain concatenation builds the URL;
    # filesystem path helpers are not meant for URLs.
    state_url = root_url + state_code

    r_state = scraper.get(state_url)
    soup_state = BeautifulSoup(r_state.text, "html.parser")
    location_as = soup_state.find_all("a", {"class": directory_link_class})

    for location_a in tqdm(location_as, desc=f"Parsing Locations in {state_code}"):
        # Keep only the first two path segments of the link.
        location_href = "/".join(location_a.attrs["href"].split("/")[:2])
        location_url = root_url + location_href
        r_location = scraper.get(location_url)
        soup_location = BeautifulSoup(r_location.text, "html.parser")
        store_as = soup_location.find_all("a", {"class": store_card_class})

        for store_a in store_as:
            store_url = store_a.attrs["href"].replace("../", root_url)
            point = get_coords(store_url)
            # Skip stores without a usable point. Appending outside this guard
            # would re-add the previous store's record (or fail outright when
            # the very first store has no point).
            if not point:
                continue
            state_store_list.append({"STUSPS": state_code.upper(), "geometry": point})

        # Small random pause between location pages.
        time.sleep(random.uniform(0.01, 0.5))

    # States that yielded no stores produce no file.
    if state_store_list:
        baskin_robbins_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        baskin_robbins_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/45 [00:00<?, ?it/s]

Parsing Locations in al:   0%|          | 0/14 [00:00<?, ?it/s]

'geocodedCoordinate' https://locations.baskinrobbins.com/al/madison/123-dunkin-st-366109-br 34.7091114,-86.761749


Parsing Locations in ak:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in az:   0%|          | 0/26 [00:00<?, ?it/s]

Parsing Locations in ar:   0%|          | 0/23 [00:00<?, ?it/s]

Parsing Locations in ca:   0%|          | 0/272 [00:00<?, ?it/s]

'geocodedCoordinate' https://locations.baskinrobbins.com/ca/chico/123-dunkin-st-366162-br 39.6237203,-121.8577205
'geocodedCoordinate' https://locations.baskinrobbins.com/ca/menifee/123-dunkin-st-366112-br 33.7441155,-117.1617685
'geocodedCoordinate' https://locations.baskinrobbins.com/ca/oroville/123-dunkin-st-366054-br 39.51747943,-121.58554353
'geocodedCoordinate' https://locations.baskinrobbins.com/ca/simi-valley/123-dunkin-st-366048-br 34.275113,-118.7095413


Parsing Locations in co:   0%|          | 0/22 [00:00<?, ?it/s]

Parsing Locations in ct:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in de:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in fl:   0%|          | 0/82 [00:00<?, ?it/s]

Parsing Locations in ga:   0%|          | 0/48 [00:00<?, ?it/s]

Parsing Locations in hi:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in id:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations in il:   0%|          | 0/129 [00:00<?, ?it/s]

Parsing Locations in in:   0%|          | 0/22 [00:00<?, ?it/s]

Parsing Locations in ia:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations in ks:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations in ky:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations in la:   0%|          | 0/22 [00:00<?, ?it/s]

Parsing Locations in md:   0%|          | 0/60 [00:00<?, ?it/s]

Parsing Locations in ma:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in mi:   0%|          | 0/44 [00:00<?, ?it/s]

Parsing Locations in mn:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations in ms:   0%|          | 0/32 [00:00<?, ?it/s]

Parsing Locations in mo:   0%|          | 0/22 [00:00<?, ?it/s]

Parsing Locations in mt:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations in ne:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in nv:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations in nj:   0%|          | 0/60 [00:00<?, ?it/s]

Parsing Locations in nm:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations in ny:   0%|          | 0/120 [00:00<?, ?it/s]

Parsing Locations in nc:   0%|          | 0/36 [00:00<?, ?it/s]

Parsing Locations in oh:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Locations in ok:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations in or:   0%|          | 0/17 [00:00<?, ?it/s]

Parsing Locations in pa:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Locations in sc:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations in tn:   0%|          | 0/41 [00:00<?, ?it/s]

Parsing Locations in tx:   0%|          | 0/93 [00:00<?, ?it/s]

'geocodedCoordinate' https://locations.baskinrobbins.com/tx/denton/123-dunkin-st-366052-br 33.20266,-97.12299


Parsing Locations in ut:   0%|          | 0/16 [00:00<?, ?it/s]

Parsing Locations in va:   0%|          | 0/36 [00:00<?, ?it/s]

Parsing Locations in wa:   0%|          | 0/32 [00:00<?, ?it/s]

Parsing Locations in dc:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations in wv:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations in wi:   0%|          | 0/30 [00:00<?, ?it/s]

Parsing Locations in wy:   0%|          | 0/4 [00:00<?, ?it/s]

In [46]:
# Load every per-state GeoPackage written by the scraping step.
baskin_robbins_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and of the file written from it) stable between runs.
for file in sorted(os.listdir(states_path)):
    constructed_path = os.path.join(states_path, file)
    if not constructed_path.endswith(".gpkg"):
        continue
    baskin_robbins_gdf = gpd.read_file(constructed_path)
    baskin_robbins_gdfs.append(baskin_robbins_gdf)

In [47]:
# Stack the per-state frames into one table with a fresh 0..n-1 index, then
# wrap the result as a GeoDataFrame.
stacked_stores = pd.concat(baskin_robbins_gdfs, ignore_index=True)
baskin_robbins_gdf = gpd.GeoDataFrame(stacked_stores)

In [55]:
# Reproject all store points to EPSG:9311, then persist the combined layer.
baskin_robbins_gdf = baskin_robbins_gdf.to_crs(epsg=9311)
baskin_robbins_gdf.to_file("data/baskin_robbins.gpkg")

In [56]:
# Number of stores per state code, as a two-column table
# (STUSPS, BASKIN_ROBBINS) with a default integer index.
baskin_robbins_counts_gdf = (
    baskin_robbins_gdf.groupby("STUSPS")
    .size()
    .reset_index(name="BASKIN_ROBBINS")
)

## Combine With States

In [57]:
# Attach the store counts to the state/population table. The left join keeps
# states without any stores; their BASKIN_ROBBINS value is NaN at this point.
baskin_robbins_per_state_gdf = states_with_population_df.merge(
    right=baskin_robbins_counts_gdf,
    how="left",
    on="STUSPS",
)

In [58]:
# A missing store count means "no stores", so only that column defaults to 0.
# A frame-wide fillna(0) would also turn missing populations into 0 (producing
# 0/0 or x/0 rates downstream) and would be applied to the geometry column too.
baskin_robbins_per_state_gdf["BASKIN_ROBBINS"] = (
    baskin_robbins_per_state_gdf["BASKIN_ROBBINS"].fillna(0).astype(int)
)

In [59]:
# Stores per N residents, one output column per N. Dict order fixes the order
# in which the columns are appended.
per_capita_units = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, people_per_unit in per_capita_units.items():
    baskin_robbins_per_state_gdf[rate_column] = baskin_robbins_per_state_gdf[
        "BASKIN_ROBBINS"
    ] / (baskin_robbins_per_state_gdf["POPULATION"] / people_per_unit)

In [60]:
# Drop every row that still has a missing value in any column.
baskin_robbins_per_state_gdf = baskin_robbins_per_state_gdf.dropna(
    axis="index", how="any"
)

In [61]:
# Reproject the state polygons to EPSG:9311 (the same CRS the store layer is
# saved in) and write the final per-state layer.
baskin_robbins_per_state_gdf = baskin_robbins_per_state_gdf.to_crs(epsg=9311)
baskin_robbins_per_state_gdf.to_file("data/baskin_robbins_per_state.gpkg")