In [1]:
import os
from collections import defaultdict
import random
import json
import time

In [2]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Location of the state-boundary shapefile, relative to the notebook's working directory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
# Load the shapefile into a GeoDataFrame; the next cell keeps its STUSPS, NAME and
# geometry columns.
states_df = gpd.read_file(file_path)

In [5]:
# Keep only the state abbreviation, the full state name and the boundary geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Data

In [6]:
# Single cloudscraper session shared by every request this notebook makes
# (the store index fetch and each get_store_dicts call).
scraper = cloudscraper.create_scraper()

In [7]:
# Browser-like request headers that get_store_dicts sends with every store-page request.
# NOTE(review): the sec-ch-ua values advertise Chrome 141 on Windows -- confirm they stay
# consistent with whatever User-Agent the scraper session itself sends.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9,ru;q=0.8,el;q=0.7",
    "cache-control": "no-cache",
    "pragma": "no-cache",
    "priority": "u=0, i",
    "sec-ch-ua": '"Google Chrome";v="141", "Not?A_Brand";v="8", "Chromium";v="141"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    # The store index page is presented as the page the request was navigated from.
    "Referer": "https://www.pepboys.com/stores",
}

In [8]:
def get_store_dicts(url: str) -> list[dict]:
    """Scrape one store-listing page and return a record per coordinate pair on it.

    The page is fetched with the shared ``scraper`` session and ``headers``.
    Coordinates are read from the ``value`` attribute of the ``<input>``
    elements whose ids are ``storeMapLatitude`` and ``storeMapLongitude``.

    Args:
        url: Page URL shaped like ``https://www.pepboys.com/stores/<state>/<city>``.

    Returns:
        A list of dicts, one per latitude/longitude pair, each holding
        ``geometry`` (a shapely ``Point`` built as ``(lon, lat)``), ``STUSPS``
        (the upper-cased second-to-last path segment of ``url``) and ``URL``
        (``url`` unchanged).

    Raises:
        requests.HTTPError: If the response carries a 4xx/5xx status.
        ValueError: If no coordinates are found, if the numbers of latitude
            and longitude inputs differ, or if a value is not a valid float.
    """
    r = scraper.get(url, headers=headers)
    # Report the real HTTP failure instead of a misleading "no lats/lon" error
    # when the request was blocked or the page does not exist.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    lats = [
        float(tag.attrs["value"])
        for tag in soup.find_all("input", {"id": "storeMapLatitude"})
    ]
    lons = [
        float(tag.attrs["value"])
        for tag in soup.find_all("input", {"id": "storeMapLongitude"})
    ]

    if len(lats) != len(lons) or len(lats) == 0:
        raise ValueError("Lat/Lon length doesn't equal or no lats/lon")

    # Same for every store on the page, so compute it once. The rstrip keeps a
    # trailing slash from shifting the state segment out of position -2.
    state_code = url.rstrip("/").split("/")[-2].upper()

    return [
        # shapely points are (x, y), i.e. (longitude, latitude)
        {"geometry": Point(lon, lat), "STUSPS": state_code, "URL": url}
        for lat, lon in zip(lats, lons)
    ]

In [9]:
# Fetch the store index page and build the absolute URL of every listing link on it.
r = scraper.get("https://www.pepboys.com/stores")
# Stop here on a blocked or failed request instead of parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "html.parser")
lis = soup.find_all("li", {"class": "store-locator__home-list--link"})
store_urls = ["https://www.pepboys.com" + li.find("a").attrs["href"] for li in lis]

# An empty list would make every later scraping step a silent no-op, so fail loudly.
if not store_urls:
    raise ValueError("No store links found on https://www.pepboys.com/stores")

In [10]:
# Bucket the store URLs by state. Splitting
# "https://www.pepboys.com/stores/<state>/<city>" on "/" leaves the state at index 4.
grouped = defaultdict(list)
for url in store_urls:
    state = url.split("/")[4].upper()
    grouped[state].append(url)

# Convert to a plain dict (a defaultdict would create a key on any missing-state lookup).
states_group = dict(grouped)

In [11]:
# Parse all locations in a state and cache them as data/states/<STATE>.gpkg.
# A state is written only when every one of its pages was scraped successfully;
# any failure stops the run, and re-running the cell resumes at the first state
# that still has no file.
os.makedirs("data/states", exist_ok=True)

# The loop variable is deliberately NOT called `store_urls`: that name holds the
# full URL list built earlier and must survive this cell for re-runs to work.
for state_code, state_store_urls in tqdm(states_group.items(), desc="Parsing States"):
    state_file = f"data/states/{state_code}.gpkg"

    # If File exists continue
    if os.path.isfile(state_file):
        continue

    state_store_list = []
    state_complete = True

    for store_url in tqdm(state_store_urls, desc=f"Parsing Stores in {state_code}"):
        try:
            store_dicts = get_store_dicts(store_url)
        except Exception as e:
            print("Error getting store url", store_url, str(e))
            state_complete = False
            break
        state_store_list.extend(store_dicts)
        # Random 1-2 s pause between page requests
        time.sleep(random.uniform(1, 2))

    # Never persist a partial state: the file-exists check above would treat it
    # as finished on every later run and the missing stores would be lost.
    if not (state_complete and state_store_list):
        break

    state_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
    state_gdf.to_file(state_file)

Parsing States:   0%|          | 0/37 [00:00<?, ?it/s]

Parsing Stores in AL:   0%|          | 0/24 [00:00<?, ?it/s]

Error getting store url https://www.pepboys.com/stores/al/athens Lat/Lon length doesn't equal or no lats/lon


In [69]:
# Read every per-state GeoPackage produced by the scraping loop into a list of frames.
store_gdfs = []
states_path = "data/states"
# Sorted so the row order of the combined table does not depend on the order in
# which the operating system happens to list the directory.
for file in sorted(os.listdir(states_path)):
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    store_gdfs.append(gpd.read_file(constructed_path))

In [70]:
# Stack the per-state frames into a single table with a fresh 0..n-1 index.
pb_gdf = pd.concat(objs=store_gdfs, ignore_index=True)

In [71]:
# Preview the first five rows of the combined store table.
pb_gdf.head(n=5)

Unnamed: 0,STUSPS,URL,geometry
0,TN,https://www.pepboys.com/stores/tn/bartlett,POINT (-89.86336 35.20602)
1,TN,https://www.pepboys.com/stores/tn/chattanooga,POINT (-85.15337 35.03302)
2,TN,https://www.pepboys.com/stores/tn/collierville,POINT (-89.69952 35.04693)
3,TN,https://www.pepboys.com/stores/tn/farragut,POINT (-84.14454 35.89301)
4,TN,https://www.pepboys.com/stores/tn/knoxville,POINT (-83.88002 36.02876)


In [72]:
# Reproject the store points to EPSG:9311, then save the combined layer.
pb_gdf = pb_gdf.to_crs(epsg=9311)
pb_gdf.to_file("data/stores.gpkg")

## Get Population Data

In [73]:
# sheet_name=None loads every sheet of the workbook into a dict keyed by sheet name;
# the next cell picks the "NST-EST2024-POP" sheet out of it.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [74]:
# Extract the state-name and population columns from the "NST-EST2024-POP" sheet.
# Presumably the sheet's title row was read as the header, which is why the name
# column carries this long descriptive label -- TODO confirm against the workbook.
raw_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): confirm which estimate year "Unnamed: 5" holds in this workbook.
raw_population_column = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2024-POP"][[raw_name_column, raw_population_column]]
    .rename(columns={raw_name_column: "NAME", raw_population_column: "POPULATION"})
    # Strip only the leading dots that mark sub-parts. Slicing off the first
    # character unconditionally would also corrupt names that have no dot prefix,
    # so those rows could never match a state name in a later merge.
    .assign(NAME=lambda frame: frame["NAME"].str.lstrip("."))
)

In [75]:
# Left-join the population figures onto the state boundaries by state name, so
# every state row is kept, then fix the column order.
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

In [76]:
# Discard every row that has a missing value in any column, e.g. a boundary row
# for which the left join found no population figure.
states_with_population_df = states_with_population_df.dropna(axis=0, how="any")

## Merge Data

In [77]:
# Count the stores in each state: one row per STUSPS value, with the count in "PBs".
pb_count_df = pb_gdf.groupby("STUSPS").size().reset_index(name="PBs")

In [78]:
# Attach the store counts to every state row. States missing from pb_count_df
# come out of the left join as NaN, which fillna(0) turns into 0.
# Note that fillna(0) is applied to the whole frame, not only to "PBs".
pb_count_gdf = states_with_population_df.merge(
    pb_count_df, how="left", on="STUSPS"
).fillna(0)

In [79]:
# Store density per state: stores per 100,000 residents (2 decimals) and
# per 1,000,000 residents (1 decimal).
store_counts = pb_count_gdf["PBs"]
residents = pb_count_gdf["POPULATION"]

pb_count_gdf["per_100k"] = (store_counts / (residents / 100000)).round(2)
pb_count_gdf["per_1m"] = (store_counts / (residents / 1_000_000)).round(1)

In [80]:
# Reproject the state polygons to EPSG:9311, the CRS the store points are also saved in.
pb_count_gdf = pb_count_gdf.to_crs(epsg=9311)

In [81]:
# Write the per-state table (geometry, population, store count and per-capita
# columns) to a GeoPackage file.
pb_count_gdf.to_file("data/Pep_Boys_Locations_Per_State.gpkg")

In [82]:
# Rank the states by store count, largest first, showing only the summary columns.
summary_columns = ["PBs", "per_100k", "per_1m", "STUSPS"]
pb_count_gdf[summary_columns].sort_values(by="PBs", ascending=False)

Unnamed: 0,PBs,per_100k,per_1m,STUSPS
23,16.0,0.22,2.2,TN
0,0.0,0.0,0.0,MS
2,0.0,0.0,0.0,OK
1,0.0,0.0,0.0,NC
4,0.0,0.0,0.0,WV
5,0.0,0.0,0.0,LA
6,0.0,0.0,0.0,MI
7,0.0,0.0,0.0,MA
8,0.0,0.0,0.0,ID
9,0.0,0.0,0.0,FL
