In [1]:
import json
import os
import random
import time

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Load the state boundary shapefile into a GeoDataFrame.
# Presumably the 2018 Census cartographic boundary file, judging by the file
# name — TODO confirm the data source.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [5]:
# Keep only what later cells use: STUSPS (state code, join key for the store
# counts), NAME (join key for the population table) and the geometry.
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Get Population Data

In [6]:
# sheet_name=None makes read_excel return every sheet of the workbook in a
# mapping keyed by sheet name, rather than a single DataFrame.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [7]:
# The first column of the sheet carries the table's title text as its header
# and the remaining columns come through as "Unnamed: N" (presumably because
# the workbook's title rows are not skipped when reading — TODO confirm).
# NOTE(review): confirm that "Unnamed: 5" is the intended estimate year.
population_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"

state_populations_df = (
    state_populations["NST-EST2024-POP"][[population_name_column, "Unnamed: 5"]]
    .rename(columns={population_name_column: "NAME", "Unnamed: 5": "POPULATION"})
    # Remove only the leading dots that mark sub-parts. Slicing off the first
    # character unconditionally would also truncate names that have no dot,
    # and those rows could then never match on NAME in the merge.
    .assign(NAME=lambda df: df["NAME"].str.lstrip("."))
)

In [8]:
# Attach the population to every state (left join on the state name keeps
# states that have no population row), then fix the column order.
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get Guitar Center Data

In [9]:
# Single scraper object reused for every page download in the cells below.
scraper = cloudscraper.create_scraper()

In [10]:
def get_coords(soup) -> "Point":
    """Extract a store's location from the JSON-LD block of its page.

    Reads the first ``<script type="application/ld+json">`` element, takes the
    first entry of the JSON array it contains and converts its
    ``geo.latitude`` / ``geo.longitude`` values into a point.

    Args:
        soup: Parsed page (anything exposing a BeautifulSoup-style ``find``).

    Returns:
        ``Point(longitude, latitude)`` — x is the longitude, y the latitude.

    Raises:
        ValueError: If the page has no ``application/ld+json`` script, or a
            coordinate cannot be converted to ``float``.
        KeyError: If the first JSON-LD entry has no ``geo`` data.
    """
    script = soup.find("script", {"type": "application/ld+json"})
    if script is None:
        # Fail with a clear message instead of an AttributeError on None.text.
        raise ValueError("page has no application/ld+json script")

    store_json = json.loads(script.text)[0]
    geo = store_json["geo"]
    lat = float(geo["latitude"])
    lon = float(geo["longitude"])
    return Point(lon, lat)

In [27]:
url_root = "https://stores.guitarcenter.com"
states_dir = "data/states"

# Each state's result is cached as <states_dir>/<STATE>.gpkg; make sure the
# directory exists so the first write on a fresh checkout does not fail.
os.makedirs(states_dir, exist_ok=True)


def _fetch_soup(url):
    """Download ``url`` through the shared scraper and parse it as HTML."""
    response = scraper.get(url)
    return BeautifulSoup(response.text, "html.parser")


def _throttle():
    """Sleep for a short random interval between store-page requests."""
    time.sleep(random.uniform(0.01, 0.5))


# Build the URL with "/" directly: os.path.join uses the OS path separator,
# which produces a backslash (and a broken URL) on Windows.
soup = _fetch_soup(f"{url_root}/browse")
# The first and last matching anchors are dropped; presumably they are not
# state links — TODO confirm against the page markup.
state_as = soup.find_all("a", {"class": "ga-link"})[1:-1]

for state_a in tqdm(state_as, desc="Parsing States"):
    state_url = state_a.attrs["href"]
    # The state code is the last path segment (".../az/" -> "AZ"). Stripping
    # the trailing slash first keeps this correct if an href lacks one.
    state_code = state_url.rstrip("/").split("/")[-1].upper()
    state_file = f"{states_dir}/{state_code}.gpkg"

    # Resume support: a state whose file already exists is not scraped again.
    if os.path.isfile(state_file):
        continue

    state_store_list = []
    soup_state = _fetch_soup(state_url)
    location_divs = soup_state.find_all("div", {"class": "map-list-item is-single"})

    for location_div in tqdm(location_divs, desc=f"Parsing stores in {state_code}"):
        store_url = location_div.find("a").attrs["href"]
        soup_store = _fetch_soup(store_url)
        try:
            point = get_coords(soup_store)
            state_store_list.append({"STUSPS": state_code, "geometry": point})
        except Exception as e:
            # This page carries no usable coordinates. Log it, then follow its
            # "more-details" links and read the coordinates from those pages.
            print(e, store_url)
            store_as = soup_store.find_all("a", {"class": "more-details ga-link"})
            for store_a in store_as:
                _throttle()
                store_url = store_a.attrs["href"]
                # Deliberately not guarded: a failure here aborts the cell
                # before this state's file is written, so the whole state is
                # retried on the next run instead of being saved incomplete.
                point = get_coords(_fetch_soup(store_url))
                state_store_list.append({"STUSPS": state_code, "geometry": point})

        _throttle()

    # Nothing is written for a state without stores, so it is re-checked on
    # every run.
    if state_store_list:
        state_guitar_centers_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_guitar_centers_gdf.to_file(state_file)

Parsing States:   0%|          | 0/47 [00:00<?, ?it/s]

Parsing stores in AZ:   0%|          | 0/6 [00:00<?, ?it/s]

'geo' https://stores.guitarcenter.com/az/tucson/


Parsing stores in FL:   0%|          | 0/21 [00:00<?, ?it/s]

'geo' https://stores.guitarcenter.com/fl/jacksonville/


Parsing stores in NV:   0%|          | 0/2 [00:00<?, ?it/s]

'geo' https://stores.guitarcenter.com/nv/las-vegas/


Parsing stores in TX:   0%|          | 0/31 [00:00<?, ?it/s]

'geo' https://stores.guitarcenter.com/tx/austin/
'geo' https://stores.guitarcenter.com/tx/fort-worth/
'geo' https://stores.guitarcenter.com/tx/houston/


#### If the cell above breaks, just keep re-running it until it completes. Then run the cell below.

In [28]:
# Read back every per-state GeoPackage found in the states directory.
state_guitar_centers_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined output reproducible between runs and machines.
for file in sorted(os.listdir(states_path)):
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_guitar_centers_gdfs.append(gpd.read_file(constructed_path))

In [29]:
# Stack the per-state frames into one table with a fresh 0..n-1 index, then
# wrap the result as a GeoDataFrame.
all_stores = pd.concat(state_guitar_centers_gdfs, ignore_index=True)
state_guitar_centers_gdf = gpd.GeoDataFrame(all_stores)

In [30]:
# Reproject the store points to EPSG:9311 and save them as one GeoPackage.
state_guitar_centers_gdf = state_guitar_centers_gdf.to_crs(epsg=9311)
state_guitar_centers_gdf.to_file("data/guitar_centers.gpkg")

## Combine With States

In [31]:
# Number of stores per state code, as a two-column frame: STUSPS, stores.
stores_per_state = state_guitar_centers_gdf.groupby("STUSPS").size()
guitar_centers_counts_df = stores_per_state.reset_index(name="stores")

In [32]:
# Left join on the state code: every state row is kept, and states without
# any scraped store get a missing value in "stores".
guitar_centers_per_state_gdf = states_with_population_df.merge(
    guitar_centers_counts_df, on="STUSPS", how="left"
)

In [33]:
# A missing store count means "no stores", so only that column is filled with
# 0 and cast to int. Filling the whole frame would also turn a missing
# POPULATION into 0, and the per-capita ratios would then divide by zero.
# Rows without a population keep their missing value instead.
guitar_centers_per_state_gdf = guitar_centers_per_state_gdf.assign(
    stores=guitar_centers_per_state_gdf["stores"].fillna(0).astype(int)
)

In [34]:
# Stores per N residents, one column per group size.
store_counts = guitar_centers_per_state_gdf["stores"]
population = guitar_centers_per_state_gdf["POPULATION"]
per_capita_columns = (
    ("per_1000", 1000),
    ("per_10k", 10_000),
    ("per_100k", 100000),
    ("per_500k", 500_000),
    ("per_1m", 1_000_000),
)
for column_name, group_size in per_capita_columns:
    guitar_centers_per_state_gdf[column_name] = store_counts / (
        population / group_size
    )

In [35]:
# Drop every row that still has a missing value in any column.
guitar_centers_per_state_gdf = guitar_centers_per_state_gdf.dropna()

In [36]:
# Reproject the state polygons to EPSG:9311 (the same CRS code the store
# points are saved in) and write the per-state table as a GeoPackage.
guitar_centers_per_state_gdf = guitar_centers_per_state_gdf.to_crs(epsg=9311)
guitar_centers_per_state_gdf.to_file("data/guitar_centers_per_state.gpkg")