In [1]:
import json
import os
import random
import time

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Location of the state shapefile, relative to the notebook's working directory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
# Read the shapefile into a GeoDataFrame.
states_df = gpd.read_file(file_path)

In [5]:
# Keep only the state abbreviation, the state name and the geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df.loc[:, state_columns]

## Get Population data

In [6]:
# sheet_name=None loads every sheet and returns a dict keyed by sheet name.
population_workbook_path = "data/NST-EST2024-POP.xlsx"
state_populations = pd.read_excel(
    population_workbook_path,
    sheet_name=None,
    engine="openpyxl",
)

In [7]:
# Column A of the sheet is labelled with this long descriptive header; the
# population figures used here come from the column pandas named "Unnamed: 5".
population_name_header = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
population_value_header = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [population_name_header, population_value_header]
].rename(
    columns={
        population_name_header: "NAME",
        population_value_header: "POPULATION",
    }
)

# Remove only the leading dots that mark sub-parts. Slicing off the first
# character unconditionally would also mangle names that have no dot
# (presumably rows such as "Puerto Rico" - TODO confirm against the sheet),
# and those rows could then never match the shapefile's NAME column.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [8]:
# Left join so every shape from states_df is kept even when no population
# row shares its NAME; then fix the column order.
population_join = states_df.merge(state_populations_df, how="left", on="NAME")
output_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = population_join[output_columns]

## Get Macys Data

In [38]:
# Single cloudscraper session shared by get_locations, get_stores and get_coords.
scraper = cloudscraper.create_scraper()

In [39]:
def get_locations(url: str) -> list:
    """Download a listing page and return its location entries.

    Args:
        url: Address of the page to fetch with the shared ``scraper``.

    Returns:
        The ``div`` elements whose class is ``map-list-item is-single``.
        An empty list is returned (after printing a diagnostic line) when
        the response status is not 200 or when no such element is found.
    """
    response = scraper.get(url, headers={"User-Agent": "Mozilla/5.0"})
    if response.status_code != 200:
        print(url, response.status_code, response.reason)
        return []

    page = BeautifulSoup(response.text, "html.parser")
    entries = page.find_all("div", {"class": "map-list-item is-single"})
    if entries:
        return entries

    print("No Divs", url)
    return []

In [40]:
def get_stores(url: str) -> list:
    """Download a location page and return the links to its stores.

    Args:
        url: Address of the page to fetch with the shared ``scraper``.

    Returns:
        The ``a`` elements whose class is ``ga-link fw-600``, or an empty
        list (after printing the status code) when the response status is
        not 200.
    """
    response = scraper.get(url)
    if response.status_code == 200:
        page = BeautifulSoup(response.text, "html.parser")
        return page.find_all("a", {"class": "ga-link fw-600"})

    print("Status Code:", response.status_code, url)
    return []

In [41]:
def get_coords(url: str) -> "Point | None":
    """Download a store page and return the store's position.

    The coordinates are read from the first entry of the page's
    ``application/ld+json`` script, under ``geo.latitude`` and
    ``geo.longitude``.

    Args:
        url: Address of the store page to fetch with the shared ``scraper``.

    Returns:
        ``Point(longitude, latitude)`` on success. ``None`` (after printing
        a diagnostic line) when the response status is not 200, when the
        page has no JSON-LD script, or when that script does not contain
        usable coordinates. ``None`` is used rather than an empty dict so
        the result is either a geometry or a missing value.
    """
    r = scraper.get(url)
    if r.status_code != 200:
        print("Status Code:", r.status_code, url)
        return None

    soup = BeautifulSoup(r.text, "html.parser")
    script = soup.find("script", {"type": "application/ld+json"})
    if script is None:
        print("No JSON-LD script:", url)
        return None

    try:
        macys_store_json = json.loads(script.text)[0]
        geo = macys_store_json["geo"]
        lat = float(geo["latitude"])
        lon = float(geo["longitude"])
    except (ValueError, KeyError, IndexError, TypeError) as error:
        # json.JSONDecodeError is a ValueError; the other types cover a
        # payload whose structure differs from the expected list-of-dicts.
        print("Unparseable store JSON:", url, error)
        return None

    # Point takes (x, y), i.e. longitude first.
    return Point(lon, lat)

In [42]:
store_list = []
url_base = "https://www.macys.com/stores/"

# Each finished state is cached as data/states/<code>.gpkg; make sure the
# directory exists so the first write cannot fail on a fresh checkout.
os.makedirs("data/states", exist_ok=True)

for stusps in tqdm(states_df["STUSPS"], desc="Parsing States"):
    state_store_list = []
    state_code = stusps.lower()
    state_file = f"data/states/{state_code}.gpkg"

    # Already scraped on an earlier run: skip.
    if os.path.isfile(state_file):
        continue

    # Get all locations in a state. url_base already ends with "/", so plain
    # concatenation is used rather than a filesystem path join.
    url = f"{url_base}{state_code}/"
    locations_divs = get_locations(url)

    # Nothing to parse: either no entries came back, or the first entry links
    # to the Guam listing.
    # NOTE(review): presumably the site shows the Guam listing for states
    # without stores - confirm.
    if (
        not locations_divs
        or locations_divs[0].find("a").attrs["href"]
        == "https://www.macys.com/stores/gu/"
    ):
        print(state_code)
        continue

    failed_store_urls = []

    # Parse all locations in a state
    for locations_div in tqdm(
        locations_divs, desc=f"Parsing Locations In {state_code}"
    ):
        location_url = locations_div.find("a").attrs["href"]

        # Multiple stores in locations, parse the stores per location
        for location_card in get_stores(location_url):
            store_url = location_card.attrs["href"]
            point = get_coords(store_url)
            # Random short pause between store requests.
            time.sleep(random.uniform(0.01, 0.5))

            # get_coords does not return a Point when the request fails;
            # such a value must not end up in the geometry column.
            if not isinstance(point, Point):
                failed_store_urls.append(store_url)
                continue

            state_store_list.append({"STATE": state_code, "geometry": point})

    if failed_store_urls:
        # Do not cache an incomplete state: without the file, re-running
        # this cell retries the whole state.
        print(
            f"{state_code}: {len(failed_store_urls)} store page(s) could not be "
            "read; state not saved, re-run to retry"
        )
        continue

    if state_store_list:
        store_list.extend(state_store_list)
        state_macys_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_macys_gdf.to_file(state_file)

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

ms


Parsing Locations In nc:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations In ok:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In va:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations In wv:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In la:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In mi:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations In ma:   0%|          | 0/16 [00:00<?, ?it/s]

Parsing Locations In id:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In fl:   0%|          | 0/29 [00:00<?, ?it/s]

ne


Parsing Locations In wa:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations In nm:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In pr:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In sd:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In tx:   0%|          | 0/27 [00:00<?, ?it/s]

Parsing Locations In ca:   0%|          | 0/75 [00:00<?, ?it/s]

Parsing Locations In al:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In ga:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Locations In pa:   0%|          | 0/18 [00:00<?, ?it/s]

Parsing Locations In mo:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In co:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations In ut:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In tn:   0%|          | 0/3 [00:00<?, ?it/s]

wy


Parsing Locations In ny:   0%|          | 0/28 [00:00<?, ?it/s]

Parsing Locations In ks:   0%|          | 0/2 [00:00<?, ?it/s]

ak


Parsing Locations In nv:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In il:   0%|          | 0/18 [00:00<?, ?it/s]

vt


Parsing Locations In mt:   0%|          | 0/1 [00:00<?, ?it/s]

ia


Parsing Locations In sc:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In nh:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations In az:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In dc:   0%|          | 0/1 [00:00<?, ?it/s]

as
vi


Parsing Locations In nj:   0%|          | 0/23 [00:00<?, ?it/s]

Parsing Locations In md:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations In me:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In hi:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations In de:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In gu:   0%|          | 0/1 [00:00<?, ?it/s]

mp


Parsing Locations In ri:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In ky:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In oh:   0%|          | 0/14 [00:00<?, ?it/s]

Parsing Locations In wi:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations In or:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In nd:   0%|          | 0/1 [00:00<?, ?it/s]

ar


Parsing Locations In in:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations In mn:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations In ct:   0%|          | 0/6 [00:00<?, ?it/s]

#### If the cell above fails partway through, re-run it until it completes (states already saved under `data/states` are skipped). Then run the cells below.

In [43]:
states_path = "data/states"
# Load every per-state GeoPackage found in the cache directory; files with
# any other extension are ignored.
macys_gdfs = [
    gpd.read_file(os.path.join(states_path, file_name))
    for file_name in os.listdir(states_path)
    if file_name.endswith(".gpkg")
]

In [44]:
# Stack the per-state frames into one table with a fresh 0..n-1 index.
combined_stores = pd.concat(macys_gdfs, ignore_index=True)
macys_gdf = gpd.GeoDataFrame(combined_stores)

In [45]:
# Reproject the store points to EPSG:9311, then write them out.
macys_gdf = macys_gdf.to_crs(9311)
# Plain string literal: the path has no placeholders, so no f-string is needed.
macys_gdf.to_file("data/stores.gpkg")

## Combine With States

In [58]:
# Number of store rows per STATE value.
macys_state_counts_series = macys_gdf.groupby("STATE").size()

# Two-column table: upper-cased state code (to match STUSPS) and store count.
state_codes_upper = macys_state_counts_series.index.str.upper()
store_counts = macys_state_counts_series.values
macys_state_counts_df = pd.DataFrame(
    {"STUSPS": state_codes_upper, "stores": store_counts}
)

In [61]:
# Left join on the state abbreviation: states without any scraped store keep
# their row and get a missing "stores" value.
macys_by_states_gdf = states_with_population_df.merge(
    right=macys_state_counts_df,
    how="left",
    on="STUSPS",
)

In [62]:
# Only the store count may default to zero (no scraped store for the state).
# Filling the whole frame would also turn a missing POPULATION into 0, and the
# per-capita ratios computed from it would then divide by zero instead of
# staying missing.
macys_by_states_gdf["stores"] = (
    macys_by_states_gdf["stores"].fillna(0).astype(int)
)
# Make POPULATION a numeric column with NaN for unknown values, so the
# per-capita arithmetic does not depend on how an object column is handled.
macys_by_states_gdf["POPULATION"] = pd.to_numeric(
    macys_by_states_gdf["POPULATION"], errors="coerce"
)

In [63]:
# Stores per N residents, for several values of N. Each column is
# stores / (POPULATION / N), computed in the order listed here.
residents_per_unit = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, unit_size in residents_per_unit.items():
    population_units = macys_by_states_gdf["POPULATION"] / unit_size
    macys_by_states_gdf[rate_column] = macys_by_states_gdf["stores"] / population_units

In [64]:
# Drop every row that has a missing value in any column.
macys_by_states_gdf = macys_by_states_gdf.dropna()

In [65]:
# Reproject the state polygons to EPSG:9311, then write the result.
macys_by_states_gdf = macys_by_states_gdf.to_crs(9311)
# Plain string literal: the path has no placeholders, so no f-string is needed.
macys_by_states_gdf.to_file("data/macys_per_state.gpkg")