In [52]:
from collections import Counter
import os
import random
import re
import time

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Location of the state boundary shapefile, read into a GeoDataFrame below.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [5]:
# Keep only the postal abbreviation, the state name and the geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [6]:
# sheet_name=None makes pandas load every sheet and return a dict keyed by
# sheet name; the sheet of interest is picked out in the next cell.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [7]:
# The sheet is read without a usable header row, so the name column is labelled
# with the table's caption text and the population column is "Unnamed: 5".
# NOTE(review): presumably "Unnamed: 5" holds the estimate of interest -
# confirm against the workbook.
raw_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
raw_population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [raw_name_column, raw_population_column]
].rename(
    columns={
        raw_name_column: "NAME",
        raw_population_column: "POPULATION",
    }
)

# Sub-part rows are prefixed with dots (see the caption above). Strip only those
# leading dots: slicing off the first character unconditionally would also
# truncate names that have no dot, which then could never match states_df["NAME"].
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [8]:
# Left join keeps every state geometry even when no population row matches
# (POPULATION is then NaN); afterwards the columns are put in a fixed order.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[population_columns]

## Get Nordstrom Data

In [9]:
# Module-level scraper instance reused for every HTTP request in this notebook.
scraper = cloudscraper.create_scraper()

In [44]:
def get_states(url: str, headers: dict) -> list:
    """Download ``url`` and return the state ``<div>`` elements found on it.

    Args:
        url: Page to fetch with the shared ``scraper``.
        headers: HTTP headers sent with the request.

    Returns:
        The matching ``div`` elements (class ``"PiUHT fBKre hrzmx"``), or an
        empty list when the response is not HTTP 200 or no such element
        exists. Both failure cases are reported with ``print``.
    """
    response = scraper.get(url, headers=headers)
    if response.status_code != 200:
        print(url, response.status_code, response.reason)
        return []

    state_divs = BeautifulSoup(response.text, "html.parser").find_all(
        "div", {"class": "PiUHT fBKre hrzmx"}
    )
    if state_divs:
        return state_divs

    print("No divs", url)
    return []

In [45]:
def get_coords(url: str) -> "Point | None":
    """Fetch a store page and return its location as a shapely ``Point``.

    The coordinates are read from the page's ``<meta itemprop="latitude">``
    and ``<meta itemprop="longitude">`` tags.

    Args:
        url: Address of the store page to download.

    Returns:
        ``Point(lon, lat)`` on success. ``None`` when the response is not
        HTTP 200 or the coordinates cannot be extracted; the failure is
        reported with ``print`` in both cases.
    """
    r = scraper.get(url)
    if r.status_code != 200:
        print("Status Code:", r.status_code, url)
        return None
    try:
        soup = BeautifulSoup(r.text, "html.parser")
        lat = float(soup.find("meta", {"itemprop": "latitude"}).attrs["content"])
        lon = float(soup.find("meta", {"itemprop": "longitude"}).attrs["content"])
    except Exception as e:
        # Deliberately broad best-effort: one malformed page must not abort the
        # whole crawl. Return here so the code never reaches Point() with
        # lat/lon unassigned.
        print(e, url)
        return None
    # shapely points are (x, y), i.e. (longitude, latitude).
    return Point(lon, lat)

In [46]:
def check_if_nordstrom_rack(url: str) -> bool:
    """Tell whether ``url`` points at a nordstromrack site.

    The decision is made on the second dot-separated piece of the URL
    (for ``https://stores.nordstromrack.com/...`` that is ``nordstromrack``).

    Raises:
        IndexError: if ``url`` contains no ``"."`` at all.
    """
    second_label = url.split(".")[1]
    return second_label == "nordstromrack"

In [101]:
# Accumulators filled by the crawl below.
# nordstrom_list is never appended to: stores that are not Nordstrom Rack are
# only counted per state (nordstrom_state_list), not geocoded. The name is kept
# so anything that references it still works.
nordstrom_list = []
nordstrom_rack_list = []  # one {"STATE", "geometry"} dict per Nordstrom Rack
nordstrom_state_list = []  # one state code per non-Rack Nordstrom store

states_url = "https://stores.nordstromrack.com/us"
r_all_states = scraper.get(states_url)
soup = BeautifulSoup(r_all_states.text, "html.parser")
state_as = soup.find_all("a", {"class": "Directory-listLink"})


def _state_code_from_url(url: str) -> str:
    """Return the upper-cased state segment of a directory or store URL."""
    parts = url.split("/")
    # nordstrom.com links look like .../store-details/united-states/<state>/...,
    # so the state sits right after "united-states" rather than at index 4 as
    # on stores.nordstromrack.com/us/<state>/...
    if "united-states" in parts:
        return parts[parts.index("united-states") + 1].upper()
    return parts[4].upper()


def _record_store(store_url: str, state_code: str) -> None:
    """Count a non-Rack store for its state, or geocode and store a Rack."""
    if not check_if_nordstrom_rack(store_url):
        print("Nordstrom", store_url)
        nordstrom_state_list.append(state_code)
        return
    point = get_coords(store_url)
    # get_coords signals failure with a non-Point value; skip those stores so
    # they cannot end up as geometry.
    if not isinstance(point, Point):
        print("No coordinates", store_url)
        return
    nordstrom_rack_list.append({"STATE": state_code, "geometry": point})


for state_a in tqdm(state_as, desc="Parsing States"):
    count = int(re.findall(r"\d+", state_a.attrs["data-count"])[0])
    state_url = state_a.attrs["href"]
    state_code = _state_code_from_url(state_url)

    print(state_code)

    # With a single store the directory link is treated as the store page itself.
    if count == 1:
        _record_store(state_url, state_code)
        continue

    r_state = scraper.get(state_url)
    soup_state = BeautifulSoup(r_state.text, "html.parser")
    cities_as = soup_state.find_all("a", {"class": "Directory-listLink"})

    for cities_a in cities_as:
        # Keep only the first six "/"-separated segments of the link.
        # NOTE(review): presumably this turns a direct store link into its
        # city listing page - confirm against the site structure.
        city_url = "/".join(cities_a.attrs["href"].split("/")[:6])
        r_city = scraper.get(city_url)
        soup_city = BeautifulSoup(r_city.text, "html.parser")
        store_as = soup_city.find_all("a", {"class": "Teaser-titleLink"})
        for store_a in store_as:
            _record_store(store_a.attrs["href"], state_code)

        # Small random pause between city pages.
        time.sleep(random.uniform(0.01, 0.5))

Parsing States:   0%|          | 0/42 [00:00<?, ?it/s]

AK
AL
AZ
Nordstrom https://www.nordstrom.com/store-details/united-states/az/scottsdale/nordstrom-scottsdale-fashion-square
CA
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/arcadia/nordstrom-santa-anita
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/brea/nordstrom-brea-mall
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/canoga-park/nordstrom-topanga
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/cerritos/nordstrom-los-cerritos-center
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/corte-madera/nordstrom-the-village-at-corte-madera
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/costa-mesa/nordstrom-south-coast-plaza
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/glendale/nordstrom-at-the-americana-at-brand
Nordstrom https://www.nordstrom.com/store-details/united-states/ca/irvine/nordstrom-irvine-spectrum-center
Nordstrom https://www.nordstrom.com/sto

In [62]:
# Build a GeoDataFrame from the per-store dicts ("STATE", "geometry") collected
# by the crawl. crs=4326 tags the points as EPSG:4326, matching the
# Point(lon, lat) values produced by get_coords.
nordstrom_rack_gdf = gpd.GeoDataFrame(nordstrom_rack_list, crs=4326)

In [63]:
# Count non-Rack Nordstrom stores per state code. Building the frame from
# (state, count) pairs with explicit column names guarantees that both
# "STUSPS" and "NORDSTROMS" exist even when no such store was found; the
# from_dict/reset_index route yields no "NORDSTROMS" column for an empty list.
nordstrom_counts = Counter(nordstrom_state_list)
nordstroms_per_state_df = pd.DataFrame(
    list(nordstrom_counts.items()), columns=["STUSPS", "NORDSTROMS"]
)

In [64]:
# Reproject the store points to EPSG:9311 and write them to a GeoPackage.
nordstrom_rack_gdf = nordstrom_rack_gdf.to_crs(epsg=9311)
nordstrom_rack_gdf.to_file("data/nordstrom_rack_stores.gpkg")

In [80]:
# Number of Nordstrom Rack stores per state, as a two-column frame
# ("STUSPS", "NORDSTROM_RACKS") ready to merge onto the states table.
nordstrom_rack_state_counts_df = (
    nordstrom_rack_gdf.groupby("STATE")
    .size()
    .reset_index(name="NORDSTROM_RACKS")
    .rename(columns={"STATE": "STUSPS"})
)

## Combine With States

In [88]:
# Attach the Rack counts to every state; states without a Rack get NaN here.
nordstrom_rack_by_states_gdf = states_with_population_df.merge(
    right=nordstrom_rack_state_counts_df,
    how="left",
    on="STUSPS",
)

In [89]:
# Attach the non-Rack Nordstrom counts as well; unmatched states get NaN here.
nordstrom_by_states_gdf = nordstrom_rack_by_states_gdf.merge(
    right=nordstroms_per_state_df,
    how="left",
    on="STUSPS",
)

In [90]:
# Replace every missing value with 0 (the left merges leave NaN where a state
# had no match), then store both store counts as integers.
nordstrom_by_states_gdf = nordstrom_by_states_gdf.fillna(0).astype(
    {"NORDSTROM_RACKS": int, "NORDSTROMS": int}
)

In [95]:
# Stores per N residents, for both store types and several values of N.
# Column-name suffix -> number of residents the rate is expressed per.
per_capita_scales = {
    "1000": 1_000,
    "10k": 10_000,
    "100k": 100_000,
    "500k": 500_000,
    "1m": 1_000_000,
}
# Output column prefix -> store-count column the rate is derived from.
# The Rack rates must use NORDSTROM_RACKS; computing them from NORDSTROMS
# would make them silent copies of the nordstrom_per_* columns.
store_count_columns = {
    "nordstrom_per": "NORDSTROMS",
    "nordstrom_rack_per": "NORDSTROM_RACKS",
}

for column_prefix, count_column in store_count_columns.items():
    for scale_suffix, residents in per_capita_scales.items():
        nordstrom_by_states_gdf[f"{column_prefix}_{scale_suffix}"] = (
            nordstrom_by_states_gdf[count_column]
            / (nordstrom_by_states_gdf["POPULATION"] / residents)
        )

In [96]:
# Drop every row that still has a missing value in any column.
# NOTE(review): presumably this removes rows whose per-capita rates are NaN
# because no population was matched - confirm.
nordstrom_by_states_gdf = nordstrom_by_states_gdf.dropna()

In [97]:
# Reproject the per-state table to EPSG:9311 and write it to a GeoPackage.
nordstrom_by_states_gdf = nordstrom_by_states_gdf.to_crs(epsg=9311)
nordstrom_by_states_gdf.to_file("data/nordstrom_per_state.gpkg")