In [11]:
import json
import os
import random
import re
import time

In [12]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [13]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [14]:
# Read the state-boundary shapefile; its rows supply the state codes used for
# scraping and the geometry that is later joined with population and store counts.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [15]:
# Keep only the postal code, the full state name and the boundary geometry.
states_df = states_df.loc[:, ["STUSPS", "NAME", "geometry"]]

## Get Population Data

In [16]:
# sheet_name=None makes pandas load every sheet and return a dict keyed by
# sheet name; the sheet of interest is picked out of that dict afterwards.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [17]:
# The sheet is read without a usable header row, so the first column is named
# after the sheet's caption text and the remaining ones are "Unnamed: N".
raw_name_column = (
    "table with row headers in column A and column headers in rows 3 through 4. "
    "(leading dots indicate sub-parts)"
)
# NOTE(review): "Unnamed: 5" is selected purely by position -- confirm that it
# is the estimate year intended for the per-capita figures.
raw_population_column = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2024-POP"][[raw_name_column, raw_population_column]]
    .rename(columns={raw_name_column: "NAME", raw_population_column: "POPULATION"})
    # Sub-part rows are written with leading dots (per the sheet caption).
    # Strip only those dots: slicing off the first character unconditionally
    # would also mangle names that carry no dot, which could then never match
    # the shapefile's NAME column in the merge.
    .assign(NAME=lambda frame: frame["NAME"].str.lstrip("."))
)

In [18]:
# Left join keeps every boundary row, even when no population figure matches
# its NAME; the trailing selection fixes the column order.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get Kohl's Data

In [27]:
# One scraper session, reused for every state, city and store request below.
# NOTE(review): presumably cloudscraper is used instead of plain requests to
# get past the site's anti-bot protection -- confirm.
scraper = cloudscraper.create_scraper()

In [31]:
# Scrape store coordinates state by state. Each state's stores are written to
# data/states/<code>.gpkg as soon as the state is finished, so an interrupted
# run can be resumed: states that already have a file are skipped.
#
# kohls_list only collects the stores scraped during *this* run (cached states
# are skipped before anything is appended), so it is not a complete list after
# a resumed run.
kohls_list = []

# Make sure the cache directory exists before the first state file is written.
os.makedirs("data/states", exist_ok=True)

for state_code in tqdm(
    states_df["STUSPS"].str.lower().tolist(), desc="Parsing States"
):
    state_store_list = []

    # Already scraped on an earlier run: nothing to do for this state.
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    # Redirects are not followed, so a state without its own directory page
    # shows up as a non-200 status instead of silently landing elsewhere.
    r_state = scraper.get(
        f"https://www.kohls.com/stores/{state_code}.shtml", allow_redirects=False
    )
    if r_state.status_code != 200:
        # Report the state and move on; there is no city listing to parse.
        print(state_code, r_state.url)
        continue

    soup_state = BeautifulSoup(r_state.text, "html.parser")
    city_as = soup_state.find_all("a", {"class": "ga-link"})

    # The last "ga-link" anchor is deliberately left out.
    # NOTE(review): presumably it is not a city link -- confirm against the page.
    for city_a in tqdm(city_as[:-1], desc=f"Parsing Cities in {state_code}"):
        city_url = f"https://www.kohls.com{city_a.attrs['href']}"
        r_city = scraper.get(city_url)
        soup_city = BeautifulSoup(r_city.text, "html.parser")
        store_divs = soup_city.find_all(
            "div", {"class": "map-list-links mt-10 rio-location-buttons"}
        )

        for store_div in store_divs:
            store_url = f"https://www.kohls.com{store_div.find('a').attrs['href']}"
            r_store = scraper.get(store_url)
            soup_store = BeautifulSoup(r_store.text, "html.parser")
            # The coordinates are read from the store page's JSON-LD block.
            # A page without one raises here on purpose: the state's file is
            # then not written, so the state is retried on the next run
            # rather than cached incomplete.
            store_json = json.loads(
                soup_store.find("script", {"type": "application/ld+json"}).text
            )
            geo_dict = store_json[0]["geo"]
            store_dict = {
                "STATE": state_code.upper(),
                # shapely points are (x, y), i.e. (longitude, latitude).
                "geometry": Point(geo_dict["longitude"], geo_dict["latitude"]),
            }
            state_store_list.append(store_dict)
            kohls_list.append(store_dict)
            # Small random pause between store requests.
            time.sleep(random.uniform(0.01, 0.5))

    # States without any store get no file and are re-checked on the next run.
    if state_store_list:
        kohls_state_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        kohls_state_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Cities in ms:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Cities in nc:   0%|          | 0/29 [00:00<?, ?it/s]

Parsing Cities in ok:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Cities in va:   0%|          | 0/28 [00:00<?, ?it/s]

Parsing Cities in wv:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Cities in la:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Cities in mi:   0%|          | 0/44 [00:00<?, ?it/s]

Parsing Cities in ma:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Cities in id:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Cities in fl:   0%|          | 0/48 [00:00<?, ?it/s]

Parsing Cities in ne:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Cities in wa:   0%|          | 0/20 [00:00<?, ?it/s]

Parsing Cities in nm:   0%|          | 0/3 [00:00<?, ?it/s]

pr https://www.kohls.com/stores/pr.shtml


Parsing Cities in pr: 0it [00:00, ?it/s]

Parsing Cities in sd:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Cities in tx:   0%|          | 0/76 [00:00<?, ?it/s]

Parsing Cities in ca:   0%|          | 0/107 [00:00<?, ?it/s]

Parsing Cities in al:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Cities in ga:   0%|          | 0/33 [00:00<?, ?it/s]

Parsing Cities in pa:   0%|          | 0/48 [00:00<?, ?it/s]

Parsing Cities in mo:   0%|          | 0/26 [00:00<?, ?it/s]

Parsing Cities in co:   0%|          | 0/20 [00:00<?, ?it/s]

Parsing Cities in ut:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Cities in tn:   0%|          | 0/19 [00:00<?, ?it/s]

Parsing Cities in wy:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Cities in ny:   0%|          | 0/50 [00:00<?, ?it/s]

Parsing Cities in ks:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Cities in ak:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Cities in nv:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Cities in il:   0%|          | 0/64 [00:00<?, ?it/s]

Parsing Cities in vt:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Cities in mt:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Cities in ia:   0%|          | 0/17 [00:00<?, ?it/s]

Parsing Cities in sc:   0%|          | 0/16 [00:00<?, ?it/s]

Parsing Cities in nh:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Cities in az:   0%|          | 0/18 [00:00<?, ?it/s]

dc https://www.kohls.com/stores/dc.shtml


Parsing Cities in dc: 0it [00:00, ?it/s]

as https://www.kohls.com/stores/as.shtml


Parsing Cities in as: 0it [00:00, ?it/s]

vi https://www.kohls.com/stores/vi.shtml


Parsing Cities in vi: 0it [00:00, ?it/s]

Parsing Cities in nj:   0%|          | 0/38 [00:00<?, ?it/s]

Parsing Cities in md:   0%|          | 0/23 [00:00<?, ?it/s]

Parsing Cities in me:   0%|          | 0/5 [00:00<?, ?it/s]

hi https://www.kohls.com/stores/hi.shtml


Parsing Cities in hi: 0it [00:00, ?it/s]

Parsing Cities in de:   0%|          | 0/4 [00:00<?, ?it/s]

gu https://www.kohls.com/stores/gu.shtml


Parsing Cities in gu: 0it [00:00, ?it/s]

mp https://www.kohls.com/stores/mp.shtml


Parsing Cities in mp: 0it [00:00, ?it/s]

Parsing Cities in ri:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Cities in ky:   0%|          | 0/15 [00:00<?, ?it/s]

Parsing Cities in oh:   0%|          | 0/55 [00:00<?, ?it/s]

Parsing Cities in wi:   0%|          | 0/38 [00:00<?, ?it/s]

Parsing Cities in or:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Cities in nd:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Cities in ar:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Cities in in:   0%|          | 0/34 [00:00<?, ?it/s]

Parsing Cities in mn:   0%|          | 0/29 [00:00<?, ?it/s]

Parsing Cities in ct:   0%|          | 0/20 [00:00<?, ?it/s]

In [32]:
# Load every cached per-state GeoPackage written by the scraping step.
kohls_state_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined frame (and of the file saved from it) the same on every run.
for file in sorted(os.listdir(states_path)):
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_kohls_gdf = gpd.read_file(constructed_path)
    kohls_state_gdfs.append(state_kohls_gdf)

In [35]:
# Stack the per-state frames into one table with a fresh 0..n-1 index and wrap
# it as a GeoDataFrame in EPSG:4326, the CRS the store points were saved with.
kohls_gdf = gpd.GeoDataFrame(
    data=pd.concat(objs=kohls_state_gdfs, ignore_index=True),
    crs=4326,
)

In [36]:
# Reproject the store points to EPSG:9311 and save them as a GeoPackage.
# NOTE(review): 9311 is presumably the projection used for the final map -- confirm.
kohls_gdf = kohls_gdf.to_crs(crs=9311)
kohls_gdf.to_file("data/kohls.gpkg")

In [44]:
# Number of stores per state code, as a Series indexed by STATE.
kohls_state_df = kohls_gdf.groupby("STATE").size()

# Turn the Series into a two-column frame (STUSPS, KOHLS) so it can be merged
# with the state table on the postal code.
kohls_state_counts_df = kohls_state_df.rename_axis("STUSPS").reset_index(
    name="KOHLS"
)

## Combine With States

In [45]:
# Attach the store count to every state row; a state with no entry in the
# count table keeps its row and gets a missing KOHLS value (left join).
kokls_by_states_gdf = states_with_population_df.merge(
    right=kohls_state_counts_df,
    how="left",
    on="STUSPS",
)

In [48]:
# Stores per N residents, for several values of N. Each column is
# KOHLS / (POPULATION / N); the mapping order fixes the column order.
for rate_column, people_per_unit in {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}.items():
    kokls_by_states_gdf[rate_column] = kokls_by_states_gdf["KOHLS"] / (
        kokls_by_states_gdf["POPULATION"] / people_per_unit
    )

In [49]:
# Drop every row that has a missing value in any column. After the left
# merges this removes areas without a population figure or without a store
# count, so such areas are absent from the output rather than shown as zero.
kokls_by_states_gdf = kokls_by_states_gdf.dropna(axis=0, how="any")

In [50]:
# Reproject the state polygons to EPSG:9311 (the same CRS the store points are
# saved in) and write the per-state table to a GeoPackage.
kokls_by_states_gdf = kokls_by_states_gdf.to_crs(crs=9311)
kokls_by_states_gdf.to_file("data/kohls_per_state.gpkg")