In [1]:
import os

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Shapefile holding one polygon per US state / territory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [5]:
# Keep only the postal code, the display name and the boundary geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population data

In [6]:
# sheet_name=None makes read_excel return a dict of DataFrames keyed by sheet name.
population_workbook_path = "data/NST-EST2023-POP.xlsx"
state_populations = pd.read_excel(
    population_workbook_path,
    sheet_name=None,
    engine="openpyxl",
)

In [7]:
# The workbook has no clean header row, so pandas uses the sheet's title text
# as the name of the first column and "Unnamed: N" for the others.
raw_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
# NOTE(review): presumably the most recent (2023) estimate column - confirm against the sheet.
raw_population_column = "Unnamed: 5"

state_populations_df = (
    state_populations["NST-EST2023-POP"][[raw_name_column, raw_population_column]]
    .rename(columns={raw_name_column: "NAME", raw_population_column: "POPULATION"})
    # Only "sub-part" rows carry a leading dot (see the column title above).
    # Strip just that dot instead of blindly dropping the first character,
    # which mangled every un-dotted row (e.g. "United States" -> "nited States")
    # and made such rows impossible to match on NAME later.
    .assign(NAME=lambda frame: frame["NAME"].str.lstrip("."))
)

In [8]:
# Left join keeps every state geometry even when no population row matches
# (POPULATION is NaN for those rows).
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[population_columns]

## Get Trader Joe's Data

In [9]:
# Single cloudscraper session reused for every request made by the scraping loop.
scraper = cloudscraper.create_scraper()

In [10]:
# Scrape every Trader Joe's store location, one state at a time.
#
# Site layout walked by the loop: state page -> city ("location") pages ->
# individual store pages, whose <meta> tags carry the latitude / longitude.
#
# Each fully scraped state is cached as data/states/<STUSPS>.gpkg so that
# re-running the cell only retries the states that are still missing.
store_list = []
url_base = "https://locations.traderjoes.com/"
states_dir = "data/states"

# Make sure the per-state cache directory exists before anything is written to it.
os.makedirs(states_dir, exist_ok=True)

state_rows = zip(states_df["STUSPS"], states_df["NAME"])
for state_code, state_name in tqdm(
    state_rows, total=len(states_df), desc="Parsing States"
):
    state_store_list = []
    # Flipped to False on any failed request so that a partially scraped state
    # is never cached (a cached state is skipped on every later run).
    state_complete = True

    # If the cache file exists this state was already scraped; skip it.
    state_file = f"{states_dir}/{state_code}.gpkg"
    if os.path.isfile(state_file):
        continue

    # Get all locations in a state. Plain concatenation rather than
    # os.path.join: URLs always use "/", os.path.join uses the OS separator.
    url = url_base + state_code
    r = scraper.get(url)

    if r.status_code != 200:
        print(f"{state_name}: state page request failed ({r.status_code}): {url}")
        continue

    soup = BeautifulSoup(r.text, "html.parser")
    locations_as = soup.find_all("a", {"data-gaact": "Click_to_CityPage"})

    # Parse all locations in a state
    for locations_a in tqdm(locations_as, desc=f"Parsing Locations In {state_code}"):
        location_url = url_base + locations_a.attrs["href"]
        r_location = scraper.get(location_url)
        if r_location.status_code != 200:
            print(
                f"{state_code}: location request failed "
                f"({r_location.status_code}): {location_url}"
            )
            state_complete = False
            continue

        location_soup = BeautifulSoup(r_location.text, "html.parser")
        location_cards = location_soup.find_all("a", {"class": "ga_w2gi_lp directions"})

        # Multiple stores in locations, parse the stores per location
        for location_card in location_cards:
            store_url = url_base + location_card.attrs["href"]
            r_store = scraper.get(store_url)
            if r_store.status_code != 200:
                print(
                    f"{state_code}: store request failed "
                    f"({r_store.status_code}): {store_url}"
                )
                state_complete = False
                continue

            # Read the store coordinates from the page's <meta> tags.
            store_soup = BeautifulSoup(r_store.text, "html.parser")
            lat_meta = store_soup.find("meta", {"property": "place:location:latitude"})
            lat = float(lat_meta.attrs["content"])
            lon_meta = store_soup.find("meta", {"property": "place:location:longitude"})
            lon = float(lon_meta.attrs["content"])

            # Point takes (x, y), i.e. (longitude, latitude).
            store_dict = {
                "STATE": state_code,
                "geometry": Point(lon, lat),
            }
            store_list.append(store_dict)
            state_store_list.append(store_dict)

    if not state_complete:
        print(f"{state_code}: scrape incomplete, not cached - re-run this cell to retry")
        continue

    # States without any store get no cache file (and are re-checked next run).
    if state_store_list:
        state_trader_joes_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_trader_joes_gdf.to_file(state_file)

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Locations In MS: 0it [00:00, ?it/s]

Parsing Locations In NC:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations In OK:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In VA:   0%|          | 0/17 [00:00<?, ?it/s]

Parsing Locations In WV: 0it [00:00, ?it/s]

Parsing Locations In LA:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In MI:   0%|          | 0/9 [00:00<?, ?it/s]

Parsing Locations In MA:   0%|          | 0/20 [00:00<?, ?it/s]

Parsing Locations In ID:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In FL:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Locations In NE:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In WA:   0%|          | 0/18 [00:00<?, ?it/s]

Parsing Locations In NM:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In PR: 0it [00:00, ?it/s]

Parsing Locations In SD: 0it [00:00, ?it/s]

Parsing Locations In TX:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations In CA:   0%|          | 0/149 [00:00<?, ?it/s]

Parsing Locations In AL:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In GA:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In PA:   0%|          | 0/11 [00:00<?, ?it/s]

Parsing Locations In MO:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations In CO:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In UT:   0%|          | 0/4 [00:00<?, ?it/s]

Parsing Locations In TN:   0%|          | 0/6 [00:00<?, ?it/s]

Parsing Locations In WY: 0it [00:00, ?it/s]

Parsing Locations In NY:   0%|          | 0/21 [00:00<?, ?it/s]

Parsing Locations In KS:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In AK: 0it [00:00, ?it/s]

Parsing Locations In NV:   0%|          | 0/5 [00:00<?, ?it/s]

Parsing Locations In IL:   0%|          | 0/17 [00:00<?, ?it/s]

Parsing Locations In VT:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In MT: 0it [00:00, ?it/s]

Parsing Locations In IA:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In SC:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In NH:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In AZ:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations In DC:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In AS: 0it [00:00, ?it/s]

Parsing Locations In VI: 0it [00:00, ?it/s]

Parsing Locations In NJ:   0%|          | 0/19 [00:00<?, ?it/s]

Parsing Locations In MD:   0%|          | 0/8 [00:00<?, ?it/s]

Parsing Locations In ME:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In HI: 0it [00:00, ?it/s]

Parsing Locations In DE:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In GU: 0it [00:00, ?it/s]

Parsing Locations In MP: 0it [00:00, ?it/s]

Parsing Locations In RI:   0%|          | 0/2 [00:00<?, ?it/s]

Parsing Locations In KY:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In OH:   0%|          | 0/7 [00:00<?, ?it/s]

Parsing Locations In WI:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In OR:   0%|          | 0/12 [00:00<?, ?it/s]

Parsing Locations In ND: 0it [00:00, ?it/s]

Parsing Locations In AR:   0%|          | 0/1 [00:00<?, ?it/s]

Parsing Locations In IN:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations In MN:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations In CT:   0%|          | 0/9 [00:00<?, ?it/s]

#### If the cell above breaks, just re-run it until it completes. Then run the cells below.

In [11]:
# Load every cached per-state GeoPackage written by the scraping cell.
# os.listdir() returns names in arbitrary order, so sort them to make the row
# order of the combined frame reproducible between runs and machines.
states_path = "data/states"
trader_joes_gdfs = [
    gpd.read_file(os.path.join(states_path, file_name))
    for file_name in sorted(os.listdir(states_path))
    if file_name.endswith(".gpkg")
]

In [12]:
# Stack the per-state frames into one table with a fresh 0..n-1 index.
all_states_stores = pd.concat(trader_joes_gdfs, ignore_index=True)
trader_joes_gdf = gpd.GeoDataFrame(all_states_stores)

In [13]:
# Reproject the store points to EPSG:9311 and persist them.
# NOTE(review): EPSG:9311 is presumably an equal-area US projection chosen for mapping - confirm.
trader_joes_gdf = trader_joes_gdf.to_crs(epsg=9311)
trader_joes_gdf.to_file("data/stores.gpkg")

## Combine With States

In [14]:
# Number of stores per state code, reshaped into a two-column frame
# ("STUSPS", "stores") so it can be merged onto the states table.
trader_joes_state_counts_series = trader_joes_gdf.groupby("STATE").size()
trader_joes_state_counts_df = trader_joes_state_counts_series.rename_axis(
    "STUSPS"
).reset_index(name="stores")

In [15]:
# Left join: states without any store keep their row, with "stores" set to NaN.
trader_joes_by_states_gdf = states_with_population_df.merge(
    right=trader_joes_state_counts_df,
    how="left",
    on="STUSPS",
)

In [16]:
# Missing values become 0 (a state with no matching store count has 0 stores),
# then the store count is cast to a whole number.
# NOTE(review): fillna(0) is applied to every column, so a missing POPULATION
# also becomes 0 - confirm that is intended.
trader_joes_by_states_gdf = trader_joes_by_states_gdf.fillna(0).assign(
    stores=lambda frame: frame["stores"].astype(int)
)

In [17]:
# Stores per N residents: one output column per population unit,
# created in the order listed here.
people_per_unit = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, people in people_per_unit.items():
    population_units = trader_joes_by_states_gdf["POPULATION"] / people
    trader_joes_by_states_gdf[rate_column] = (
        trader_joes_by_states_gdf["stores"] / population_units
    )

In [18]:
# Drop every row that still holds a missing value in any column.
trader_joes_by_states_gdf = trader_joes_by_states_gdf.dropna(axis="index", how="any")

In [19]:
# Reproject the state polygons to EPSG:9311 (the CRS also used for the store
# points) and persist the per-state summary.
trader_joes_by_states_gdf = trader_joes_by_states_gdf.to_crs(epsg=9311)
trader_joes_by_states_gdf.to_file("data/stores_by_states.gpkg")