In [59]:
import json
import os
from urllib.parse import urljoin

In [2]:
from bs4 import BeautifulSoup
import cloudscraper
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Read the state shapefile into a GeoDataFrame
# (presumably the Census Bureau 2018 cartographic boundary file, judging by its name).
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [5]:
# Keep only the state code, the state name and the geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [6]:
# sheet_name=None makes read_excel return a dict of DataFrames keyed by sheet name,
# so the sheet of interest is picked out by name afterwards.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [7]:
# pandas uses the sheet's long descriptive first cell as the label of the
# name column; the population figures sit in the column it labelled "Unnamed: 5".
population_name_header = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
population_sheet = state_populations["NST-EST2024-POP"]
state_populations_df = population_sheet[[population_name_header, "Unnamed: 5"]].rename(
    columns={population_name_header: "NAME", "Unnamed: 5": "POPULATION"}
)
# Drop the first character of every name so ".Alabama"-style entries match the
# names in the shapefile.
# NOTE(review): rows without a leading dot also lose their first character.
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

In [8]:
# Left join: every state shape is kept, and POPULATION is missing wherever no
# population row has the same NAME.
merged_states = states_df.merge(state_populations_df, how="left", on="NAME")
population_output_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = merged_states[population_output_columns]

## Get Dollar Tree Data

In [11]:
# One shared scraper session, reused for every state, city and store request below.
scraper = cloudscraper.create_scraper()

In [67]:
# Scrape every Dollar Tree store location, one state at a time.
#
# Each finished state is written to data/states/<code>.gpkg and states whose
# file already exists are skipped, so this cell can simply be re-run after a
# failure without repeating completed states. `store_list` therefore only
# holds the stores of states scraped during the current run.
store_list = []
url_base = "https://www.dollartree.com/locations/"
# Alabama's pages link into this host and use different markup from the other states.
alabama_base = "https://locations.dollartree.com/"
# State/territory codes that are deliberately not scraped.
skipped_state_codes = {"pr", "as", "gu", "ak", "hi", "vi", "mp"}

# Create the per-state output directory up front so the first write cannot
# fail on a missing folder.
os.makedirs("data/states", exist_ok=True)

for i in tqdm(range(len(states_df)), desc="Parsing States"):
    # Get State code
    state_store_list = []
    state_row = states_df.iloc[i]
    state_code = state_row["STUSPS"].lower()
    is_alabama = state_code == "al"

    if state_code in skipped_state_codes:
        continue

    # If File exists continue
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    # Get all locations in a state
    state_url = f"{url_base}{state_code}/"
    r_state = scraper.get(state_url)  # , allow_redirects=False)

    if r_state.status_code != 200:
        # Report the state that could not be fetched and move on.
        print(state_row["NAME"])
        continue

    soup_state = BeautifulSoup(r_state.text, "html.parser")

    # Special for Alabama
    if is_alabama:
        city_as = soup_state.find_all(
            "a",
            {
                "class": "Link group flex items-center gap-2 text-base font-normal !no-underline"
            },
        )
    else:
        city_as = soup_state.find_all(
            "a", {"class": "ga_w2gi_lp", "data-gaact": "Click_to_State_Level"}
        )

    for city_a in tqdm(city_as, desc=f"Parse Cities in {state_code}"):
        # Special for Alabama
        if is_alabama:
            # urljoin resolves the href the way a browser would; os.path.join
            # would drop the host for hrefs starting with "/" and would use
            # backslashes on Windows.
            city_url = urljoin(alabama_base, city_a.attrs["href"])
        else:
            city_url = city_a.attrs["href"]
        r_city = scraper.get(city_url)

        if r_city.status_code != 200:
            # Same handling as for a failed state page: report and skip,
            # rather than silently parsing an error page into zero stores.
            print(city_url, r_city.status_code)
            continue

        soup_city = BeautifulSoup(r_city.text, "html.parser")

        # Special for Alabama
        if is_alabama:
            store_divs = soup_city.find_all(
                "a",
                {"class": "Link text-green1 pb-4 text-lg font-bold hover:underline"},
            )
        else:
            store_divs = soup_city.find_all("div", {"class": "schemastore"})

        for store_div in store_divs:
            # Special for Alabama
            if is_alabama:
                store_url = "https://locations.dollartree.com" + store_div.attrs["href"]
            else:
                store_url = store_div.find("a").attrs["href"]
            try:
                r_store = scraper.get(store_url)
                soup_store = BeautifulSoup(r_store.text, "html.parser")
                if is_alabama:
                    # Alabama store pages carry their coordinates in JSON-LD.
                    store_json = json.loads(
                        soup_store.find("script", {"type": "application/ld+json"}).text
                    )
                    geo = store_json["@graph"][1]["geo"]
                    lat = float(geo["latitude"])
                    lon = float(geo["longitude"])
                else:
                    lat = float(
                        soup_store.find(
                            "meta", {"property": "place:location:latitude"}
                        ).attrs["content"]
                    )
                    lon = float(
                        soup_store.find(
                            "meta", {"property": "place:location:longitude"}
                        ).attrs["content"]
                    )
                store_dict = {"STUSPS": state_code.upper(), "geometry": Point(lon, lat)}
                state_store_list.append(store_dict)
                store_list.append(store_dict)
            except Exception as e:
                # Best effort: report the store that failed and keep going.
                # The requested URL is printed because the response object is
                # unbound (or left over from the previous store) when the
                # request itself raised.
                print(store_url, e)

    if state_store_list:
        # Persist the state so a re-run skips it.
        state_dollar_tree_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_dollar_tree_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parse Cities in al:   0%|          | 0/113 [00:00<?, ?it/s]

https://locations.dollartree.com/al/attalla/952-gilberts-ferry-rd-se 'latitude'
https://locations.dollartree.com/al/sylacauga/110-curtis-liles-court 'latitude'


#### Add Records to Alabama

In [79]:
# Reload the Alabama stores that the scraping cell saved to disk.
al_dollar_tree_gdf = gpd.read_file("data/states/al.gpkg")

In [80]:
# Hand-entered (longitude, latitude) pairs for Alabama stores
# (presumably the two stores whose pages failed to parse during scraping).
added_store_coordinates = [
    (-86.0946542134915, 33.99784764740872),
    (-86.28959797300924, 33.17307641974139),
]
added_dollar_trees = gpd.GeoDataFrame(
    [
        {"STUSPS": "AL", "geometry": Point(store_lon, store_lat)}
        for store_lon, store_lat in added_store_coordinates
    ],
    crs=4326,
)

In [81]:
# Append the hand-entered stores to the scraped Alabama stores.
# Note: re-running this cell appends the same records again.
alabama_frames = [al_dollar_tree_gdf, added_dollar_trees]
alabama_combined = pd.concat(alabama_frames, ignore_index=True)
al_dollar_tree_gdf = gpd.GeoDataFrame(alabama_combined)

In [83]:
# Overwrite the Alabama file with the completed set of stores.
al_dollar_tree_gdf.to_file("data/states/al.gpkg")

#### If the scraping cell above breaks, just re-run it until it completes (states that were already saved are skipped). Then run the cells below.

In [87]:
# Load every per-state GeoPackage from the states folder.
dollar_tree_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the combined table the same from run to run.
for file in sorted(os.listdir(states_path)):
    # Ignore anything in the folder that is not a GeoPackage.
    if not file.endswith(".gpkg"):
        continue
    constructed_path = os.path.join(states_path, file)
    state_dollar_tree_gdf = gpd.read_file(constructed_path)
    dollar_tree_gdfs.append(state_dollar_tree_gdf)

In [88]:
# Stack the per-state tables into a single table with a fresh 0..n-1 index.
all_states_combined = pd.concat(dollar_tree_gdfs, ignore_index=True)
dollar_tree_gdf = gpd.GeoDataFrame(all_states_combined)

In [89]:
# Reproject from the scraped lon/lat coordinates to EPSG:9311 before saving
# (presumably the US National Atlas Equal Area projection - confirm).
dollar_tree_gdf = dollar_tree_gdf.to_crs(9311)
dollar_tree_gdf.to_file("data/dollar_trees.gpkg")

## Combine With States

In [91]:
# Count the stores in each state: one row per STUSPS code, with the count in
# DOLLAR_TREE. Naming the count column directly in reset_index avoids a
# follow-up rename of the default column 0.
dollar_tree_per_state_df = (
    dollar_tree_gdf.groupby("STUSPS").size().reset_index(name="DOLLAR_TREE")
)

In [93]:
# Left join the store counts onto the states: states without any stores in the
# count table are kept and get a missing DOLLAR_TREE value.
dollar_tree_per_states_gdf = states_with_population_df.merge(
    dollar_tree_per_state_df, on="STUSPS", how="left"
)

In [94]:
# A state missing from the store counts has zero stores, so fill only the
# DOLLAR_TREE column and make it an integer count.
# A frame-wide fillna(0) would also turn a missing POPULATION into 0, which
# produces 0/0 or infinite per-capita ratios; leaving it missing keeps those
# ratios NaN so such rows can be removed by a plain dropna.
dollar_tree_per_states_gdf = dollar_tree_per_states_gdf.assign(
    DOLLAR_TREE=dollar_tree_per_states_gdf["DOLLAR_TREE"].fillna(0).astype(int)
)

In [95]:
# Stores per N residents, for several values of N.
# Column name -> number of residents the rate is expressed per.
per_capita_scales = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, residents in per_capita_scales.items():
    dollar_tree_per_states_gdf[rate_column] = dollar_tree_per_states_gdf[
        "DOLLAR_TREE"
    ] / (dollar_tree_per_states_gdf["POPULATION"] / residents)

In [96]:
# Remove every row that still has a missing value in any column
# (for example a per-capita ratio that came out as NaN).
dollar_tree_per_states_gdf = dollar_tree_per_states_gdf.dropna()

In [97]:
# Reproject to EPSG:9311 before saving
# (presumably the US National Atlas Equal Area projection - confirm).
dollar_tree_per_states_gdf = dollar_tree_per_states_gdf.to_crs(9311)
dollar_tree_per_states_gdf.to_file("data/dollar_trees_per_states.gpkg")