In [28]:
import os
import random
import requests
import time

In [29]:
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [30]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [31]:
# Read the state boundary shapefile into a GeoDataFrame.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [32]:
# Keep only the state code, the state name and the boundary geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [33]:
# sheet_name=None makes pandas load every worksheet and return them as a
# dict keyed by sheet name.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [34]:
# The sheet's first row is a descriptive sentence, which pandas uses verbatim
# as the label of the name column; the population figures used here come from
# the column pandas auto-labelled "Unnamed: 5".
raw_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
state_populations_df = state_populations["NST-EST2024-POP"][
    [raw_name_column, "Unnamed: 5"]
].rename(columns={raw_name_column: "NAME", "Unnamed: 5": "POPULATION"})

# Per the sheet header, a leading "." only marks sub-part rows. Strip that
# prefix explicitly instead of dropping the first character of every value,
# which would mangle any name that has no dot (and then fail the NAME merge).
state_populations_df = state_populations_df.assign(
    NAME=state_populations_df["NAME"].str.lstrip(".")
)

In [35]:
# Left join on the state name so every boundary row is kept, then put the
# columns in a fixed order.
output_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[output_columns]

## Get Dollar General Data

In [39]:
# Scrape the Dollar General store directory state by state:
#   state page -> location pages -> store cards -> store-details JSON (lat/long).
# Each state's stores are written to data/states/<code>.gpkg, and a state whose
# file already exists is skipped, so an interrupted run can simply be re-run.
# NOTE: store_list therefore only holds stores scraped in THIS run.
store_list = []
url_base = "https://www.dollargeneral.com/store-directory/"
states_dir = "data/states"
request_timeout = 30  # seconds; without a timeout a stalled connection hangs the cell

# to_file does not create missing parent directories
os.makedirs(states_dir, exist_ok=True)

for state_code, state_name in tqdm(
    zip(states_df["STUSPS"].str.lower(), states_df["NAME"]),
    total=len(states_df),
    desc="Parsing States",
):
    state_store_list = []
    state_file = f"{states_dir}/{state_code}.gpkg"

    # Resume support: skip states that were already scraped and saved
    if os.path.isfile(state_file):
        continue

    # Get all locations in a state. URLs are built with "/" directly:
    # os.path.join would insert "\" separators on Windows.
    try:
        r = requests.get(f"{url_base}{state_code}", timeout=request_timeout)
    except requests.RequestException as e:
        print(state_name, e)
        continue

    if r.status_code != 200:
        print(r.status_code, state_name, r.url)
        continue

    soup = BeautifulSoup(r.text, "html.parser")
    locations_ps = soup.find_all("p", {"class": "location-list-item"})

    # Parse all locations in a state
    for location_p in tqdm(locations_ps, desc=f"Parsing Locations In {state_code}"):
        location_link = location_p.find("a")
        if location_link is None or "href" not in location_link.attrs:
            print(location_p)
            continue

        # Keep only the "<state>/<location>.html" tail of the link
        location_paramters = "/".join(location_link.attrs["href"].split("/")[5:])
        location_url = f"{url_base}{location_paramters}"
        try:
            r_location = requests.get(location_url, timeout=request_timeout)
        except requests.RequestException as e:
            print(location_paramters, e)
            continue
        if r_location.status_code != 200:
            print(location_paramters)
            continue
        location_soup = BeautifulSoup(r_location.text, "html.parser")
        location_cards = location_soup.find_all("div", {"class": "store__card"})

        # Multiple stores in locations, parse the stores per location
        for location_card in location_cards:
            # The store id is the file name (minus extension) of the card's first link
            try:
                store_id = (
                    location_card.find("a").attrs["href"].split("/")[-1].split(".")[0]
                )
            except (AttributeError, KeyError):
                # store_id was never assigned for this card, so report the card itself
                print(location_card)
                continue

            # Get store lat,long
            store_url = f"https://www.dollargeneral.com/bin/omni/pickup/storeDetails?storeNumber={store_id}"
            try:
                r_store = requests.get(store_url, timeout=request_timeout)
            except requests.RequestException as e:
                print(store_id, e)
                continue
            if r_store.status_code != 200:
                print(store_id)
                continue

            # Output to dict then append to list. JSON decoding lives inside the
            # try so one malformed response skips a store instead of aborting the run.
            try:
                store_json = r_store.json()["storeDetails"]
                store_dict = {
                    "STORE_ID": str(store_id),
                    "STATE": store_json["st"],
                    "geometry": Point(store_json["lo"], store_json["la"]),
                }
            except Exception as e:
                print(store_id, e)
                continue
            state_store_list.append(store_dict)
            store_list.append(store_dict)
            # Small random pause between store lookups to avoid hammering the site
            time.sleep(random.uniform(0.01, 0.5))

    # An empty list has no geometry column to attach the CRS to; writing nothing
    # also means the state is retried on the next run.
    if not state_store_list:
        print(f"No stores parsed for {state_code}; nothing written")
        continue

    state_dollar_general_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
    state_dollar_general_gdf.to_file(state_file)

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Locations In nm:   0%|          | 0/74 [00:00<?, ?it/s]

404 Puerto Rico https://www.dollargeneral.com/store-directory/pr


Parsing Locations In sd:   0%|          | 0/70 [00:00<?, ?it/s]

sd/faith.html


Parsing Locations In tx:   0%|          | 0/820 [00:00<?, ?it/s]

tx/dawson.html


Parsing Locations In ca:   0%|          | 0/202 [00:00<?, ?it/s]

Parsing Locations In al:   0%|          | 0/390 [00:00<?, ?it/s]

al/cottondale.html


Parsing Locations In ga:   0%|          | 0/464 [00:00<?, ?it/s]

ga/roswell.html


Parsing Locations In pa:   0%|          | 0/634 [00:00<?, ?it/s]

pa/new-alexandria.html


Parsing Locations In mo:   0%|          | 0/448 [00:00<?, ?it/s]

mo/otterville.html


Parsing Locations In co:   0%|          | 0/69 [00:00<?, ?it/s]

co/eads.html


Parsing Locations In ut:   0%|          | 0/13 [00:00<?, ?it/s]

Parsing Locations In tn:   0%|          | 0/383 [00:00<?, ?it/s]

tn/denmark.html


Parsing Locations In wy:   0%|          | 0/20 [00:00<?, ?it/s]

wy/afton.html
wy/buffalo.html
wy/thermopolis.html


Parsing Locations In ny:   0%|          | 0/478 [00:00<?, ?it/s]

ny/byron.html
ny/central-square.html
ny/pearl-river.html


Parsing Locations In ks:   0%|          | 0/207 [00:00<?, ?it/s]

404 Alaska https://www.dollargeneral.com/store-directory/ak


Parsing Locations In nv:   0%|          | 0/27 [00:00<?, ?it/s]

nv/henderson.html
30083


Parsing Locations In il:   0%|          | 0/529 [00:00<?, ?it/s]

Parsing Locations In vt:   0%|          | 0/37 [00:00<?, ?it/s]

Parsing Locations In mt:   0%|          | 0/10 [00:00<?, ?it/s]

mt/victor.html


Parsing Locations In ia:   0%|          | 0/275 [00:00<?, ?it/s]

Parsing Locations In sc:   0%|          | 0/249 [00:00<?, ?it/s]

Parsing Locations In nh:   0%|          | 0/48 [00:00<?, ?it/s]

Parsing Locations In az:   0%|          | 0/78 [00:00<?, ?it/s]

Parsing Locations In dc:   0%|          | 0/1 [00:00<?, ?it/s]

404 American Samoa https://www.dollargeneral.com/store-directory/as
404 United States Virgin Islands https://www.dollargeneral.com/store-directory/vi


Parsing Locations In nj:   0%|          | 0/146 [00:00<?, ?it/s]

Parsing Locations In md:   0%|          | 0/115 [00:00<?, ?it/s]

md/hurlock.html


Parsing Locations In me:   0%|          | 0/70 [00:00<?, ?it/s]

404 Hawaii https://www.dollargeneral.com/store-directory/hi


Parsing Locations In de:   0%|          | 0/31 [00:00<?, ?it/s]

404 Guam https://www.dollargeneral.com/store-directory/gu
404 Commonwealth of the Northern Mariana Islands https://www.dollargeneral.com/store-directory/mp


Parsing Locations In ri:   0%|          | 0/17 [00:00<?, ?it/s]

Parsing Locations In ky:   0%|          | 0/357 [00:00<?, ?it/s]

ky/mt.-vernon.html


Parsing Locations In oh:   0%|          | 0/559 [00:00<?, ?it/s]

Parsing Locations In wi:   0%|          | 0/265 [00:00<?, ?it/s]

wi/wyocena.html


Parsing Locations In or:   0%|          | 0/81 [00:00<?, ?it/s]

Parsing Locations In nd:   0%|          | 0/65 [00:00<?, ?it/s]

Parsing Locations In ar:   0%|          | 0/310 [00:00<?, ?it/s]

ar/gillham.html


Parsing Locations In in:   0%|          | 0/388 [00:00<?, ?it/s]

Parsing Locations In mn:   0%|          | 0/212 [00:00<?, ?it/s]

mn/anoka.html
mn/barnum.html
mn/litchfield.html
mn/ulen.html


Parsing Locations In ct:   0%|          | 0/74 [00:00<?, ?it/s]

ct/essex.html


In [40]:
# store_list only contains stores scraped during this kernel session (states
# with an existing data/states/<code>.gpkg file are skipped by the scraper), so
# it can be empty. An empty list has no geometry column to attach a CRS to, so
# build an explicitly empty frame with the expected columns in that case.
if store_list:
    dollar_general_gdf = gpd.GeoDataFrame(store_list, crs=4326)
else:
    dollar_general_gdf = gpd.GeoDataFrame(
        columns=["STORE_ID", "STATE", "geometry"], geometry="geometry", crs=4326
    )

#### If the scraping cell above breaks, just keep re-running it until it completes (states that were already saved are skipped). Then run the cells below.

In [41]:
# Load every per-state GeoPackage written by the scraper. The file names are
# sorted because os.listdir returns entries in arbitrary order, which would
# otherwise make the row order of the combined table differ between runs.
states_path = "data/states"
dollar_general_gdfs = [
    gpd.read_file(os.path.join(states_path, file_name))
    for file_name in sorted(os.listdir(states_path))
    if file_name.endswith(".gpkg")
]

In [42]:
# Stack the per-state frames into a single table with a fresh 0..n-1 index.
combined_stores = pd.concat(dollar_general_gdfs, ignore_index=True)
dollar_general_gdf = gpd.GeoDataFrame(combined_stores)

In [43]:
# Reproject the store points to EPSG:9311 and write them to a GeoPackage.
stores_output_path = "data/stores.gpkg"
dollar_general_gdf = dollar_general_gdf.to_crs(epsg=9311)
dollar_general_gdf.to_file(stores_output_path)

## Combine With States

In [45]:
# Count store rows per state code; the key column is renamed to STUSPS so it
# lines up with the state table's join column.
dollar_generals_per_state_df = (
    dollar_general_gdf.groupby("STATE")
    .size()
    .reset_index(name="DOLLAR_GENERALS")
    .rename(columns={"STATE": "STUSPS"})
)

In [46]:
# Attach the store counts to the state table. The left join keeps states
# that have no counted stores; their DOLLAR_GENERALS value is NaN here.
dollar_general_by_states_gdf = states_with_population_df.merge(
    right=dollar_generals_per_state_df,
    how="left",
    on="STUSPS",
)

In [47]:
# A state missing from the store counts has zero stores, so fill and cast ONLY
# the count column. A frame-wide fillna(0) would also turn a missing POPULATION
# into 0, leading to 0/0 or division by zero when the per-capita rates are
# computed. Rows whose POPULATION is missing keep NaN rates instead.
dollar_general_by_states_gdf = dollar_general_by_states_gdf.assign(
    DOLLAR_GENERALS=dollar_general_by_states_gdf["DOLLAR_GENERALS"]
    .fillna(0)
    .astype(int)
)

In [48]:
# Stores per N residents, for several values of N. Each output column is
# DOLLAR_GENERALS / (POPULATION / N).
rate_denominators = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
for rate_column, people_per_unit in rate_denominators.items():
    population_units = dollar_general_by_states_gdf["POPULATION"] / people_per_unit
    dollar_general_by_states_gdf[rate_column] = (
        dollar_general_by_states_gdf["DOLLAR_GENERALS"] / population_units
    )

In [30]:
# Discard every row that still has a missing value in any column.
dollar_general_by_states_gdf = dollar_general_by_states_gdf.dropna(
    axis="index", how="any"
)

In [31]:
# Reproject the state table to EPSG:9311 and write it to a GeoPackage.
stores_by_states_output_path = "data/stores_by_states.gpkg"
dollar_general_by_states_gdf = dollar_general_by_states_gdf.to_crs(epsg=9311)
dollar_general_by_states_gdf.to_file(stores_by_states_output_path)