In [None]:
import httpx
import os
import random
import time

In [46]:
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [47]:
# State boundary shapefile read from the local data directory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [48]:
# Keep only the state code, the state name and the geometry.
state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[state_columns]

## Get Population Data

In [49]:
# sheet_name=None loads every worksheet, so the result is a dict of
# DataFrames keyed by sheet name.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [50]:
# The first column of the sheet is labelled with the sheet's title text, and the
# remaining columns come through as "Unnamed: N".
# NOTE(review): "Unnamed: 5" is picked by position; confirm that it is the
# estimate year this analysis is meant to use.
name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
population_column = "Unnamed: 5"

state_populations_df = state_populations["NST-EST2024-POP"][
    [name_column, population_column]
].rename(columns={name_column: "NAME", population_column: "POPULATION"})

# Per the column title, leading dots mark sub-part rows. Strip only those dots:
# slicing off the first character unconditionally (`.str[1:]`) also removed the
# first letter of every name that has no leading dot, so those rows could never
# match a state NAME in a later merge.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [51]:
# Attach the population figure to each state by matching on NAME. The left
# join keeps every state row even when no population row matches.
output_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    right=state_populations_df, how="left", on="NAME"
)[output_columns]

## Get Food Lion Data

In [52]:
def get_coords(url: str) -> Point:
    """Download a store page and return the location it advertises.

    The page is expected to contain a ``<meta name="geo.position">`` tag whose
    ``content`` holds two ``;``-separated numbers. The second number becomes the
    point's x coordinate and the first its y coordinate (presumably the tag is
    ``"latitude;longitude"``, giving ``Point(lon, lat)`` — confirm against a
    live page).

    Args:
        url: Address of the store page to fetch.

    Returns:
        A shapely ``Point`` built from the tag's two values.

    Raises:
        httpx.HTTPStatusError: If the response status is not 2xx.
        ValueError: If the tag is missing, has no ``content``, has fewer than
            two values, or the values are not valid floats.
    """
    r = httpx.get(url)
    # Surface blocked / missing pages as an HTTP error instead of letting the
    # error page fall through to a confusing "NoneType has no attribute" below.
    r.raise_for_status()

    soup = BeautifulSoup(r.text, "html.parser")
    meta = soup.find("meta", {"name": "geo.position"})
    if meta is None or "content" not in meta.attrs:
        raise ValueError(f"no geo.position meta tag with content found at {url}")

    coords = meta.attrs["content"].split(";")
    if len(coords) < 2:
        raise ValueError(
            f"unexpected geo.position content {meta.attrs['content']!r} at {url}"
        )
    # Point takes (x, y), so the second value is passed first.
    return Point(float(coords[1]), float(coords[0]))

In [53]:
# Root of the store directory; the state and city hrefs scraped later are
# appended to this prefix.
url = "https://stores.foodlion.com/"
r = httpx.get(url)
# Stop here on a non-2xx response: an error page would otherwise parse to zero
# state links and the scrape would silently collect nothing.
r.raise_for_status()

In [54]:
# Parse the directory root page with the built-in HTML parser.
soup = BeautifulSoup(markup=r.text, features="html.parser")

In [56]:
# Scrape every store location, one state at a time, and cache each state's
# result as data/states/<state>.gpkg so that re-running the cell only fetches
# states that are still missing.
#
# NOTE: a store whose page fails to download or parse is reported and skipped,
# and the state file is still written — delete that file to retry the state.

# The cache directory must exist before the first to_file() call.
os.makedirs("data/states", exist_ok=True)

state_as = soup.find_all("a", {"class": "Directory-listLink"})
for state_a in tqdm(state_as, desc="Parsing States"):
    # The href of a state link doubles as the state code and the cache file stem.
    state_code = state_a.attrs["href"]
    state_url = url + state_code
    state_file = f"data/states/{state_code}.gpkg"

    # Already scraped on a previous run.
    if os.path.isfile(state_file):
        continue

    state_stores = []

    # The state page lists one link per city.
    state_r = httpx.get(state_url)
    state_soup = BeautifulSoup(state_r.text, "html.parser")
    city_as = state_soup.find_all("a", {"class": "Directory-listLink"})

    for city_a in tqdm(city_as, desc=f"Parsing Locations in {state_code}"):
        city_href = city_a.attrs["href"]
        city_url = url + city_href

        if len(city_href.split("/")) == 2:
            # Two path segments: the link leads to a city listing page with
            # several stores, each linked from a "visit page" teaser.
            r_stores = httpx.get(city_url)
            stores_soup = BeautifulSoup(r_stores.text, "html.parser")
            stores_div = stores_soup.find_all(
                "div", {"class": "Teaser-link Teaser-visitpage"}
            )
            if not stores_div:
                # Make a blocked or changed listing page visible instead of
                # silently contributing zero stores.
                print(city_url, "no store links found")
            store_urls = [
                store_div.find("a").attrs["href"].replace("../", url)
                for store_div in stores_div
            ]
        else:
            # Otherwise the link already points at the single store's page.
            store_urls = [city_url]

        for store_url in store_urls:
            try:
                point = get_coords(store_url)
            except Exception as e:
                # Best effort: report the failing page and move on.
                print(store_url, str(e))
                continue
            state_stores.append(
                {
                    "STUSPS": state_code.upper(),
                    "geometry": point,
                    # Record the store's own page. The multi-store branch used
                    # to store the city listing URL here, so every store of a
                    # city shared the same "url" value.
                    "url": store_url,
                }
            )
            # Small random pause between store requests.
            time.sleep(random.uniform(0.01, 0.5))

    if not state_stores:
        # An empty list has no geometry column to build a GeoDataFrame from,
        # and caching an empty state would hide the failure on later runs.
        print(f"No stores collected for {state_code}; not writing {state_file}")
        continue

    state_stores_gdf = gpd.GeoDataFrame(state_stores, crs=4326)
    state_stores_gdf.to_file(state_file)

Parsing States:   0%|          | 0/10 [00:00<?, ?it/s]

Parsing Locations in ga:   0%|          | 0/38 [00:00<?, ?it/s]

Parsing Locations in ky:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in md:   0%|          | 0/45 [00:00<?, ?it/s]

Parsing Locations in nc:   0%|          | 0/244 [00:00<?, ?it/s]

Parsing Locations in pa:   0%|          | 0/3 [00:00<?, ?it/s]

Parsing Locations in sc:   0%|          | 0/87 [00:00<?, ?it/s]

Parsing Locations in tn:   0%|          | 0/24 [00:00<?, ?it/s]

Parsing Locations in va:   0%|          | 0/143 [00:00<?, ?it/s]

Parsing Locations in wv:   0%|          | 0/14 [00:00<?, ?it/s]

In [57]:
# Load every cached per-state GeoPackage back into memory.
stores_gdfs = []
states_path = "data/states"
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# frames (and of the rows once they are concatenated) reproducible. The
# directory comes from states_path rather than a second hardcoded literal.
for file in sorted(os.listdir(states_path)):
    constructed_path = os.path.join(states_path, file)
    if not constructed_path.endswith(".gpkg"):
        continue
    stores_gdf = gpd.read_file(constructed_path)
    stores_gdfs.append(stores_gdf)

In [58]:
# Stack the per-state frames into one table with a fresh 0..n-1 index, then
# wrap the result as a GeoDataFrame.
all_stores = pd.concat(stores_gdfs, ignore_index=True)
store_gdf = gpd.GeoDataFrame(all_stores)

In [59]:
# Reproject the combined store points to EPSG:9311 and write them to disk.
store_gdf = store_gdf.to_crs(epsg=9311)
store_gdf.to_file("data/stores.gpkg")

In [60]:
# Number of store rows per state code, largest first.
stores_per_state = store_gdf.groupby("STUSPS").size()
state_counts_df = stores_per_state.reset_index(name="COUNT")
state_counts_df = state_counts_df.sort_values(by="COUNT", ascending=False)

## Combine With States

In [69]:
# Left join on the state code keeps every state; a state without any store
# rows ends up with NaN in COUNT.
store_count_gdf = states_with_population_df.merge(
    right=state_counts_df,
    how="left",
    on="STUSPS",
)

In [70]:
# Discard every row that has a missing value in any column, e.g. states left
# without a COUNT by the preceding left join.
store_count_gdf = store_count_gdf.dropna(axis=0, how="any")

In [77]:
# Store density: stores per 100,000 and per 1,000,000 residents.
store_totals = store_count_gdf["COUNT"]
residents = store_count_gdf["POPULATION"]

store_count_gdf["per_100k"] = (store_totals / (residents / 100_000)).round(
    decimals=2
)
store_count_gdf["per_1m"] = (store_totals / (residents / 1_000_000)).round(
    decimals=1
)

In [78]:
# Reproject the per-state table to EPSG:9311 and write it to disk.
store_count_gdf = store_count_gdf.to_crs(epsg=9311)
store_count_gdf.to_file("data/food_lions_per_state.gpkg")

In [79]:
# Show the final per-state table (the last expression of a cell is displayed).
store_count_gdf

Unnamed: 0,STUSPS,NAME,POPULATION,geometry,COUNT,per_100k,per_1m
1,NC,North Carolina,10881189.0,"MULTIPOLYGON (((2154808.027 -692253.992, 21554...",532.0,4.89,48.9
3,VA,Virginia,8734685.0,"MULTIPOLYGON (((2098153.247 -490400.387, 20981...",297.0,3.4,34.0
4,WV,West Virginia,1770495.0,"POLYGON ((1506482.53 -601393.056, 1506489.516 ...",15.0,0.85,8.5
18,GA,Georgia,11064432.0,"MULTIPOLYGON (((1774018.899 -1324581.179, 1774...",47.0,0.42,4.2
19,PA,Pennsylvania,13017721.0,"POLYGON ((1619177.315 -262060.235, 1618879.478...",3.0,0.02,0.2
23,TN,Tennessee,7148304.0,"POLYGON ((883547.508 -1059435.53, 883522.954 -...",27.0,0.38,3.8
33,SC,South Carolina,5387830.0,"MULTIPOLYGON (((1898434.953 -1099843.186, 1898...",176.0,3.27,32.7
40,MD,Maryland,6217062.0,"MULTIPOLYGON (((2067172.995 -478871.787, 20670...",53.0,0.85,8.5
43,DE,Delaware,1036423.0,"MULTIPOLYGON (((2060773.554 -301785.14, 206090...",20.0,1.93,19.3
47,KY,Kentucky,4550595.0,"MULTIPOLYGON (((946135.474 -881445.948, 946571...",4.0,0.09,0.9
