In [1]:
import os
import random
import json
import time

In [2]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Read the state boundary shapefile into a GeoDataFrame.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [5]:
# Keep only the state code (STUSPS), the full state name and the geometry;
# these are the only state columns the rest of the notebook reads.
states_df = states_df[["STUSPS", "NAME", "geometry"]]

## Scrape Station Data

In [6]:
# Single shared scraper object; get_store_point and get_urls both send their
# requests through it.
scraper = cloudscraper.create_scraper()

In [7]:
def get_store_point(store_url: str) -> "Point":
    """Fetch a station page and return the station location as a Point.

    The page is expected to contain a ``div`` with class ``dynamicBanner``
    followed by a ``<script type="application/ld+json">`` element whose JSON
    carries ``geo.latitude`` and ``geo.longitude``.

    Args:
        store_url: URL of a single station page.

    Returns:
        ``Point(longitude, latitude)`` — x is the longitude, y the latitude.

    Raises:
        ValueError: if the banner div or the JSON-LD script is missing, or if
            the script content / coordinates cannot be parsed.
        KeyError: if the JSON has no ``geo`` / ``latitude`` / ``longitude``.
    """
    r = scraper.get(store_url)
    soup = BeautifulSoup(r.text, "html.parser")

    div = soup.find("div", {"class": "dynamicBanner"})
    if div is None:
        # Fail with a readable message instead of "'NoneType' has no attribute".
        raise ValueError(
            f"no 'dynamicBanner' div on page (HTTP {r.status_code}): {store_url}"
        )

    script = div.find_next("script", {"type": "application/ld+json"})
    if script is None:
        raise ValueError(f"no JSON-LD script after the banner: {store_url}")

    store_dict = json.loads(script.text)
    lat = float(store_dict["geo"]["latitude"])
    lng = float(store_dict["geo"]["longitude"])
    return Point(lng, lat)

In [8]:
def get_urls(url: str) -> list:
    """Return the link targets listed on a find-station index page.

    Every ``<li class="mb-2">`` element on the page contributes the ``href``
    of its first ``<a>`` child. List items without an anchor, or whose anchor
    has no ``href``, are skipped rather than raising, so a single malformed
    entry does not abort the caller's loop.

    Args:
        url: URL of the page to fetch.

    Returns:
        List of ``href`` strings in page order; empty if none are found.
    """
    r = scraper.get(url)
    soup = BeautifulSoup(r.text, "html.parser")

    urls = []
    for li in soup.find_all("li", {"class": "mb-2"}):
        anchor = li.find("a")
        if anchor is None:
            continue
        href = anchor.attrs.get("href")
        if href is not None:
            urls.append(href)
    return urls

In [16]:
state_urls = get_urls("https://www.exxon.com/en/find-station/united-states")

# Store pages that could not be parsed; each entry is {"state", "url"}.
store_error_urls = []

# Create the output folder up front: without it the first `to_file` call
# fails only after a whole state has already been scraped.
os.makedirs("data/states", exist_ok=True)

# Parse all locations in a state
for state_url in tqdm(state_urls, desc="Parsing States"):
    state_store_list = []
    # Last URL segment with dashes turned into spaces, e.g. "north carolina".
    state_name = state_url.split("/")[-1].replace("-", " ").lower()
    state_path = f"data/states/{state_name}.gpkg"

    # A state whose file already exists was written by an earlier run; skip it.
    if os.path.isfile(state_path):
        continue

    city_urls = get_urls(state_url)

    if not city_urls:
        print("Error getting city urls", state_name)
        continue

    for city_url in tqdm(city_urls, desc=f"Parsing Locations in {state_name}"):
        store_urls = get_urls(city_url)
        if not store_urls:
            print("Error getting city url", city_url)
            continue

        for store_url in store_urls:
            try:
                point = get_store_point(store_url)
                store_dict = {"geometry": point, "NAME": state_name, "url": store_url}
            except Exception as e:
                # Best effort: remember the failing page and keep going.
                print("Error getting store url", store_url, str(e))
                store_error_urls.append({"state": state_name, "url": store_url})
                continue
            state_store_list.append(store_dict)
            # Random short pause between successful station requests.
            time.sleep(random.uniform(0.01, 0.5))

    # One GeoPackage per state, written only when at least one station parsed.
    if state_store_list:
        store_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        store_gdf.to_file(state_path)

Parsing States:   0%|          | 0/45 [00:00<?, ?it/s]

Parsing Locations in north carolina:   0%|          | 0/223 [00:00<?, ?it/s]

In [62]:
# Second pass over the pages that failed in the main scrape. Each failed URL
# is treated as an index page: its links are fetched and every linked page is
# parsed as a station page.
store_dicts = []
errors_v2 = []  # pages that still fail, as {"state", "url"} dicts
for store_error in tqdm(store_error_urls, desc="Parsing errors"):
    try:
        urls = get_urls(store_error["url"])
    except Exception as e:
        print(e, store_error["url"])
        # Record the failure so it is not lost once the printed output is cleared.
        errors_v2.append(store_error)
        continue
    for url in urls:
        try:
            r = scraper.get(url)
            soup = BeautifulSoup(r.text, "html.parser")
            div = soup.find("div", {"class": "dynamicBanner__title"})
            store_dict = json.loads(
                div.find_next("script", {"type": "application/ld+json"}).text
            )
            # NOTE(review): the value read from "addressCountry" is used as the
            # state code (STUSPS) in the later merge — confirm that the page's
            # JSON-LD really carries the state abbreviation in this field.
            stusps = store_dict["address"]["addressCountry"]
            lat = float(store_dict["geo"]["latitude"])
            lng = float(store_dict["geo"]["longitude"])
            store_dict = {"geometry": Point(lng, lat), "STUSPS": stusps, "url": url}
            store_dicts.append(store_dict)
        except Exception as e:
            # Report and record the page that actually failed (the child URL),
            # not the index page it was linked from.
            print(e, url)
            errors_v2.append({"state": store_error["state"], "url": url})
        time.sleep(random.uniform(0.01, 0.5))

Parsing errors:   0%|          | 0/67 [00:00<?, ?it/s]

Exceeded 30 redirects. https://www.exxon.com/en/find-station/stamford-ct-rosa’sminimart-200329445
Exceeded 30 redirects. https://www.exxon.com/en/find-station/rosharon-tx-savannamarketcafé-200329196
Exceeded 30 redirects. https://www.exxon.com/en/find-station/tyler-tx-triplej’s3chapelhill-200325852


In [69]:
# Hand-entered records for the three stations whose URLs contain non-ASCII
# characters and failed with "Exceeded 30 redirects"; the URLs are stored here
# in percent-encoded form.
# NOTE(review): the source of these coordinates is not recorded in the notebook.
# Re-running this cell appends the same three records again.
for lng, lat, stusps, url in (
    (
        -73.52372981531605,
        41.071676025798986,
        "CT",
        "https://www.exxon.com/en/find-station/stamford-ct-rosa%e2%80%99sminimart-200329445",
    ),
    (
        -95.4534583077106,
        29.50402319032228,
        "TX",
        "https://www.exxon.com/en/find-station/rosharon-tx-savannamarketcaf%c3%a9-200329196",
    ),
    (
        -95.20254297076532,
        32.315977513984244,
        "TX",
        "https://www.exxon.com/en/find-station/tyler-tx-triplej%e2%80%99s3chapelhill-200325852",
    ),
):
    store_dicts.append({"geometry": Point(lng, lat), "STUSPS": stusps, "url": url})

In [93]:
# One row per recovered station, with columns geometry / STUSPS / url.
extra_store_df = pd.DataFrame(store_dicts)

In [94]:
# Attach the full state NAME to each recovered station through its STUSPS code.
# The inner join drops any station whose code matches no row of states_df.
extra_store_df = states_df[["NAME", "STUSPS"]].merge(
    extra_store_df, on="STUSPS", how="inner"
)

In [95]:
# Keep the same columns the per-state records carry (geometry, url, NAME) and
# lower-case the state name to match them. `.assign` builds a new frame rather
# than writing into the column subset, which avoids pandas'
# chained-assignment (SettingWithCopy) warning.
extra_store_df = extra_store_df[["geometry", "url", "NAME"]].assign(
    NAME=lambda frame: frame["NAME"].str.lower()
)

In [96]:
# Load every per-state GeoPackage written by the scrape loop.
store_gdfs = []
states_path = "data/states"
# sorted(): os.listdir returns entries in arbitrary order, which would make
# the row order of the combined output differ between runs and machines.
for file in sorted(os.listdir(states_path)):
    if file.endswith(".gpkg"):
        constructed_path = os.path.join(states_path, file)
        gdf = gpd.read_file(constructed_path)
        store_gdfs.append(gdf)

In [97]:
# Add the stations recovered in the retry pass to the per-state frames.
# Not idempotent: re-running this cell appends the same frame a second time.
store_gdfs.append(extra_store_df)

In [101]:
# Stack all station frames into one and label the coordinates as EPSG:4326,
# the same crs the per-state files were created with.
gdf = pd.concat(store_gdfs, ignore_index=True)
gdf = gdf.set_crs(4326)

In [102]:
# Reproject the stations to EPSG:9311 and save the combined layer.
# (Plain string literal: the original f-string had no placeholders.)
gdf = gdf.to_crs(9311)
gdf.to_file("data/stores.gpkg")

## Get Population Data

In [103]:
# sheet_name=None loads every sheet; the result is indexed by sheet name below.
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [104]:
# The first column is named after the sheet's descriptive header sentence; the
# population figure is taken from the column pandas labelled "Unnamed: 5".
# NOTE(review): presumably "Unnamed: 5" is the latest estimate — confirm
# against the spreadsheet.
area_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"
state_populations_df = state_populations["NST-EST2024-POP"][
    [area_column, "Unnamed: 5"]
].rename(columns={area_column: "NAME", "Unnamed: 5": "POPULATION"})
# Drop the first character of every name (the header says sub-parts carry a
# leading dot).
# NOTE(review): this also clips names that have no leading dot — confirm that
# none of those rows are needed downstream.
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

In [105]:
# Left-join the population figures onto the state geometries by full state
# name, then keep only the columns used downstream.
states_with_population_df = states_df.merge(
    state_populations_df, on="NAME", how="left"
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

In [106]:
# Drop rows with any missing value; after the left merge these include the
# states_df rows that found no population match.
states_with_population_df = states_with_population_df.dropna()

In [107]:
# Lower-case the state name so it matches the lower-cased NAME stored with the
# station records, which is the key used for the count merge.
states_with_population_df["NAME"] = states_with_population_df["NAME"].str.lower()

## Merge Data

In [108]:
# Number of station rows per state name, as a two-column frame (NAME, COUNT).
count_df = gdf.groupby("NAME").size().reset_index(name="COUNT")

In [109]:
# Left merge keeps states that have no scraped stations; their missing COUNT
# becomes 0. Note that fillna(0) is applied to every column of the frame.
count_gdf = states_with_population_df.merge(
    count_df, on="NAME", how="left"
).fillna(0)

In [114]:
# Stations per 100,000 and per 1,000,000 residents, rounded to one decimal.
for rate_column, people_per_unit in (("per_100k", 100000), ("per_1m", 1_000_000)):
    count_gdf[rate_column] = (
        count_gdf["COUNT"] / (count_gdf["POPULATION"] / people_per_unit)
    ).round(decimals=1)
# Cast the counts to int only after the rates have been computed.
count_gdf["COUNT"] = count_gdf["COUNT"].astype(int)

In [115]:
# Reproject to EPSG:9311, the same crs the combined station layer is saved in.
count_gdf = count_gdf.to_crs(9311)

In [116]:
# Save the per-state counts and rates together with the state geometries.
count_gdf.to_file("data/Exxon_Gas_Stations_Per_State.gpkg")

In [117]:
# Rank states by stations per 100k residents, highest first.
count_gdf[["COUNT", "per_100k", "per_1m", "NAME"]].sort_values(
    "per_100k", ascending=False
)

Unnamed: 0,COUNT,per_100k,per_1m,NAME
30,87,13.4,134.1,vermont
31,103,9.1,91.0,montana
4,159,9.0,89.8,west virginia
3,689,7.9,78.9,virginia
48,237,7.7,77.2,arkansas
34,103,7.3,73.5,new hampshire
36,48,7.0,69.8,district of columbia
51,245,6.7,67.3,connecticut
5,298,6.5,65.0,louisiana
15,1950,6.3,63.5,texas
