In [1]:
import json
import os
import random
import time

In [2]:
import cloudscraper
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

In [3]:
from shapely.geometry import Point
from tqdm.notebook import tqdm

## Get State Data

In [4]:
# Shapefile with one row per state/territory; it supplies the STUSPS, NAME and
# geometry columns used throughout the notebook.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

In [5]:
# Keep only the state code, the state name and the boundary geometry.
wanted_state_columns = ["STUSPS", "NAME", "geometry"]
states_df = states_df[wanted_state_columns]

## Get IHOP's Data

In [6]:
# One scraper object, created once and shared by get_coords and by the
# state/city scraping loop below for every HTTP GET.
scraper = cloudscraper.create_scraper()

In [16]:
def get_coords(store_url: str) -> Point:
    """Return the location of a single IHOP store as ``Point(lon, lat)``.

    The store's structured data is first requested from the WordLift JSON
    endpoint. If that request does not answer with HTTP 200 and a non-empty
    JSON body, the store page itself is downloaded and its first
    ``application/ld+json`` script is parsed instead.

    Args:
        store_url: Full URL of the store page. The first four "/"-separated
            segments (for ``https://restaurants.ihop.com/en-us/...`` these are
            the scheme, the empty segment, the host and the locale) are
            dropped and the remainder is appended to the API base URL.

    Returns:
        A shapely ``Point`` built from the ``geo.longitude`` and
        ``geo.latitude`` values of the store data.

    Raises:
        ValueError: The fallback page contains no ``application/ld+json``
            script, or a coordinate cannot be converted to ``float``.
        KeyError: The store data has no ``geo``/``latitude``/``longitude``.
        IndexError: The structured data is an empty list.
    """
    href = "/".join(store_url.split("/")[4:])
    # Build the URL with plain string formatting: os.path.join uses the OS path
    # separator ("\\" on Windows) and is not meant for URLs.
    store_json_url = (
        f"https://api.wordlift.io/data/https/restaurants.ihop.com/en-us/{href}"
    )
    r = scraper.get(store_json_url)

    api_data = None
    if r.status_code == 200:
        try:
            # Parse once and reuse the result below.
            api_data = r.json()
        except ValueError:
            # 200 response whose body is not JSON (the JSON decode errors are
            # ValueError subclasses); fall through to the HTML fallback.
            api_data = None

    if api_data:
        store_records = api_data
    else:
        r = scraper.get(store_url)
        soup = BeautifulSoup(r.text, "html.parser")
        ld_json_script = soup.find("script", {"type": "application/ld+json"})
        if ld_json_script is None:
            # Explicit error instead of "'NoneType' object has no attribute 'text'".
            raise ValueError("no application/ld+json script in store page")
        store_records = json.loads(ld_json_script.text)

    # The data normally arrives as a list whose first entry is the store; a
    # bare object is accepted as well.
    if isinstance(store_records, list):
        store_json = store_records[0]
    else:
        store_json = store_records

    lat = float(store_json["geo"]["latitude"])
    lon = float(store_json["geo"]["longitude"])
    return Point(lon, lat)

In [22]:
url_root = "https://restaurants.ihop.com/en-us/"
# Every point scraped in this run, across states; used only to flag duplicates.
points = []
# Create the per-state output folder up front so the writes at the end of each
# state iteration cannot fail on a missing directory.
os.makedirs("data/states", exist_ok=True)
# Parse all locations in a state
for state_abbr in tqdm(states_df["STUSPS"], desc="Parsing States"):
    state_code = state_abbr.lower()
    state_url = f"{url_root}{state_code}/"
    state_store_list = []

    # If File exists continue (the state was already scraped in an earlier run)
    if os.path.isfile(f"data/states/{state_code}.gpkg"):
        continue

    r_state = scraper.get(state_url)

    if r_state.status_code != 200:
        print(state_code)
        continue

    soup_state = BeautifulSoup(r_state.text, "html.parser")
    location_lis = soup_state.find_all("li", {"class": "map-list-item-wrap is-single"})
    # Skip list items that contain no link instead of crashing on None later.
    city_as = [
        city_link
        for city_link in (location_li.find("a") for location_li in location_lis)
        if city_link is not None
    ]

    for city_a in tqdm(city_as, desc=f"Parsing Locations in {state_code}"):
        city_url = city_a.attrs["href"]
        r_city = scraper.get(city_url)
        if r_city.status_code != 200:
            # Report the failure; otherwise the city would silently count as
            # having no stores.
            print(city_url, r_city.status_code)
            continue
        soup_city = BeautifulSoup(r_city.text, "html.parser")
        store_divs = soup_city.find_all("div", {"class": "map-list-item-header"})
        store_as = [store_div.find("a") for store_div in store_divs]

        for store_a in store_as:
            # Resolve the URL before the try block so the error message below
            # can never show a URL left over from a previous store.
            store_url = store_a.attrs.get("href") if store_a is not None else None
            if not store_url:
                print("Store entry without a link in", city_url)
                continue
            try:
                point = get_coords(store_url)
            except Exception as e:
                # Best effort: report the store and keep going; failed stores
                # are not recorded for this state.
                print(store_url, e)
                continue
            store_dict = {"STUSPS": state_code, "geometry": point}
            state_store_list.append(store_dict)
            # Random pause between successful store requests.
            time.sleep(random.uniform(0.01, 0.5))
            # Duplicates are only reported; the store is still recorded above.
            if point in points:
                print("Duplicate")
                print(store_url)
            points.append(point)

    # States without any store produce no file and are retried on the next run.
    if state_store_list:
        print(len(state_store_list), "stores")
        state_IHOP_gdf = gpd.GeoDataFrame(state_store_list, crs=4326)
        state_IHOP_gdf.to_file(f"data/states/{state_code}.gpkg")

Parsing States:   0%|          | 0/56 [00:00<?, ?it/s]

Parsing Locations in ms:   0%|          | 0/12 [00:00<?, ?it/s]

13 stores


Parsing Locations in nc:   0%|          | 0/40 [00:00<?, ?it/s]

https://restaurants.ihop.com/en-us/nc/rockingham/breakfast-714-w-us-74-hwy-3782 'geo'
51 stores


Parsing Locations in ok:   0%|          | 0/23 [00:00<?, ?it/s]

35 stores


Parsing Locations in va:   0%|          | 0/48 [00:00<?, ?it/s]

67 stores


Parsing Locations in wv:   0%|          | 0/9 [00:00<?, ?it/s]

10 stores


Parsing Locations in la:   0%|          | 0/21 [00:00<?, ?it/s]

29 stores


Parsing Locations in mi:   0%|          | 0/27 [00:00<?, ?it/s]

28 stores


Parsing Locations in ma:   0%|          | 0/21 [00:00<?, ?it/s]

21 stores


Parsing Locations in id:   0%|          | 0/6 [00:00<?, ?it/s]

7 stores


Parsing Locations in fl:   0%|          | 0/96 [00:00<?, ?it/s]

143 stores


Parsing Locations in ne:   0%|          | 0/5 [00:00<?, ?it/s]

7 stores


Parsing Locations in wa:   0%|          | 0/28 [00:00<?, ?it/s]

37 stores


Parsing Locations in nm:   0%|          | 0/12 [00:00<?, ?it/s]

15 stores


Parsing Locations in pr:   0%|          | 0/6 [00:00<?, ?it/s]

6 stores


Parsing Locations in sd:   0%|          | 0/2 [00:00<?, ?it/s]

2 stores


Parsing Locations in tx:   0%|          | 0/124 [00:00<?, ?it/s]

216 stores


Parsing Locations in ca:   0%|          | 0/169 [00:00<?, ?it/s]

223 stores


Parsing Locations in al:   0%|          | 0/13 [00:00<?, ?it/s]

15 stores


Parsing Locations in ga:   0%|          | 0/69 [00:00<?, ?it/s]

85 stores


Parsing Locations in pa:   0%|          | 0/22 [00:00<?, ?it/s]

28 stores


Parsing Locations in mo:   0%|          | 0/29 [00:00<?, ?it/s]

32 stores


Parsing Locations in co:   0%|          | 0/25 [00:00<?, ?it/s]

36 stores


Parsing Locations in ut:   0%|          | 0/18 [00:00<?, ?it/s]

20 stores


Parsing Locations in tn:   0%|          | 0/27 [00:00<?, ?it/s]

35 stores


Parsing Locations in wy:   0%|          | 0/1 [00:00<?, ?it/s]

1 stores


Parsing Locations in ny:   0%|          | 0/50 [00:00<?, ?it/s]

https://restaurants.ihop.com/en-us/ny/brooklyn/breakfast-1019-surf-ave-2132 'NoneType' object has no attribute 'text'
60 stores


Parsing Locations in ks:   0%|          | 0/20 [00:00<?, ?it/s]

30 stores


Parsing Locations in ak:   0%|          | 0/2 [00:00<?, ?it/s]

4 stores


Parsing Locations in nv:   0%|          | 0/5 [00:00<?, ?it/s]

23 stores


Parsing Locations in il:   0%|          | 0/38 [00:00<?, ?it/s]

44 stores


Parsing Locations in vt:   0%|          | 0/51 [00:00<?, ?it/s]

Parsing Locations in mt:   0%|          | 0/4 [00:00<?, ?it/s]

4 stores


Parsing Locations in ia:   0%|          | 0/11 [00:00<?, ?it/s]

11 stores


Parsing Locations in sc:   0%|          | 0/23 [00:00<?, ?it/s]

30 stores


Parsing Locations in nh:   0%|          | 0/4 [00:00<?, ?it/s]

4 stores


Parsing Locations in az:   0%|          | 0/21 [00:00<?, ?it/s]

42 stores


Parsing Locations in dc:   0%|          | 0/1 [00:00<?, ?it/s]

2 stores


Parsing Locations in as:   0%|          | 0/51 [00:00<?, ?it/s]

Parsing Locations in vi:   0%|          | 0/51 [00:00<?, ?it/s]

Parsing Locations in nj:   0%|          | 0/45 [00:00<?, ?it/s]

49 stores


Parsing Locations in md:   0%|          | 0/51 [00:00<?, ?it/s]

54 stores


Parsing Locations in me:   0%|          | 0/5 [00:00<?, ?it/s]

5 stores


Parsing Locations in hi:   0%|          | 0/5 [00:00<?, ?it/s]

7 stores


Parsing Locations in de:   0%|          | 0/8 [00:00<?, ?it/s]

8 stores


Parsing Locations in gu:   0%|          | 0/51 [00:00<?, ?it/s]

Parsing Locations in mp:   0%|          | 0/51 [00:00<?, ?it/s]

Parsing Locations in ri:   0%|          | 0/5 [00:00<?, ?it/s]

5 stores


Parsing Locations in ky:   0%|          | 0/11 [00:00<?, ?it/s]

12 stores


Parsing Locations in oh:   0%|          | 0/32 [00:00<?, ?it/s]

40 stores


Parsing Locations in wi:   0%|          | 0/18 [00:00<?, ?it/s]

18 stores


Parsing Locations in or:   0%|          | 0/13 [00:00<?, ?it/s]

14 stores


Parsing Locations in nd:   0%|          | 0/3 [00:00<?, ?it/s]

3 stores


Parsing Locations in ar:   0%|          | 0/16 [00:00<?, ?it/s]

17 stores


Parsing Locations in in:   0%|          | 0/21 [00:00<?, ?it/s]

26 stores


Parsing Locations in mn:   0%|          | 0/8 [00:00<?, ?it/s]

8 stores


Parsing Locations in ct:   0%|          | 0/11 [00:00<?, ?it/s]

10 stores


In [39]:
# Hand-entered locations for one NC and one NY store. The scraping log reports
# one failed store in each of those states; presumably these are the same two
# stores - confirm the coordinates against the store pages.
manually_added_records = gpd.GeoDataFrame(
    {"STUSPS": ["nc", "ny"]},
    geometry=[
        Point(-79.735968, 34.915083),
        Point(-73.97880764438892, 40.57569919305785),
    ],
    crs=4326,
)

In [40]:
# Collect every per-state GeoPackage written by the scraping loop, followed by
# the manually entered records.
ihop_state_gdfs = []
states_path = "data/states"
# Use states_path (instead of repeating the literal) and sort the listing:
# os.listdir returns names in arbitrary order, which would make the row order
# of the combined table differ between runs/machines.
for file in sorted(os.listdir(states_path)):
    constructed_path = os.path.join(states_path, file)
    if constructed_path.endswith(".gpkg"):
        ihop_state_gdf = gpd.read_file(constructed_path)
        ihop_state_gdfs.append(ihop_state_gdf)
ihop_state_gdfs.append(manually_added_records)

In [41]:
# Stack all per-state frames into one table with a fresh 0..n-1 index.
all_ihop_rows = pd.concat(ihop_state_gdfs, ignore_index=True)
ihop_state_gdf = gpd.GeoDataFrame(all_ihop_rows)

In [42]:
# The scraper stored lower-case state codes; upper-case them so they match the
# STUSPS values of the state table in the later merge.
ihop_state_gdf = ihop_state_gdf.assign(
    STUSPS=ihop_state_gdf["STUSPS"].str.upper()
)

In [43]:
# Reproject the store points from EPSG:4326 to EPSG:9311 and save them.
ihop_state_gdf = ihop_state_gdf.to_crs(9311)
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
ihop_state_gdf.to_file("data/ihops.gpkg")

## Get Population Data

In [44]:
# sheet_name=None loads every sheet of the workbook; the result is a dict of
# DataFrames keyed by sheet name.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [45]:
# Pick the area-name column and one population column from the sheet. The
# labels are the ones pandas derived from the sheet's first row.
# NOTE(review): "Unnamed: 5" is selected by position only - confirm it is the
# estimate year that is actually intended.
state_populations_df = state_populations["NST-EST2024-POP"][
    [
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)",
        "Unnamed: 5",
    ]
]
state_populations_df = state_populations_df.rename(
    columns={
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
# Remove only the leading dots that mark sub-parts. Slicing with [1:] also cut
# the first letter off every name that has no dot, so such rows could never
# match a state NAME in the later merge.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [46]:
# Attach the population to every state row (left join on the state name) and
# keep the columns in a fixed order.
population_columns = ["STUSPS", "NAME", "POPULATION", "geometry"]
states_with_population_df = states_df.merge(
    state_populations_df, how="left", on="NAME"
)[population_columns]

In [47]:
# Drop every row that has a missing value in any column, e.g. areas for which
# the left merge found no population.
states_with_population_df = states_with_population_df.dropna(axis=0, how="any")

## Merge Data

In [48]:
# Number of scraped stores per state code, as a two-column table
# (STUSPS, IHOPs).
ihop_state_counts_df = (
    ihop_state_gdf.groupby("STUSPS").size().reset_index(name="IHOPs")
)

In [50]:
# Left join keeps every state that has a population; states without any
# scraped store get NaN in the IHOPs column.
ihop_state_counts_pop_df = states_with_population_df.merge(
    ihop_state_counts_df, on="STUSPS", how="left"
)
# Treat "no stores found" as a count of 0. Only the IHOPs column is filled: a
# blanket fillna(0) would also write 0 into any other column with missing
# values, including the geometry column.
ihop_state_counts_pop_df = ihop_state_counts_pop_df.fillna({"IHOPs": 0})

In [51]:
# Output column -> number of residents the store count is expressed per.
rate_denominators = {
    "per_1000": 1000,
    "per_10k": 10_000,
    "per_100k": 100000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}
ihop_counts = ihop_state_counts_pop_df["IHOPs"]
state_population = ihop_state_counts_pop_df["POPULATION"]
for rate_column, residents in rate_denominators.items():
    # IHOPs per `residents` people.
    ihop_state_counts_pop_df[rate_column] = ihop_counts / (
        state_population / residents
    )
# Despite its name, this column holds residents per IHOP (population divided by
# the store count), so rows with an IHOPs count of 0 divide by zero here.
ihop_state_counts_pop_df["per_capita"] = state_population / ihop_counts

In [52]:
# Reproject the state polygons to EPSG:9311, the same CRS the store points
# were saved in.
ihop_state_counts_pop_df = ihop_state_counts_pop_df.to_crs(epsg=9311)

In [53]:
# Write the per-state table (state code, name, population, store count, rate
# columns and geometry) to disk.
ihop_state_counts_pop_df.to_file("data/IHOPs_per_state.gpkg")