In [35]:
import re
import requests

In [36]:
from bs4 import BeautifulSoup
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [37]:
# Load the state shapefile (cb_2018_us_state_500k) from the local data directory
# into a GeoDataFrame; its NAME and geometry columns are used by the merges below.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(filename=file_path)

## Get Population Data

In [52]:
# Request headers sent with the population-page fetch: a desktop Chrome
# User-Agent string is supplied instead of the library default.
browser_user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
headers = {"User-Agent": browser_user_agent}

In [53]:
# Fetch the state population page.
# - timeout: without one, requests waits indefinitely on an unresponsive server,
#   which would hang a "Run All"; 30 s is a generous upper bound for one page.
# - raise_for_status: stop here on a 4xx/5xx response instead of letting the
#   parsing cells fail later on an error page.
r = requests.get(
    "https://worldpopulationreview.com/states", headers=headers, timeout=30
)
r.raise_for_status()

In [54]:
# Parse the fetched HTML and collect every <tbody> carrying the CSS classes
# used by the population tables.
soup = BeautifulSoup(r.text, "html.parser")
tbodys = soup.find_all("tbody", {"class": "relative z-10 text-sm"})

# Two tables are consumed: tbodys[0] (below) and tbodys[1] (by the row-parsing
# cell). Fail fast with a readable message if the page layout or class names
# changed, rather than surfacing a bare IndexError.
if len(tbodys) < 2:
    raise ValueError(
        f"Expected at least 2 population tables, found {len(tbodys)}; "
        "the page layout or CSS classes may have changed."
    )

population_table = tbodys[0]

In [55]:
def _parse_population_row(tr, with_rank=False):
    """Convert one population-table row into a record dictionary.

    Parameters
    ----------
    tr : bs4 Tag
        A <tr> element whose first <a> holds the name and whose <td> cells
        hold, in order: rank, 2024 population, 2023 population and the
        percentage change (inside a <span>).
    with_rank : bool, default False
        When True, the raw text of the first cell is stored under "rank".

    Returns
    -------
    dict
        Keys "NAME", optionally "rank", then "pop2024", "pop2023" (ints with
        thousands separators removed) and "change" (float, "%" removed).
    """
    tds = tr.find_all("td")
    # NAME is the join key for the later merges; strip surrounding whitespace
    # so stray padding in the markup cannot silently drop a row in an inner join.
    record = {"NAME": tr.find("a").text.strip()}
    if with_rank:
        record["rank"] = tds[0].text
    record["pop2024"] = int(tds[1].text.replace(",", ""))
    record["pop2023"] = int(tds[2].text.replace(",", ""))
    record["change"] = float(tds[3].find("span").text.replace("%", ""))
    return record


# Rows of the first table keep their rank; rows of the second table do not.
population_list = [
    _parse_population_row(tr, with_rank=True)
    for tr in population_table.find_all("tr")
]
population_list.extend(
    _parse_population_row(tr) for tr in tbodys[1].find_all("tr")
)

In [56]:
# Build the population frame from the scraped records. Only NAME and pop2024
# are selected; any other keys in the records are left out by `columns=`.
population_columns = ["NAME", "pop2024"]
population_df = pd.DataFrame(data=population_list, columns=population_columns)

## Open Federal Worker Data

In [57]:
# Splits a line into "everything before the trailing number" (group 1) and the
# run of digits that ends the line (group 2), separated by whitespace.
pattern = re.compile(pattern=r"(.+?)\s+(\d+)$")

In [58]:
# Map each line's leading text to the integer at its end.
# Lines that do not match `pattern` are skipped; a repeated key keeps the
# value from its last occurrence.
data_dict = {}
with open("data/data.txt", "r") as handle:
    for raw_line in handle:
        parsed = pattern.match(raw_line.strip())
        if parsed is None:
            continue
        label, count_text = parsed.groups()
        data_dict[label] = int(count_text)

In [59]:
# One row per parsed entry: the dictionary key becomes NAME and its integer
# value becomes WORKERS.
worker_records = list(data_dict.items())
fed_workers_df = pd.DataFrame(worker_records, columns=["NAME", "WORKERS"])

## Merge data

In [84]:
# Attach population figures to the state geometries. The inner join keeps only
# rows whose NAME appears in both frames; afterwards only the geometry, the
# name and the 2024 population are retained.
states_with_population = states_df.merge(population_df, how="inner", on="NAME")
state_population_gdf = states_with_population[["geometry", "NAME", "pop2024"]]

In [85]:
# Add the WORKERS column by joining on NAME. As an inner join, rows without a
# matching entry in fed_workers_df are dropped.
state_population_fed_workers_gdf = state_population_gdf.merge(
    right=fed_workers_df,
    how="inner",
    on="NAME",
)

In [86]:
# Federal workers as a percentage of the 2024 population: WORKERS / pop2024 * 100.
state_population_fed_workers_gdf["PERCENT_FEDERAL_WORKERS"] = (
    state_population_fed_workers_gdf["WORKERS"]
    .div(state_population_fed_workers_gdf["pop2024"])
    .mul(100)
)

In [97]:
def _workers_per(gdf, people_per_unit):
    """Return federal workers per `people_per_unit` residents as whole numbers.

    Parameters
    ----------
    gdf : DataFrame-like
        Must provide numeric "WORKERS" and "pop2024" columns.
    people_per_unit : int
        Size of the population unit (for example 1_000 for "per 1,000 people").

    Returns
    -------
    Series
        WORKERS / (pop2024 / people_per_unit), rounded to 0 decimals and cast
        to int.
    """
    rate = gdf["WORKERS"] / (gdf["pop2024"] / people_per_unit)
    return rate.round(decimals=0).astype(int)


# Output column name -> population unit. One table replaces five copy-pasted
# expressions that differed only in the divisor.
PER_CAPITA_UNITS = {
    "per_1000": 1_000,
    "per_10k": 10_000,
    "per_100k": 100_000,
    "per_500k": 500_000,
    "per_1m": 1_000_000,
}

for column_name, people_per_unit in PER_CAPITA_UNITS.items():
    state_population_fed_workers_gdf[column_name] = _workers_per(
        state_population_fed_workers_gdf, people_per_unit
    )

# Inverse ratio: how many residents there are for each federal worker.
state_population_fed_workers_gdf["people_per_federal_worker"] = (
    (
        state_population_fed_workers_gdf["pop2024"]
        / state_population_fed_workers_gdf["WORKERS"]
    )
    .round(decimals=0)
    .astype(int)
)

In [100]:
# Reproject the geometries to EPSG:9311 before export.
# NOTE(review): presumably chosen as a US-wide equal-area projection for mapping — confirm.
state_population_fed_workers_gdf = state_population_fed_workers_gdf.to_crs(
    epsg=9311
)

In [101]:
# Write the merged, reprojected result to disk as the notebook's final output.
output_path = "data/federal_workers.gpkg"
state_population_fed_workers_gdf.to_file(output_path)