In [22]:
import geopandas as gpd
import pandas as pd
import wikipedia as wp

## Open State Data

In [23]:
# Path to the state boundary shapefile, relative to the notebook's working directory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"

# Load the boundaries; later cells merge on this frame's NAME column and keep
# its STUSPS and geometry columns.
states_df = gpd.read_file(filename=file_path)

## Open Population Data

In [24]:
# sheet_name=None loads every worksheet, so `state_populations` is a dict of
# DataFrames keyed by sheet name rather than a single DataFrame.
state_populations = pd.read_excel(
    io="data/NST-EST2024-POP.xlsx",
    engine="openpyxl",
    sheet_name=None,
)

In [25]:
# Raw spreadsheet header -> column name used by the rest of the notebook.
# The sheet is read without an explicit header row, so the first column is
# labelled with the sheet's descriptive note and the value column comes back
# as a positional "Unnamed: N" label.
# NOTE(review): "Unnamed: 5" is positional — confirm it is the estimate year
# this analysis is meant to use.
population_column_names = {
    "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
    "Unnamed: 5": "POPULATION",
}

state_populations_df = (
    state_populations["NST-EST2024-POP"][list(population_column_names)]
    .rename(columns=population_column_names)
    # The header text says leading dots mark sub-part rows. Strip only that
    # "." prefix: slicing off the first character unconditionally would also
    # truncate any name that has no dot, and such a row could then never match
    # in the later merge on NAME.
    .assign(NAME=lambda df: df["NAME"].str.lstrip("."))
)

In [26]:
# Left join: every boundary row from `states_df` is kept and POPULATION is
# attached where NAME matches. Only the columns used downstream are retained.
states_with_population_df = states_df.merge(
    state_populations_df,
    how="left",
    on="NAME",
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get State Legislator Data

In [28]:
# Fetch the rendered HTML of the Wikipedia article. auto_suggest=False turns
# off the library's title suggestion so the given title is looked up as-is.
legislatures_page = wp.page(
    title="List_of_United_States_state_legislatures",
    auto_suggest=False,
)

# Encoded to bytes before being handed to the HTML table parser in the next cell.
html = legislatures_page.html().encode("UTF-8")

In [29]:
# Parse every table found in the page HTML, then pick one by position.
# NOTE(review): index 3 is positional and will point at a different table if
# the page layout changes — confirm against the live page.
page_tables = pd.read_html(html)
legislature_df = page_tables[3]

In [30]:
# Chamber sizes sit under two-level column labels. With errors="coerce", any
# cell that does not parse as a number becomes NaN instead of raising.
size_label = "Size [2]"
lower_sizes = pd.to_numeric(
    legislature_df[("Lower house", size_label)], errors="coerce"
)
upper_sizes = pd.to_numeric(
    legislature_df[("Upper house", size_label)], errors="coerce"
)

# A missing chamber size counts as zero seats so the sum stays an integer.
total_seats = lower_sizes.fillna(0).add(upper_sizes.fillna(0))
legislature_df["LEGISLATORS"] = total_seats.astype(int)

In [31]:
# Keep only the seat total and the state label, renaming the latter to NAME so
# it lines up with the merge key used for the boundary/population frame.
legislature_df = legislature_df[["LEGISLATORS", "State"]].rename(
    columns={"State": "NAME"}
)

In [34]:
# Collapse two-level column labels to single strings: prefer the second level
# and fall back to the first when the second is empty. Labels that are already
# plain strings are kept as-is, so re-running this cell does not reduce a name
# to a single character (indexing the string "NAME" with [1] yields "A").
legislature_df.columns = [
    (label[1] if label[1] else label[0]) if isinstance(label, tuple) else label
    for label in legislature_df.columns
]

## Merge Data

In [43]:
# Right join: one row per `legislature_df` entry. dropna() then discards every
# row with a missing value in any column, which removes legislature rows that
# found no matching NAME as well as rows lacking a POPULATION value.
state_legislators_with_pop_gdf = states_with_population_df.merge(
    legislature_df,
    how="right",
    on="NAME",
).dropna(axis=0, how="any")

In [44]:
# Residents represented by each legislator: population divided by total seats,
# rounded to the nearest whole number and stored as an integer column.
people_per_legislator = (
    state_legislators_with_pop_gdf["POPULATION"]
    / state_legislators_with_pop_gdf["LEGISLATORS"]
)
state_legislators_with_pop_gdf["PEOPLE_PER_LEGISLATOR"] = people_per_legislator.round(
    0
).astype(int)

In [45]:
# Reproject to EPSG:9311 before exporting.
# NOTE(review): presumably chosen as a US-wide projection suited to mapping —
# confirm the intended CRS.
state_legislators_with_pop_gdf = state_legislators_with_pop_gdf.to_crs(epsg=9311)

# Write the merged layer next to the input data.
state_legislators_with_pop_gdf.to_file(filename="data/state_legislator.gpkg")