In [1]:
from io import StringIO

In [None]:
import geopandas as gpd
import pandas as pd

## Open State Data

In [3]:
# Location of the state boundary shapefile, relative to the notebook's
# working directory.
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
# Load the shapefile; the result is later merged on its "NAME" column and
# supplies the "STUSPS" and "geometry" columns.
states_df = gpd.read_file(file_path)

## Open Population Data

In [4]:
# Load every sheet of the population workbook. With sheet_name=None the result
# is a mapping of sheet name -> DataFrame, indexed by sheet name further down.
population_workbook = "data/NST-EST2024-POP.xlsx"
state_populations = pd.read_excel(population_workbook, sheet_name=None, engine="openpyxl")

In [5]:
# The sheet's first header cell is a long description, so pandas uses that
# text as the name of the area-name column; the population column only has an
# auto-generated "Unnamed: N" name.
area_name_column = "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)"

# NOTE(review): "Unnamed: 5" is picked purely by position. Confirm it is the
# estimate year meant to be paired with prod_2024 in the per-capita columns.
state_populations_df = state_populations["NST-EST2024-POP"][
    [area_name_column, "Unnamed: 5"]
]
state_populations_df = state_populations_df.rename(
    columns={
        area_name_column: "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
# Only sub-part rows carry a leading dot (see the header text above). Strip
# just the dot instead of blindly dropping the first character, so rows
# without a dot keep their full name and can still match on NAME.
state_populations_df["NAME"] = state_populations_df["NAME"].str.lstrip(".")

In [6]:
# Left-join the population figures onto the state rows by NAME (every row of
# states_df is kept), then keep only the columns used downstream.
states_with_population_df = states_df.merge(
    state_populations_df, on="NAME", how="left"
)[["STUSPS", "NAME", "POPULATION", "geometry"]]

## Get Corn Data

In [7]:
# Read the raw crop report as text (cp1252-encoded), one list entry per line
# with line endings preserved.
with open("data/cropan25/crop_p10_t145.csv", encoding="cp1252") as report_file:
    lines = list(report_file)

In [8]:
# Keep only the rows tagged 145,"d" whose third comma-separated field is
# non-empty once surrounding quotes and whitespace are removed.
# The single-element inner loop just binds the cleaned field to a name so it
# is computed once per line.
data_lines = [
    line
    for line in lines
    if line.startswith('145,"d"')
    for name_field in [line.split(",")[2].strip('"').strip()]
    if not name_field.startswith('""') and name_field != ""
]

In [9]:
# Column names applied to the headerless data rows: three identifying columns
# followed by yield_<year> and prod_<year> for 2022-2024, in that order.
header = ["series_id", "type", "NAME"] + [
    f"{measure}_{year}"
    for measure in ("yield", "prod")
    for year in (2022, 2023, 2024)
]

In [10]:
# The entries in data_lines were read from the file with their line endings
# intact, so concatenate them as-is. Joining with "\n" would insert a blank
# line after every record, which only parses because read_csv skips blank
# lines by default.
csv_data = "".join(data_lines)
# The rows carry no header of their own, so supply the column names; the
# literal "(NA)" marks a missing value.
corn_df = pd.read_csv(StringIO(csv_data), names=header, na_values=["(NA)"])

In [11]:
# The two identifier columns are no longer needed once the rows are parsed.
corn_df = corn_df.drop(columns=["series_id", "type"])
# Normalise NAME so it can be used as a merge key.
corn_df["NAME"] = (
    corn_df["NAME"]
    .str.replace(r"\s*\d+/\s*", "", regex=True)  # remove " 1/", " 2/" etc.
    .str.replace(r"[^\w\s-]", "", regex=True)  # remove stray punctuation/symbols
    .str.strip()
)

In [12]:
# Convert every column after the first one (NAME) to numeric; anything that
# cannot be parsed becomes NaN.
value_columns = corn_df.columns[1:]
corn_df[value_columns] = corn_df[value_columns].apply(pd.to_numeric, errors="coerce")

In [13]:
# Scale each production column by 1000.
# NOTE(review): presumably the source reports production in thousands of
# units; confirm against the report. Re-running this cell scales again.
for col in ("prod_2022", "prod_2023", "prod_2024"):
    corn_df[col] = corn_df[col] * 1000

## Merge Data

In [14]:
# Left-join the corn figures onto the state rows by NAME, keeping every state
# row, then replace all missing values with 0.
# NOTE(review): fillna(0) applies to every column, so a missing POPULATION
# also becomes 0 and feeds the per-capita divisions; confirm this is intended.
corn_gdf = states_with_population_df.merge(corn_df, on="NAME", how="left")
corn_gdf = corn_gdf.fillna(0)

In [15]:
# 2024 production per N residents, for several group sizes N.
for per_column, residents in (
    ("per_100", 100),
    ("per_1000", 1_000),
    ("per_100k", 100_000),
    ("per_1m", 1_000_000),
):
    corn_gdf[per_column] = corn_gdf["prod_2024"] / (corn_gdf["POPULATION"] / residents)

In [16]:
# Reproject to EPSG:9311 and write the result out as a GeoPackage.
# NOTE(review): 9311 is presumably a US equal-area projection chosen for
# mapping; confirm.
output_path = "data/corn_per_state.gpkg"
corn_gdf = corn_gdf.to_crs(9311)
corn_gdf.to_file(output_path)