In [19]:
from io import StringIO

In [20]:
import geopandas as gpd
import pandas as pd

## Open State data

In [21]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Open Population Data

In [22]:
state_populations = pd.read_excel(
    "data/NST-EST2024-POP.xlsx", sheet_name=None, engine="openpyxl"
)

In [23]:
state_populations_df = state_populations["NST-EST2024-POP"][
    [
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)",
        "Unnamed: 5",
    ]
]
state_populations_df = state_populations_df.rename(
    columns={
        "table with row headers in column A and column headers in rows 3 through 4. (leading dots indicate sub-parts)": "NAME",
        "Unnamed: 5": "POPULATION",
    }
)
state_populations_df["NAME"] = state_populations_df["NAME"].str[1:]

In [24]:
states_with_population_df = states_df.merge(state_populations_df, on="NAME", how="left")
states_with_population_df = states_with_population_df[
    ["STUSPS", "NAME", "POPULATION", "geometry"]
]

## Get Rice Data

In [25]:
with open("data/cropan25/crop_p30_t161.csv", encoding="cp1252") as f:
    lines = f.readlines()

In [26]:
data_lines = [line for line in lines if line.startswith('161,"d"')]
data_lines = [
    line
    for line in data_lines
    if not line.split(",")[2].strip('"').strip().startswith('""')
    and line.split(",")[2].strip('"').strip() != ""
]

In [27]:
header = [
    "series_id",
    "type",
    "NAME",
    "yield_2022",
    "yield_2023",
    "yield_2024",
    "prod_2022",
    "prod_2023",
    "prod_2024",
]

In [28]:
csv_data = "\n".join(data_lines)
crop_df = pd.read_csv(StringIO(csv_data), names=header, na_values=["(NA)"])

In [29]:
crop_df.drop(columns=["series_id", "type"], inplace=True)
crop_df["NAME"] = crop_df["NAME"].str.replace(
    r"\s*\d+/\s*", "", regex=True
)  # remove " 1/", " 2/" etc.
crop_df["NAME"] = crop_df["NAME"].str.replace(
    r"[^\w\s-]", "", regex=True
)  # remove stray punctuation/symbols
crop_df["NAME"] = crop_df["NAME"].str.strip()

In [30]:
for col in crop_df.columns[1:]:  # skip 'state'
    crop_df[col] = pd.to_numeric(crop_df[col], errors="coerce")

In [35]:
crop_df = crop_df.iloc[21:27]

In [38]:
crop_df[
    ["yield_2022", "yield_2023", "yield_2024", "prod_2022", "prod_2023", "prod_2024"]
] = crop_df[
    ["yield_2022", "yield_2023", "yield_2024", "prod_2022", "prod_2023", "prod_2024"]
].astype(
    int
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  crop_df[['yield_2022','yield_2023','yield_2024','prod_2022','prod_2023','prod_2024']] = crop_df[['yield_2022','yield_2023','yield_2024','prod_2022','prod_2023','prod_2024']].astype(int)


In [39]:
crop_df

Unnamed: 0,NAME,yield_2022,yield_2023,yield_2024,prod_2022,prod_2023,prod_2024
21,Arkansas,7410,7550,7640,80051,106895,109407
22,California,8770,8540,8530,22103,43827,39588
23,Louisiana,6660,6800,6710,27453,31431,30809
24,Mississippi,7370,7470,7540,6338,8964,11613
25,Missouri,7940,7990,8430,11991,15985,18040
26,Texas,6510,7670,8800,12105,10889,12676


## Merge Data

In [40]:
crop_gdf = states_with_population_df.merge(crop_df, on="NAME", how="left").fillna(0)

In [41]:
crop_gdf = crop_gdf.to_crs(9311)
crop_gdf.to_file("data/rice_per_state.gpkg")