In [74]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [75]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

In [76]:
def build_df(file_name: str, column_name: str) -> pd.DataFrame:
    acs_ethnicity = pd.read_csv(file_name)
    rename_columns = {}
    for column in list(acs_ethnicity.columns):
        str_split = column.split("!!")
        if len(str_split) == 2:
            if str_split[1] == "Estimate":
                rename_columns[column] = str_split[0].strip()
        else:
            continue

    states = list(rename_columns.values())
    states.remove("Peoples township, Boone County, Iowa")
    rename_columns["Label (Grouping)"] = "group"
    states.append("group")
    acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)
    acs_ethnicity = acs_ethnicity[states]
    states.remove("group")
    acs_ethnicity = acs_ethnicity.T.reset_index()
    acs_ethnicity.columns = acs_ethnicity.iloc[52]
    ## Remove the last record as it's the titles, set then remove.
    acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])
    states_series = acs_ethnicity["group"]
    acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
        lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
    )
    acs_ethnicity["NAME"] = states_series
    rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
    rename_columns["Total:"] = "TOTAL"
    acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)
    pg_df = acs_ethnicity[["NAME", "TOTAL", column_name]]
    pg_df["PERCENT"] = (
        (acs_ethnicity[column_name] / acs_ethnicity["TOTAL"]) * 100
    ).round(decimals=3)
    pg_df["per_100k"] = (pg_df[column_name] / (pg_df["TOTAL"] / 100_000)).astype(int)
    return pg_df

## Get 2022 Ethnic Data

In [77]:
pg_2022_df = build_df("data/ACSDT5Y2022.B04006-2025-03-26T204306.csv", "Ukrainian")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pg_df['PERCENT'] = ((acs_ethnicity[column_name] / acs_ethnicity['TOTAL'])*100).round(decimals=3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pg_df["per_100k"] = (pg_df[column_name] / (pg_df['TOTAL'] / 100_000)).astype(int)


In [78]:
pg_2023_df = build_df("data/ACSDT5Y2023.B04006-2025-03-26T203127.csv", "Ukrainian")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pg_df['PERCENT'] = ((acs_ethnicity[column_name] / acs_ethnicity['TOTAL'])*100).round(decimals=3)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pg_df["per_100k"] = (pg_df[column_name] / (pg_df['TOTAL'] / 100_000)).astype(int)


In [79]:
pg_2022_2023_df = pg_2022_df.merge(
    pg_2023_df, on="NAME", how="inner", suffixes=("_2022", "_2023")
)

In [80]:
pg_2022_2023_df["population_change"] = (
    pg_2022_2023_df["Ukrainian_2023"] - pg_2022_2023_df["Ukrainian_2022"]
)
pg_2022_2023_df["per_100k_change"] = (
    pg_2022_2023_df["per_100k_2023"] - pg_2022_2023_df["per_100k_2022"]
)

## Merge Data

In [81]:
pg_per_state_df = states_df[["geometry", "NAME"]].merge(
    pg_2022_2023_df, on="NAME", how="left"
)

In [82]:
pg_per_state_df = pg_per_state_df.to_crs(9311)

In [83]:
pg_per_state_df.to_file("data/pg.gpkg")