In [1]:
import geopandas as gpd
import pandas as pd

## Open County data

In [2]:
# Location of the county boundary shapefile. The directory name suggests the
# Census Bureau 2024 cartographic boundary file at 1:500k — TODO confirm.
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
# Read the shapefile into a GeoDataFrame; it is later joined to the ACS
# table on its GEOIDFQ column.
counties_gdf = gpd.read_file(file_path)

## Get Ethnicity amounts

In [26]:
# Load the ACS data export (file name points at table B04006 — TODO confirm
# the vintage against the download source).
# low_memory=False parses the file in one pass so column dtypes are inferred
# consistently instead of chunk by chunk.
# NOTE: the first row of this frame holds human-readable column labels; a
# later cell promotes it to the header.
acs_ethnicity = pd.read_csv(
    "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Data.csv",
    low_memory=False,
)

#### Get Column names

In [49]:
# Load the column metadata that accompanies the data file. The cells below
# read its "Column Name" and "Label" columns to pick the ancestry columns.
acs_column_names = pd.read_csv(
    "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Column-Metadata.csv"
)

In [50]:
# Ancestry labels to keep. Each entry is compared against the third
# "!!"-separated component of the metadata "Label" column, so the spelling
# must match the labels in the metadata file exactly.
ancestry_tied_to_country = [
    "Carpatho Rusyn",
    "Croatian",
    "Czech",
    "Czechoslovakian",
    "Eastern European",
    "Macedonian",
    "Polish",
    "Russian",
    "Serbian",
    "Slavic",
    "Slovak",
    "Slovene",
    "Soviet Union",
    "Ukrainian",
    "Yugoslavian",
]

In [51]:
# Keep only the metadata rows that describe an *estimate* (not a margin of
# error) for one of the selected ancestries, and record the ancestry name.
#
# Labels look like "<measure>!!Total:!!<ancestry>", so component 0 is the
# measure and component 2 is the ancestry. Split once and reuse the parts.
label_parts = acs_column_names["Label"].str.split("!!")
is_selected_ancestry = label_parts.str[2].isin(ancestry_tied_to_country)
is_estimate = label_parts.str[0] == "Estimate"

# .loc + .assign builds a fresh frame, so adding ETHNICITY never writes into
# a filtered slice of the original (which triggers SettingWithCopyWarning).
# The assigned Series is aligned on the index, so only retained rows are used.
acs_column_names = acs_column_names.loc[is_selected_ancestry & is_estimate].assign(
    ETHNICITY=label_parts.str[2]
)

In [52]:
# Column codes to pull from the data file: every retained ancestry estimate,
# followed by B04006_001E (presumably the table's overall total — confirm
# against the metadata file).
ethnicity_cols = [*acs_column_names["Column Name"], "B04006_001E"]

#### Back to Data

In [55]:
# Maps the descriptive labels (taken from the first row of the data file,
# which is promoted to the header in the next cell) to short column names.
# The ancestry entries all share the "Estimate!!Total:!!" key prefix; the
# last two entries are the geography id and the overall total.
# "Geography" is assumed to be the label-row value of the GEO_ID column and
# is renamed to GEOIDFQ so it can be merged with the county shapefile.
rename_cols = {
    "Estimate!!Total:!!Carpatho Rusyn": "CARPATHO RUSYN",
    "Estimate!!Total:!!Croatian": "CROATIAN",
    "Estimate!!Total:!!Czech": "CZECH",
    "Estimate!!Total:!!Czechoslovakian": "CZECHOSLOVAKIAN",
    "Estimate!!Total:!!Eastern European": "EASTERN EUROPEAN",
    "Estimate!!Total:!!Macedonian": "MACEDONIAN",
    "Estimate!!Total:!!Polish": "POLISH",
    "Estimate!!Total:!!Russian": "RUSSIAN",
    "Estimate!!Total:!!Serbian": "SERBIAN",
    "Estimate!!Total:!!Slavic": "SLAVIC",
    "Estimate!!Total:!!Slovak": "SLOVAK",
    "Estimate!!Total:!!Slovene": "SLOVENE",
    "Estimate!!Total:!!Soviet Union": "SOVIET UNION",
    "Estimate!!Total:!!Ukrainian": "UKRAINIAN",
    "Estimate!!Total:!!Yugoslavian": "YUGOSLAVIAN",
    "Geography": "GEOIDFQ",
    "Estimate!!Total:": "TOTAL",
}

In [56]:
# Pull the geography id plus the selected estimate columns, then replace the
# machine column codes with the descriptive labels stored in the first row.
acs_selected = acs_ethnicity[["GEO_ID", *ethnicity_cols]]
acs_selected.columns = acs_selected.iloc[0]
# Drop the label row now that it serves as the header and apply short names.
ethnicity_per_county_df = acs_selected[1:].rename(columns=rename_cols)

In [57]:
# Short names of the per-ancestry count columns. Select them by their label
# prefix rather than by position in rename_cols, so adding or reordering
# non-ancestry entries (GEOIDFQ, TOTAL, ...) cannot silently change the list.
ancestry_cols = [
    short_name
    for label, short_name in rename_cols.items()
    if label.startswith("Estimate!!Total:!!")
]

In [58]:
# The counts were read as text because the label row shared their columns;
# convert them to integers so they can be summed and compared.
ancestry_counts = ethnicity_per_county_df[ancestry_cols].astype(int)
ethnicity_per_county_df[ancestry_cols] = ancestry_counts

In [59]:
# Row-wise sum of all selected ancestry counts.
ancestry_sum = ethnicity_per_county_df[ancestry_cols].sum(axis=1)
# TOTAL is still text at this point, so cast it for the division only.
total_estimate = ethnicity_per_county_df["TOTAL"].astype(int)

ethnicity_per_county_df["ancestry_total"] = ancestry_sum
# Despite the column name this is a fraction in [0, 1], not a percentage;
# the thresholds applied later (0.01 ... 0.05) rely on that.
ethnicity_per_county_df["ancestry_total_percent"] = ancestry_sum / total_estimate

## Merge Data

In [72]:
# For each threshold N in 1..5 (percent), add a column
# "ancestry_countries_N_percent" holding the name of the largest ancestry
# column for rows whose combined ancestry share is at least N%. Rows below
# the threshold are absent from the idxmax result and therefore get NaN
# when the result is aligned back on the index.
for threshold_pct in range(1, 6):
    meets_threshold = (
        ethnicity_per_county_df["ancestry_total_percent"] >= threshold_pct / 100
    )
    ethnicity_per_county_df[f"ancestry_countries_{threshold_pct}_percent"] = (
        ethnicity_per_county_df.loc[meets_threshold, ancestry_cols].idxmax(axis=1)
    )

In [73]:
# Attach the ancestry table to the county geometries. The left join keeps
# every row of counties_gdf; rows without a matching GEOIDFQ in the ancestry
# table end up with NaN in all ancestry columns.
ethnicity_per_county_gdf = counties_gdf.merge(
    ethnicity_per_county_df, on="GEOIDFQ", how="left"
)

In [74]:
# Derive shorter ids from GEOIDFQ by skipping its first 9 characters
# (presumably the "0500000US" summary-level prefix — confirm): everything
# after the prefix is the county id, and its first two characters the state id.
geoid_fq = ethnicity_per_county_gdf["GEOIDFQ"]
ethnicity_per_county_gdf["county_id"] = geoid_fq.str.slice(9)
ethnicity_per_county_gdf["state_id"] = geoid_fq.str.slice(9, 11)

In [75]:
# Reproject to EPSG:9311 before export (believed to be a US National Atlas
# equal-area projection — TODO confirm this is the intended CRS).
ethnicity_per_county_gdf = ethnicity_per_county_gdf.to_crs(9311)
# Write the merged layer to disk; the .gpkg extension selects the output
# format. Re-running this cell rewrites the file at the same path.
ethnicity_per_county_gdf.to_file("data/slavic_ancestry_per_county.gpkg")

In [76]:
# Sanity check: number of rows per dominant ancestry at the 3% threshold.
# Rows with NaN in this column (below the threshold or unmatched in the
# merge) are excluded by groupby.
ethnicity_per_county_gdf.groupby("ancestry_countries_3_percent").size()

ancestry_countries_3_percent
CROATIAN              3
CZECH               186
CZECHOSLOVAKIAN       2
EASTERN EUROPEAN      5
POLISH              983
RUSSIAN              66
SLOVAK                4
UKRAINIAN            12
YUGOSLAVIAN           1
dtype: int64