In [1]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [2]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [3]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2023.B04006-2025-03-26T203127.csv")

In [4]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [5]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [6]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [7]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [8]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [9]:
states_series = acs_ethnicity["group"]

In [10]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [11]:
acs_ethnicity["NAME"] = states_series

In [12]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Difference Data

In [23]:
ancestry_tied_to_country = [
    "Carpatho Rusyn",
    "Croatian",
    "Czech",
    "Czechoslovakian",
    "Eastern European",
    "Macedonian",
    "Polish",
    "Russian",
    "Serbian",
    "Slavic",
    "Slovak",
    "Slovene",
    "Soviet Union",
    "Ukrainian",
    "Yugoslavian",
]

In [24]:
df = acs_ethnicity[["NAME", "Total:", *ancestry_tied_to_country]]
df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)


In [25]:
df

52,NAME,Total:,Carpatho Rusyn,Croatian,Czech,Czechoslovakian,Eastern European,Macedonian,Polish,Russian,Serbian,Slavic,Slovak,Slovene,Soviet Union,Ukrainian,Yugoslavian,ancestry_countries
0,Alabama,5054253,7,1639,4742,1167,4243,214,28254,6797,374,840,1920,269,0,3340,974,Polish
1,Alaska,733971,15,781,3243,762,1627,272,12526,6332,646,585,746,252,0,2530,383,Polish
2,Arizona,7268175,108,9758,26194,6617,18251,867,153015,44166,6755,2996,8542,3326,67,16310,6479,Polish
3,Arkansas,3032651,0,665,6329,1500,2993,17,22023,3767,271,309,918,180,0,1639,230,Polish
4,California,39242785,327,45170,72745,18829,116648,3280,439900,332619,18520,12132,23059,10248,547,125459,19383,Polish
5,Colorado,5810774,124,7528,34584,5949,22091,669,137348,51198,2903,3783,8112,6630,136,17349,3522,Polish
6,Connecticut,3598348,189,3989,9020,3764,18740,1214,229189,46040,1331,2125,15735,1339,0,21758,4964,Polish
7,Delaware,1005872,24,528,1888,669,3444,53,39973,6604,234,336,1977,284,44,4986,252,Polish
8,District of Columbia,672079,2,1017,2174,278,6964,137,15045,9080,415,169,1350,392,6,3657,65,Polish
9,Florida,21928881,437,18429,48916,14458,67193,2856,475665,185401,10837,8529,30254,7799,146,67565,13662,Polish


## Merge Data

In [26]:
diff_gdf = states_df[["geometry", "NAME"]].merge(df, on="NAME", how="left")

In [27]:
diff_gdf = diff_gdf.to_crs(9311)

In [28]:
diff_gdf.to_file("data/diff.gpkg")