In [1]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [2]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [3]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2023.B04006-2025-03-26T203127.csv")

In [4]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [5]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [6]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [7]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [8]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [9]:
states_series = acs_ethnicity["group"]

In [10]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [11]:
acs_ethnicity["NAME"] = states_series

In [12]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Difference Data

In [13]:
df = acs_ethnicity[["NAME", "Total:", "Czech", "Slovak"]]

In [14]:
def more(row) -> str:
    if row["Czech"] > row["Slovak"]:
        return "More Czech"
    else:
        return "More Slovak"

In [15]:
df["more"] = df.apply(lambda row: more(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["more"] = df.apply(lambda row: more(row), axis=1)


In [16]:
df[df["more"] == "More Slovak"]

52,NAME,Total:,Czech,Slovak,more
6,Connecticut,3598348,9020,15735,More Slovak
7,Delaware,1005872,1888,1977,More Slovak
14,Indiana,6811752,14467,16607,More Slovak
30,New Jersey,9267014,20305,27859,More Slovak
35,Ohio,11780046,49352,103582,More Slovak
38,Pennsylvania,12986518,40647,162511,More Slovak
48,West Virginia,1784462,2671,3739,More Slovak


## Merge Data

In [17]:
diff_gdf = states_df[["geometry", "NAME"]].merge(df, on="NAME", how="left")

In [18]:
diff_gdf = diff_gdf.to_crs(9311)

In [19]:
diff_gdf.to_file("data/diff.gpkg")