In [1]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [2]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [3]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2023.B04006-2025-03-26T203127.csv")

In [4]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [5]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [6]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [7]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [8]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [9]:
states_series = acs_ethnicity["group"]

In [10]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [11]:
acs_ethnicity["NAME"] = states_series

In [12]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Difference Data

In [25]:
ancestry_tied_to_country = [
    "Egyptian",
    "Iraqi",
    "Jordanian",
    "Lebanese",
    "Moroccan",
    "Palestinian",
    "Syrian",
    "Arab",
    "Other Arab",
]

In [26]:
df = acs_ethnicity[["NAME", "Total:", *ancestry_tied_to_country]]
df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)


In [27]:
df

52,NAME,Total:,Egyptian,Iraqi,Jordanian,Lebanese,Moroccan,Palestinian,Syrian,Arab,Other Arab,ancestry_countries
0,Alabama,5054253,719,188,136,5514,1443,1386,832,1106,2409,Lebanese
1,Alaska,733971,165,5,0,378,95,13,296,271,102,Lebanese
2,Arizona,7268175,3987,7171,2153,10572,1114,1677,2606,3642,5855,Lebanese
3,Arkansas,3032651,742,163,123,1480,240,341,542,505,1221,Lebanese
4,California,39242785,58473,26919,15011,67091,10940,27967,32023,37891,69813,Other Arab
5,Colorado,5810774,2004,1744,1023,7865,2352,2081,2103,2811,4448,Lebanese
6,Connecticut,3598348,2252,529,506,8930,2658,696,2294,2097,4318,Lebanese
7,Delaware,1005872,923,5,89,797,579,161,165,782,1274,Other Arab
8,District of Columbia,672079,564,468,108,2383,798,466,846,409,1141,Lebanese
9,Florida,21928881,19594,3890,4487,45267,14352,11417,16574,12652,17213,Lebanese


## Merge Data

In [28]:
diff_gdf = states_df[["geometry", "NAME"]].merge(df, on="NAME", how="left")

In [29]:
diff_gdf = diff_gdf.to_crs(9311)

In [30]:
diff_gdf.to_file("data/diff.gpkg")