In [13]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [14]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [15]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2023.B04006-2025-03-26T203127.csv")

In [16]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [17]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [18]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [19]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [20]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [21]:
states_series = acs_ethnicity["group"]

In [22]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [23]:
acs_ethnicity["NAME"] = states_series

In [24]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Difference Data

In [29]:
ancestry_tied_to_country = [
    "Bahamian",
    "Barbadian",
    "Belizean",
    "Bermudan",
    "British West Indian",
    "Dutch West Indian",
    "Haitian",
    "Jamaican",
    "Trinidadian and Tobagonian",
    "U.S. Virgin Islander",
    "West Indian",
    "Other West Indian",
]

In [30]:
df = acs_ethnicity[["NAME", "Total:", *ancestry_tied_to_country]]
df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ancestry_countries"] = df[ancestry_tied_to_country].idxmax(axis=1)


In [31]:
df

52,NAME,Total:,Bahamian,Barbadian,Belizean,Bermudan,British West Indian,Dutch West Indian,Haitian,Jamaican,Trinidadian and Tobagonian,U.S. Virgin Islander,West Indian,Other West Indian,ancestry_countries
0,Alabama,5054253,511,139,157,459,403,159,2605,6031,709,97,2236,20,Jamaican
1,Alaska,733971,61,27,61,0,18,70,472,551,109,0,40,0,Jamaican
2,Arizona,7268175,490,694,917,37,357,192,2607,7283,1861,164,2160,220,Jamaican
3,Arkansas,3032651,86,19,125,93,54,1056,726,1024,486,29,880,172,Dutch West Indian
4,California,39242785,822,2403,21788,321,2454,1337,14287,39575,7012,619,9608,735,Jamaican
5,Colorado,5810774,52,506,468,21,364,186,1555,5347,676,158,1312,23,Jamaican
6,Connecticut,3598348,425,2175,235,76,2892,130,21797,61857,3886,356,4991,193,Jamaican
7,Delaware,1005872,16,583,27,132,405,38,6275,6696,1292,41,1018,0,Jamaican
8,District of Columbia,672079,85,164,181,95,540,0,1795,6534,1921,112,1618,140,Jamaican
9,Florida,21928881,30944,7273,4914,988,17688,2101,508229,312901,39962,8033,36335,1361,Haitian


## Merge Data

In [28]:
diff_gdf = states_df[["geometry", "NAME"]].merge(df, on="NAME", how="left")

In [29]:
diff_gdf = diff_gdf.to_crs(9311)

In [30]:
diff_gdf.to_file("data/diff.gpkg")