In [39]:
import geopandas as gpd
import pandas as pd

## Open County data

In [40]:
# Location of the "cb_2024_us_county_500k" boundary shapefile.
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
# Load the shapefile into a GeoDataFrame.
counties_gdf = gpd.read_file(filename=file_path)

## Get Ethnicity amounts

In [41]:
# ACS B04006 data export. low_memory=False makes pandas read the file in a
# single pass rather than in chunks, so each column gets one inferred dtype.
acs_data_csv = "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Data.csv"
acs_ethnicity = pd.read_csv(acs_data_csv, low_memory=False)

#### Get Column names

In [42]:
# Column metadata shipped next to the data export; the cells below read its
# "Column Name" and "Label" columns.
acs_metadata_csv = (
    "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Column-Metadata.csv"
)
acs_column_names = pd.read_csv(acs_metadata_csv)

In [43]:
# ACS column codes to keep: B04006_001E plus the consecutive run
# B04006_074E through B04006_088E.
wanted_codes = ["B04006_001E", *(f"B04006_{n:03d}E" for n in range(74, 89))]

# .copy() yields an independent frame, so adding the "name" column below does
# not write into a slice of acs_column_names (avoids SettingWithCopyWarning).
columns_df = acs_column_names[
    acs_column_names["Column Name"].isin(wanted_codes)
].copy()

# Labels are "!!"-separated; keep only the last segment as a short name.
columns_df["name"] = columns_df["Label"].str.split("!!").str[-1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  columns_df['name'] = columns_df['Label'].str.split("!!").str[-1]


In [44]:
# Map every full ACS label to its short name, following columns_df row order.
rename_cols = dict(zip(columns_df["Label"], columns_df["name"]))
# Two manual entries: the "Geography" header becomes "GEOIDFQ", and the value
# for "Estimate!!Total:" is set to "TOTAL".
rename_cols.update({"Geography": "GEOIDFQ", "Estimate!!Total:": "TOTAL"})

#### Back to Data

In [45]:
# Keep the geography id plus the selected ACS columns.
selected_codes = ["GEO_ID", *columns_df["Column Name"]]
ethnicity_per_county_df = acs_ethnicity[selected_codes]

# Promote the first data row to the header, then drop it from the data.
# NOTE(review): presumably that row holds the descriptive labels that
# rename_cols is keyed on — confirm against the CSV.
header_row = ethnicity_per_county_df.iloc[0]
ethnicity_per_county_df.columns = header_row
ethnicity_per_county_df = ethnicity_per_county_df[1:].rename(columns=rename_cols)

In [46]:
# Positional slice of the rename targets: skip the first value and the last
# three. NOTE(review): this depends on rename_cols insertion order — presumably
# the first value is "TOTAL" and the tail ends with "GEOIDFQ"; confirm which
# two ancestry labels are excluded along with it.
renamed_labels = [*rename_cols.values()]
ancestry_cols = renamed_labels[1:-3]

In [47]:
# Cast the ancestry columns to integers so they can be summed and compared
# numerically in the following cells.
ancestry_counts = ethnicity_per_county_df[ancestry_cols].astype(int)
ethnicity_per_county_df[ancestry_cols] = ancestry_counts

In [48]:
# Row-wise sum over the selected ancestry columns.
ancestry_sum = ethnicity_per_county_df[ancestry_cols].sum(axis=1)
ethnicity_per_county_df["ancestry_total"] = ancestry_sum

# Share of the row's TOTAL. Despite the column name this is a fraction
# (the thresholds applied later are 0.01, 0.02 and 0.05), not a 0-100 percentage.
total_count = ethnicity_per_county_df["TOTAL"].astype(int)
ethnicity_per_county_df["ancestry_total_percent"] = (
    ethnicity_per_county_df["ancestry_total"] / total_count
)

## Find dominant ancestry and merge data

In [49]:
# Output column -> minimum value of "ancestry_total_percent" a row must reach.
share_thresholds = {
    "ancestry_countries_1_percent": 0.01,
    "ancestry_countries_2_percent": 0.02,
    "ancestry_countries_5_percent": 0.05,
}

for label_col, min_share in share_thresholds.items():
    qualifies = ethnicity_per_county_df["ancestry_total_percent"] >= min_share
    # idxmax(axis=1) gives the name of the ancestry column with the largest
    # count in each qualifying row. Rows below the threshold are absent from
    # the result, so index alignment on assignment leaves them as NaN.
    ethnicity_per_county_df[label_col] = ethnicity_per_county_df.loc[
        qualifies, ancestry_cols
    ].idxmax(axis=1)

In [50]:
# Number of rows that received a label at the 0.01 threshold.
int(ethnicity_per_county_df["ancestry_countries_1_percent"].notna().sum())

115

In [51]:
# Number of rows that received a label at the 0.02 threshold.
int(ethnicity_per_county_df["ancestry_countries_2_percent"].notna().sum())

42

In [52]:
# Number of rows that received a label at the 0.05 threshold.
int(ethnicity_per_county_df["ancestry_countries_5_percent"].notna().sum())

2

In [53]:
# Left join on GEOIDFQ: every row of counties_gdf is kept, and rows with no
# match in ethnicity_per_county_df get NaN in the joined columns.
ethnicity_per_county_gdf = counties_gdf.merge(
    right=ethnicity_per_county_df,
    how="left",
    on="GEOIDFQ",
)

In [54]:
# county_id keeps GEOIDFQ from character position 9 onward; state_id keeps
# positions 9-10 only. NOTE(review): presumably this strips a fixed
# 9-character prefix from the identifier — confirm against the shapefile values.
geoidfq = ethnicity_per_county_gdf["GEOIDFQ"]
ethnicity_per_county_gdf["county_id"] = geoidfq.str.slice(start=9)
ethnicity_per_county_gdf["state_id"] = geoidfq.str.slice(start=9, stop=11)

In [55]:
# Reproject to the CRS identified by code 9311, then write the result to disk.
ethnicity_per_county_gdf = ethnicity_per_county_gdf.to_crs(crs=9311)
ethnicity_per_county_gdf.to_file(filename="data/african_ancestry_per_county.gpkg")

In [57]:
# Rows per dominant-ancestry label at the 0.02 threshold, smallest group first.
# groupby skips rows whose key is NaN, so only labelled rows are counted.
(
    ethnicity_per_county_df.groupby("ancestry_countries_2_percent")
    .size()
    .to_frame(name="count")
    .sort_values("count")
)

Unnamed: 0_level_0,count
ancestry_countries_2_percent,Unnamed: 1_level_1
South African,1
Liberian,2
Sudanese,2
Ghanaian,3
Cape Verdean,4
Nigerian,7
Ethiopian,10
Somali,13
