In [21]:
import geopandas as gpd
import pandas as pd

## Open county data

In [22]:
# County boundary shapefile; every layer attribute and the geometry are read
# into a single GeoDataFrame.
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(filename=file_path)

## Get Ethnicity amounts

In [23]:
# ACS table B04006 data export. low_memory=False makes pandas parse the file
# in one pass so each column gets a single dtype inferred from the whole
# column instead of chunk by chunk.
acs_ethnicity = pd.read_csv(
    filepath_or_buffer=(
        "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Data.csv"
    ),
    low_memory=False,
)

#### Get Column names

In [24]:
# Column metadata shipped alongside the B04006 data export; the "Column Name"
# field is used below to pick the column codes of interest.
acs_column_names = pd.read_csv(
    filepath_or_buffer=(
        "data/ACSDT5Y2023.B04006_2025-05-02T185520/"
        "ACSDT5Y2023.B04006-Column-Metadata.csv"
    )
)

In [25]:
# Column codes to keep from table B04006: codes 007-015 plus 001.
# The result is a plain list of the matching "Column Name" values; its order
# follows the row order of the metadata file, not the order written here.
ethnicity_cols = acs_column_names.loc[
    acs_column_names["Column Name"].isin(
        (
            "B04006_007E",
            "B04006_008E",
            "B04006_009E",
            "B04006_010E",
            "B04006_011E",
            "B04006_012E",
            "B04006_013E",
            "B04006_014E",
            "B04006_015E",
            "B04006_001E",
        )
    ),
    "Column Name",
].tolist()

#### Back to Data

In [26]:
# Maps the verbose ACS label headers to short column names.
# The nine ancestry groups share the "Estimate!!Total:!!Arab:!!" prefix, so
# their entries are generated; the two non-ancestry entries (geography id and
# total population) follow them, keeping the original insertion order.
rename_cols = {
    **{
        f"Estimate!!Total:!!Arab:!!{group}": group
        for group in (
            "Egyptian",
            "Iraqi",
            "Jordanian",
            "Lebanese",
            "Moroccan",
            "Palestinian",
            "Syrian",
            "Arab",
            "Other Arab",
        )
    },
    "Geography": "GEOIDFQ",
    "Estimate!!Total:": "TOTAL",
}

In [27]:
# Keep only the geography id and the selected estimate columns.
# The first row of the export is used as the header (its values become the
# column labels), the remaining rows are the data, and the labels are then
# shortened via rename_cols.
ethnicity_per_county_df = (
    acs_ethnicity[["GEO_ID", *ethnicity_cols]]
    .iloc[1:]
    .set_axis(acs_ethnicity[["GEO_ID", *ethnicity_cols]].iloc[0], axis=1)
    .rename(columns=rename_cols)
)

In [28]:
# Short names of the ancestry-group columns only.
# The geography id and total-population columns are excluded by name rather
# than by position, so the list stays correct if entries in rename_cols are
# reordered or new non-ancestry entries are appended.
ancestry_cols = [
    short_name
    for short_name in rename_cols.values()
    if short_name not in ("GEOIDFQ", "TOTAL")
]

In [29]:
# Convert the ancestry count columns to integers so they can be summed and
# compared numerically; raises if any value cannot be parsed as an int.
ethnicity_per_county_df[ancestry_cols] = ethnicity_per_county_df.loc[
    :, ancestry_cols
].astype(dtype=int)

In [30]:
# Row-wise sum of all ancestry-group counts.
ethnicity_per_county_df["ancestry_total"] = ethnicity_per_county_df.loc[
    :, ancestry_cols
].sum(axis="columns")

# Share of the TOTAL column made up by that sum. Despite the "_percent"
# suffix this is a fraction (no multiplication by 100), which is why the
# thresholds applied later are 0.01 / 0.02 / 0.05.
ethnicity_per_county_df["ancestry_total_percent"] = ethnicity_per_county_df[
    "ancestry_total"
].div(ethnicity_per_county_df["TOTAL"].astype(int))

## Dominant ancestry per county and merge data

In [31]:
# For each threshold, record the ancestry group with the largest count in every
# row whose combined share reaches that threshold. Rows below the threshold
# are absent from the filtered frame and therefore end up as NaN when the
# result is aligned back on the index. idxmax returns the first column label
# on ties, i.e. the earliest entry in ancestry_cols.
for threshold_label, min_share in (("1", 0.01), ("2", 0.02), ("5", 0.05)):
    meets_threshold = ethnicity_per_county_df["ancestry_total_percent"] >= min_share
    ethnicity_per_county_df[f"ancestry_countries_{threshold_label}_percent"] = (
        ethnicity_per_county_df.loc[meets_threshold, ancestry_cols].idxmax(axis=1)
    )

In [45]:
# Number of rows that reached the 0.01 share threshold (non-NaN entries).
int(ethnicity_per_county_df["ancestry_countries_1_percent"].notna().sum())

103

In [46]:
# Number of rows that reached the 0.02 share threshold (non-NaN entries).
int(ethnicity_per_county_df["ancestry_countries_2_percent"].notna().sum())

22

In [47]:
# Number of rows that reached the 0.05 share threshold (non-NaN entries).
int(ethnicity_per_county_df["ancestry_countries_5_percent"].notna().sum())

2

In [50]:
# Show the rows that reached the 0.05 share threshold.
ethnicity_per_county_df.loc[
    ethnicity_per_county_df["ancestry_countries_5_percent"].notna()
]

Unnamed: 0,GEOIDFQ,TOTAL,Egyptian,Iraqi,Jordanian,Lebanese,Moroccan,Palestinian,Syrian,Arab,Other Arab,ancestry_total,ancestry_total_percent,ancestry_countries_1_percent,ancestry_countries_2_percent,ancestry_countries_5_percent
1315,0500000US26163,1773767,1018,11778,2191,44747,1408,3545,3999,17646,28797,115129,0.064906,Lebanese,Lebanese,Lebanese
2612,0500000US48173,1070,0,0,49,0,0,0,11,0,0,60,0.056075,Jordanian,Jordanian,Jordanian


In [32]:
# Attach the ancestry columns to the county geometries, matching on GEOIDFQ.
# A left join keeps every row of counties_gdf; rows without a matching
# GEOIDFQ in the ancestry table get NaN in the ancestry columns.
ethnicity_per_county_gdf = counties_gdf.merge(
    right=ethnicity_per_county_df,
    how="left",
    on="GEOIDFQ",
)

In [33]:
# Derive shorter ids from GEOIDFQ by dropping its first nine characters,
# e.g. "0500000US26163" -> county_id "26163" and state_id "26" (the first two
# characters of what remains).
ethnicity_per_county_gdf["county_id"] = ethnicity_per_county_gdf[
    "GEOIDFQ"
].str.slice(9)
ethnicity_per_county_gdf["state_id"] = ethnicity_per_county_gdf[
    "GEOIDFQ"
].str.slice(9, 11)

In [34]:
# Reproject to EPSG:9311 before exporting.
# NOTE(review): EPSG:9311 is presumably a US National Atlas equal-area CRS;
# confirm it is the intended projection for the map.
ethnicity_per_county_gdf = ethnicity_per_county_gdf.to_crs(epsg=9311)

# Write the merged layer to disk; the output format is taken from the .gpkg
# file extension.
ethnicity_per_county_gdf.to_file("data/arab_ancestry_per_county.gpkg")

In [58]:
# Number of rows per dominant ancestry group at the 0.02 threshold, smallest
# first. groupby drops rows whose key is NaN, so only rows that reached the
# threshold are counted.
(
    ethnicity_per_county_df.groupby("ancestry_countries_2_percent")
    .size()
    .to_frame(name="count")
    .sort_values(by="count")
)

Unnamed: 0_level_0,count
ancestry_countries_2_percent,Unnamed: 1_level_1
Arab,1
Jordanian,1
Moroccan,1
Iraqi,2
Lebanese,2
Syrian,3
Egyptian,5
Other Arab,7
