In [1]:
import geopandas as gpd
import pandas as pd

## Open county boundary data

In [2]:
# Load the county boundaries, leave out STATEFP "09" rows (those are supplied
# by a separate file further down), keep only the columns needed for the join,
# and reproject to EPSG:4326.
file_path = "data/cb_2018_us_county_500k/cb_2018_us_county_500k.shp"
all_counties = gpd.read_file(file_path)
outside_state_09 = all_counties["STATEFP"] != "09"
counties_gdf = all_counties.loc[outside_state_09, ["geometry", "AFFGEOID"]].to_crs(
    4326
)

#### Swap in Connecticut planning regions

In [3]:
# Load the Connecticut planning regions and reproject them to EPSG:4326 so
# they share a CRS with the county boundaries they are concatenated with.
ct_gdf = gpd.read_file("data/CT_Planning_Regions.geojson").to_crs(4326)
# Build the same AFFGEOID key format used by the county shapefile
# ("0500000US" prefix followed by the FIPS GEOID).
ct_gdf["AFFGEOID"] = "0500000US" + ct_gdf["PlanningRegionFIPS_GEOID"]

In [4]:
# Prepend the Connecticut planning regions to the county boundaries.
# Rows already carrying a Connecticut AFFGEOID are dropped first, so running
# this cell more than once does not duplicate the Connecticut geometries.
ct_regions = ct_gdf[["geometry", "AFFGEOID"]]
already_present = counties_gdf["AFFGEOID"].isin(ct_regions["AFFGEOID"])
counties_gdf = pd.concat(
    [ct_regions, counties_gdf[~already_present]], ignore_index=True
)

## Get Ethnicity amounts

In [5]:
# Raw ACS B04006 table; low_memory=False makes pandas read the file in one
# pass instead of inferring dtypes chunk by chunk.
acs_data_path = "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Data.csv"
acs_ethnicity = pd.read_csv(acs_data_path, low_memory=False)

#### Get Column names

In [6]:
# Column metadata that accompanies the ACS data file.
acs_metadata_path = (
    "data/ACSDT5Y2023.B04006_2025-05-02T185520/ACSDT5Y2023.B04006-Column-Metadata.csv"
)
acs_column_names = pd.read_csv(acs_metadata_path)

In [7]:
# Keep only the two ACS column codes of interest, in the order in which they
# appear in the metadata file.
wanted_codes = ["B04006_042E", "B04006_036E"]
is_wanted = acs_column_names["Column Name"].isin(wanted_codes)
ethnicity_cols = list(acs_column_names.loc[is_wanted, "Column Name"])

#### Back to Data

In [8]:
# The first data row holds the descriptive labels for each column code, so
# promote it to the header, drop it from the data, and give the columns short
# names ("Geography" becomes the AFFGEOID join key).
column_aliases = {
    "Estimate!!Total:!!English": "ENGLISH",
    "Estimate!!Total:!!German": "GERMAN",
    "Geography": "AFFGEOID",
}
acs_selected = acs_ethnicity[["GEO_ID", *ethnicity_cols]]
acs_selected.columns = acs_selected.iloc[0]
ethnicity_per_county_df = acs_selected.iloc[1:].rename(columns=column_aliases)

## Merge Data

In [9]:
# Left join: every boundary row is kept, and rows without a matching ACS
# record end up with missing ENGLISH / GERMAN values.
ethnicity_per_county_gdf = counties_gdf.merge(
    right=ethnicity_per_county_df,
    how="left",
    on="AFFGEOID",
)

In [10]:
# Everything after the first 9 characters of AFFGEOID (the "0500000US"-style
# prefix) is used as the county id; its first two characters are the state id.
geoid_suffix = ethnicity_per_county_gdf["AFFGEOID"].str[9:]
ethnicity_per_county_gdf["county_id"] = geoid_suffix
ethnicity_per_county_gdf["state_id"] = geoid_suffix.str[:2]

In [11]:
# Convert the two count columns to integers, treating rows with no ACS match
# as 0. The fill is applied only to the count columns, so geometry and
# identifier columns are never touched by the zero-fill.
for count_col in ("ENGLISH", "GERMAN"):
    ethnicity_per_county_gdf[count_col] = (
        pd.to_numeric(ethnicity_per_county_gdf[count_col]).fillna(0).astype(int)
    )

In [12]:
def create_label(row) -> str:
    """Classify a row by comparing its ENGLISH and GERMAN values.

    Args:
        row: Mapping-like object supporting ``row["ENGLISH"]`` and
            ``row["GERMAN"]``.

    Returns:
        "No Reported Ancestry" when both values are 0, "Same Amount" when
        they are equal and non-zero, "More English" or "More German" when one
        exceeds the other, and "Edge Case" when none of the comparisons holds
        (for example when a value is NaN).
    """
    english, german = row["ENGLISH"], row["GERMAN"]
    if english == german:
        return "No Reported Ancestry" if english == 0 else "Same Amount"
    if english > german:
        return "More English"
    if english < german:
        return "More German"
    # Reached only when the values cannot be ordered.
    return "Edge Case"

In [13]:
# Label every row; create_label is passed directly since it already takes a
# single row argument.
ethnicity_per_county_gdf["label"] = ethnicity_per_county_gdf.apply(
    create_label, axis=1
)

In [14]:
# Sanity check: show any rows that fell through to the "Edge Case" label.
is_edge_case = ethnicity_per_county_gdf["label"] == "Edge Case"
ethnicity_per_county_gdf[is_edge_case]

Unnamed: 0,geometry,AFFGEOID,ENGLISH,GERMAN,county_id,state_id,label


In [15]:
# Reproject to EPSG:9311 and write the labelled layer to disk.
output_path = "data/english_vs_german_county.gpkg"
ethnicity_per_county_gdf = ethnicity_per_county_gdf.to_crs(epsg=9311)
ethnicity_per_county_gdf.to_file(output_path)