In [1]:
import requests
from functools import reduce

In [2]:
import numpy as np
import geopandas as gpd
import pandas as pd

In [3]:
from tqdm.notebook import tqdm

## Open State data

In [4]:
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path)

In [5]:
counties_gdf = counties_gdf.rename(columns={"GEOIDFQ": "AFFGEOID"})

## Get Ethnic Data

In [6]:
r = requests.get("https://api.census.gov/data/2023/acs/acs5/groups/B03001.json")
columns_obj = r.json()

In [7]:
columns = []
rename_vars = {}
variables = columns_obj["variables"]
for name, variable in list(variables.items()):
    v_split = variable["label"].split("!!")

    if v_split[0] == "Estimate":
        label = v_split[-1]
        rename_vars[name] = label

    if name.endswith("EA") or name.endswith("MA") or name == "GEO_ID":
        continue
    else:
        columns.append(name)

In [8]:
rename_vars

{'B03001_020E': 'Colombian',
 'B03001_022E': 'Paraguayan',
 'B03001_021E': 'Ecuadorian',
 'B03001_028E': 'Spaniard',
 'B03001_027E': 'Other Hispanic or Latino:',
 'B03001_029E': 'Spanish',
 'B03001_024E': 'Uruguayan',
 'B03001_023E': 'Peruvian',
 'B03001_026E': 'Other South American',
 'B03001_025E': 'Venezuelan',
 'B03001_031E': 'All other Hispanic or Latino',
 'B03001_030E': 'Spanish American',
 'B03001_004E': 'Mexican',
 'B03001_003E': 'Hispanic or Latino:',
 'B03001_006E': 'Cuban',
 'B03001_005E': 'Puerto Rican',
 'B03001_002E': 'Not Hispanic or Latino',
 'B03001_001E': 'Total:',
 'B03001_008E': 'Central American:',
 'B03001_009E': 'Costa Rican',
 'B03001_007E': 'Dominican (Dominican Republic)',
 'B03001_010E': 'Guatemalan',
 'B03001_016E': 'South American:',
 'B03001_015E': 'Other Central American',
 'B03001_018E': 'Bolivian',
 'B03001_017E': 'Argentinean',
 'B03001_012E': 'Nicaraguan',
 'B03001_011E': 'Honduran',
 'B03001_014E': 'Salvadoran',
 'B03001_013E': 'Panamanian',
 'B0300

In [9]:
dfs = []
for i in tqdm(range(0, len(columns), 49), desc="Requesting data in batches"):
    columns_with_geoid = columns[i : i + 49]
    columns_with_geoid.append("GEO_ID")
    columns_formatted = ",".join(columns_with_geoid)
    url = f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=county:*"
    response = requests.get(url)
    data = response.json()
    df = pd.DataFrame(data[1:], columns=data[0])

    df = df.drop(columns=["state", "county"], errors="ignore")
    dfs.append(df)

Requesting data in batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
df = reduce(lambda left, right: pd.merge(left, right, on="GEO_ID", how="inner"), dfs)

In [29]:
df = df.replace(np.nan, 0)

In [30]:
estimate_cols = [col for col in df.columns if col.endswith("E") and col != "NAME"]

In [31]:
formtted_df = df[["GEO_ID", *estimate_cols]]
formtted_df = formtted_df.drop(columns=["NAME"], errors="ignore")

In [32]:
formtted_df[estimate_cols] = formtted_df[estimate_cols].astype(int)

In [33]:
formtted_df["most_common_origin_raw"] = formtted_df[
    [
        col
        for col in estimate_cols
        if col
        not in [
            "B03001_001E",
            "B03001_002E",
            "B03001_003E",
            "B03001_008E",
            "B03001_016E",
            "B03001_027E",
            "B03001_031E",
        ]
    ]
].idxmax(axis=1)

In [34]:
def check_margin_error(row) -> str:
    geo_id = row["GEO_ID"]
    ethnicity_col = row["most_common_origin_raw"]
    val = row[ethnicity_col]

    if not val:
        return None

    moe_col = ethnicity_col.replace("E", "M")
    moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])

    rmoe_val = abs(moe_val / val)
    if rmoe_val < 0.25:
        return variables[ethnicity_col]["label"].split("!!")[-1]
    else:
        return None

In [35]:
formtted_df["most_common_origin"] = formtted_df.apply(
    lambda row: check_margin_error(row), axis=1
)

  moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])


In [36]:
rename_vars["GEO_ID"] = "AFFGEOID"
formtted_df = formtted_df.rename(columns=rename_vars)

In [37]:
formtted_df.groupby("most_common_origin").size().reset_index(name="COUNT").sort_values(
    "COUNT", ascending=False
)

Unnamed: 0,most_common_origin,COUNT
5,Mexican,1729
6,Puerto Rican,214
7,Salvadoran,16
1,Dominican (Dominican Republic),11
0,Cuban,6
4,Honduran,6
3,Guatemalan,4
2,Ecuadorian,2


## Merge Data

In [38]:
ethnicity_gdf = counties_gdf.merge(formtted_df, on="AFFGEOID", how="inner")

In [39]:
ethnicity_gdf = ethnicity_gdf.to_crs(9311)
ethnicity_gdf.to_file("data/Hispanic_Ancestry_Per_County.gpkg")