In [90]:
import requests
from functools import reduce

In [91]:
import numpy as np
import geopandas as gpd
import pandas as pd

In [92]:
from tqdm.notebook import tqdm

## Open County Data

In [93]:
# Load the county boundary shapefile into a GeoDataFrame.
# The path is relative, so the notebook must be run from the directory that contains `data/`.
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path)

## Get Ancestry Data

In [94]:
# Fetch the variable metadata (names and labels) for table group B04006
# of the 2023 ACS 5-year dataset.
r = requests.get(
    "https://api.census.gov/data/2023/acs/acs5/groups/B04006.json",
    timeout=30,  # never hang the notebook on an unresponsive server
)
# Fail right here on an HTTP error instead of on a confusing JSON/KeyError later.
r.raise_for_status()
columns_obj = r.json()

In [95]:
variables = columns_obj["variables"]

# Map each variable whose label starts with "Estimate" to the last "!!"-separated
# segment of its label (used later to give columns human-readable names).
rename_vars = {
    var_id: meta["label"].split("!!")[-1]
    for var_id, meta in variables.items()
    if meta["label"].split("!!")[0] == "Estimate"
}

# Variables to request: everything except names ending in "EA"/"MA" and GEO_ID,
# which is appended to every request batch separately.
columns = [
    var_id
    for var_id in variables
    if var_id != "GEO_ID" and not var_id.endswith(("EA", "MA"))
]

In [96]:
# Variables requested per call. GEO_ID is added to every batch as the join key,
# so each request carries BATCH_SIZE + 1 variables. The Census Data API caps the
# number of variables per query (50 at the time of writing - confirm if this changes).
BATCH_SIZE = 49

dfs = []
for i in tqdm(range(0, len(columns), BATCH_SIZE), desc="Requesting data in batches"):
    columns_with_geoid = [*columns[i : i + BATCH_SIZE], "GEO_ID"]
    columns_formatted = ",".join(columns_with_geoid)
    url = f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=county:*"
    response = requests.get(url, timeout=60)
    # Stop on an HTTP error rather than turning an error payload into a DataFrame.
    response.raise_for_status()
    data = response.json()
    # The first row of the payload is the header; the remaining rows are records.
    df = pd.DataFrame(data[1:], columns=data[0])

    # Drop the state/county columns when present so they are not repeated in
    # every batch when the batches are merged on GEO_ID.
    df = df.drop(columns=["state", "county"], errors="ignore")
    dfs.append(df)

Requesting data in batches:   0%|          | 0/5 [00:00<?, ?it/s]

In [97]:
# Fold the per-batch frames into one wide frame, joining successive batches on GEO_ID.
df = reduce(
    lambda merged, batch: merged.merge(batch, on="GEO_ID", how="inner"),
    dfs,
)

In [98]:
# Replace missing values with 0 so the estimate columns can be cast to int later.
# NOTE(review): this rebinds `df`, so the pre-fill frame is no longer available after this cell.
df = df.replace(np.nan, 0)

In [118]:
# Keep the columns whose names end in "E"; NAME also ends in "E" and is excluded explicitly.
estimate_cols = df.columns[
    df.columns.str.endswith("E") & (df.columns != "NAME")
].tolist()

In [119]:
# Narrow the frame to the join key plus the estimate columns.
# `estimate_cols` already excludes NAME, so the drop removes nothing; it is kept
# because it returns a new frame that is safe to assign into in later cells.
formtted_df = df.loc[:, ["GEO_ID", *estimate_cols]].drop(
    columns=["NAME"], errors="ignore"
)

In [120]:
# Cast every estimate column to int so the values can be compared numerically.
formtted_df = formtted_df.astype(dict.fromkeys(estimate_cols, int))

In [143]:
# For each county, record the name of the estimate column with the largest value.
# B04006_001E is left out because it is the total used as the denominator later.
# B04006_108E and B04006_109E are also left out - presumably catch-all categories;
# confirm against the variable labels.
formtted_df["largest_ancestry"] = (
    formtted_df[estimate_cols]
    .drop(columns=["B04006_001E", "B04006_108E", "B04006_109E"], errors="ignore")
    .idxmax(axis=1)
)

In [144]:
# Inspect the result: each value is the name of a B04006 column, not a readable label.
formtted_df["largest_ancestry"]

0       B04006_036E
1       B04006_005E
2       B04006_005E
3       B04006_005E
4       B04006_036E
           ...     
3217    B04006_005E
3218    B04006_005E
3219    B04006_005E
3220    B04006_005E
3221    B04006_005E
Name: largest_ancestry, Length: 3222, dtype: object

In [145]:
def check_majority(row, labels=None) -> "str | None":
    """Return the label of a row's largest ancestry if it is an outright majority.

    Parameters
    ----------
    row : mapping-like
        A DataFrame row (as passed by ``DataFrame.apply(axis=1)``) or any
        mapping that provides ``"largest_ancestry"`` (the name of the column
        holding the largest estimate), ``"B04006_001E"`` (used as the total),
        and the column named by ``"largest_ancestry"``.
    labels : dict, optional
        Variable metadata keyed by variable name; each entry needs a
        ``"label"`` string whose segments are separated by ``"!!"``.
        Defaults to the notebook-level ``variables`` dict.

    Returns
    -------
    str or None
        The last ``"!!"`` segment of the variable's label when its value is
        strictly more than 50% of the total, otherwise ``None``.
    """
    if labels is None:
        labels = variables

    largest_ancestry = row["largest_ancestry"]
    total_pop = row["B04006_001E"]
    val = row[largest_ancestry]

    # A zero count can never be a majority, and a non-positive total would make
    # the share undefined (division by zero).
    if not val or total_pop <= 0:
        return None

    if val / total_pop > 0.50:
        return labels[largest_ancestry]["label"].split("!!")[-1]
    return None

In [146]:
# Evaluate every row; rows without a majority ancestry get None.
formtted_df["largest_ancestry_majority"] = formtted_df.apply(check_majority, axis=1)

In [147]:
# Show only the rows where a majority ancestry was found.
formtted_df[formtted_df["largest_ancestry_majority"].notna()]

Unnamed: 0,GEO_ID,B04006_009E,B04006_008E,B04006_007E,B04006_001E,B04006_006E,B04006_005E,B04006_004E,B04006_003E,B04006_002E,...,B04006_105E,B04006_104E,B04006_103E,B04006_109E,B04006_108E,B04006_102E,B04006_101E,B04006_100E,largest_ancestry,largest_ancestry_majority
603,0500000US17013,0,0,0,4406,2,158,0,0,0,...,0,0,0,978,256,0,0,0,B04006_042E,German
804,0500000US19027,0,0,0,20677,0,1092,0,0,0,...,0,0,0,5240,1396,0,0,0,B04006_042E,German
809,0500000US19037,0,0,0,11868,0,770,0,0,0,...,0,0,0,2474,1072,0,0,0,B04006_042E,German
955,0500000US20131,0,0,0,10213,0,845,0,0,0,...,0,0,0,2390,851,0,0,4,B04006_042E,German
972,0500000US20165,0,0,0,2957,1,69,0,0,0,...,0,0,0,452,350,0,0,0,B04006_042E,German
987,0500000US20195,0,0,0,2778,0,63,0,0,0,...,0,0,0,643,176,0,0,0,B04006_042E,German
1323,0500000US27015,0,0,0,25826,23,817,0,0,0,...,7,0,0,4721,1995,0,0,0,B04006_042E,German
1387,0500000US27143,0,0,0,14933,10,402,0,0,0,...,0,0,0,2586,1688,0,0,0,B04006_042E,German
1560,0500000US29151,0,0,0,13379,0,1171,0,0,0,...,0,0,0,2908,1012,0,0,0,B04006_042E,German
1669,0500000US31027,0,0,0,8344,13,413,0,0,0,...,0,0,0,1811,605,0,0,0,B04006_042E,German


In [148]:
# Replace variable codes with readable labels, and rename GEO_ID to GEOIDFQ,
# the key used when merging with the county geometries.
# NOTE(review): after this cell the B04006_* column names no longer exist, so
# the cells above that reference them cannot be re-run without starting over.
rename_vars.update(GEO_ID="GEOIDFQ")
formtted_df = formtted_df.rename(columns=rename_vars)

In [149]:
# Count rows per majority ancestry, most common first.
(
    formtted_df.groupby("largest_ancestry_majority", as_index=False)
    .size()
    .rename(columns={"size": "COUNT"})
    .sort_values(by="COUNT", ascending=False)
)

Unnamed: 0,largest_ancestry_majority,COUNT
1,German,34
0,American,2


## Merge Data

In [150]:
# Attach the ancestry table to the county geometries on the shared GEOIDFQ key.
# The inner join keeps only rows whose key appears in both frames.
# Assumes counties_gdf carries a GEOIDFQ column - confirm against the shapefile.
gdf = counties_gdf.merge(formtted_df, on="GEOIDFQ", how="inner")

In [151]:
# Reproject to EPSG:9311 before export (presumably a US equal-area projection -
# confirm), then write the result next to the input data.
gdf = gdf.to_crs(epsg=9311)
gdf.to_file("data/Largest_White_Ancestry_Majority_Only_Per_County.gpkg")