In [1]:
from functools import reduce
import requests

In [2]:
import geopandas as gpd
import numpy as np
import pandas as pd

In [3]:
from tqdm.notebook import tqdm

## Open County data

In [4]:
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path)

## Get Ethnic Data

In [5]:
r = requests.get("https://api.census.gov/data/2023/acs/acs5/groups/B05006.json")
columns_obj = r.json()

### Get columns to query and rename for later

In [6]:
columns = []
rename_vars = {}
variables = columns_obj["variables"]
for name, variable in list(variables.items()):
    v_split = variable["label"].split("!!")
    if len(v_split) < 4:
        continue

    if v_split[0] == "Estimate":
        label = v_split[-1]
        rename_vars[name] = label

    if name.endswith("E") or name.endswith("M"):
        columns.append(name)

In [7]:
dfs = []
for i in tqdm(range(0, len(columns), 49), desc="Requesting data in batches"):
    columns_with_geoid = columns[i : i + 49]
    columns_with_geoid.append("GEO_ID")
    columns_formatted = ",".join(columns_with_geoid)
    url = f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=county:*"
    response = requests.get(url)
    data = response.json()
    df = pd.DataFrame(data[1:], columns=data[0])
    dfs.append(df)

Requesting data in batches:   0%|          | 0/8 [00:00<?, ?it/s]

In [10]:
dfs_clean = [
    df.drop(columns=["state", "county"], errors="ignore") if i > 0 else df
    for i, df in enumerate(dfs)
]

In [11]:
df = reduce(
    lambda left, right: pd.merge(left, right, on="GEO_ID", how="inner"), dfs_clean
)

In [12]:
df = df.replace(np.nan, 0)

In [13]:
estimate_cols = [col for col in df.columns if col.endswith("E")]

In [14]:
formtted_df = df[["GEO_ID", *estimate_cols]]
formtted_df[estimate_cols] = formtted_df[estimate_cols].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df[estimate_cols] = formtted_df[estimate_cols].astype(int)


In [15]:
formtted_df

Unnamed: 0,GEO_ID,B05006_084E,B05006_085E,B05006_082E,B05006_083E,B05006_080E,B05006_081E,B05006_088E,B05006_089E,B05006_086E,...,B05006_140E,B05006_170E,B05006_177E,B05006_178E,B05006_175E,B05006_176E,B05006_173E,B05006_174E,B05006_171E,B05006_172E
0,0500000US01001,0,0,0,0,0,0,0,0,0,...,74,0,7,0,0,7,0,0,0,0
1,0500000US01003,0,0,0,10,0,0,0,0,0,...,530,59,564,0,0,564,72,72,0,71
2,0500000US01005,3,0,0,0,0,0,0,0,0,...,7,0,4,13,0,17,0,0,0,0
3,0500000US01007,0,0,0,0,0,0,0,0,0,...,11,0,0,0,0,0,0,0,0,0
4,0500000US01009,0,0,0,0,0,0,0,0,0,...,32,0,41,0,0,41,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217,0500000US72145,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3218,0500000US72147,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3219,0500000US72149,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3220,0500000US72151,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
filtered_estimate_cols = {
    k: v for k, v in rename_vars.items() if k in estimate_cols and (":" not in v)
}

In [31]:
formtted_df["most_common_origin_raw"] = formtted_df[
    list(filtered_estimate_cols.keys())
].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df["most_common_origin_raw"] = formtted_df[


In [32]:
def check_margin_error(row) -> str:
    geo_id = row["GEO_ID"]
    ethnicity_col = row["most_common_origin_raw"]
    val = row[ethnicity_col]

    if not val:
        return None

    moe_col = ethnicity_col.replace("E", "M")
    moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])

    rmoe_val = abs(moe_val / val)
    if rmoe_val < 0.50:
        return variables[ethnicity_col]["label"].split("!!")[-1]
    else:
        return None

In [33]:
formtted_df["most_common_origin"] = formtted_df.apply(
    lambda row: check_margin_error(row), axis=1
)

  moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])
  formtted_df["most_common_origin"] = formtted_df.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df["most_common_origin"] = formtted_df.apply(


In [34]:
formtted_df.groupby("most_common_origin").size().reset_index(name="COUNT").sort_values(
    "COUNT", ascending=False
)

Unnamed: 0,most_common_origin,COUNT
32,Mexico,1164
22,India,83
7,Canada,63
37,Philippines,50
8,"China, excluding Hong Kong and Taiwan",41
14,El Salvador,28
12,Dominican Republic,24
18,Guatemala,18
21,Honduras,15
10,Cuba,13


In [35]:
rename_vars["GEO_ID"] = "GEOIDFQ"
formtted_df = formtted_df.rename(columns=rename_vars)

## Merge Data

In [36]:
gdf = counties_gdf.merge(formtted_df, on="GEOIDFQ", how="inner")

In [41]:
gdf[gdf["STUSPS"] == "PA"]["most_common_origin"].unique()

array(['Dominican Republic', None, 'India', 'Ukraine', 'Germany',
       'Poland', 'Bhutan', 'China, excluding Hong Kong and Taiwan',
       'Mexico', 'Russia', 'Guatemala', 'Philippines', 'Italy',
       'St. Lucia'], dtype=object)

In [42]:
gdf.groupby("STUSPS")["most_common_european_origin"].nunique().reset_index(
    name="COUNT"
).sort_values("COUNT", ascending=False)

Unnamed: 0,STUSPS,COUNT
34,NY,14
38,PA,13
9,FL,12
24,MO,10
22,MI,9
46,VA,8
35,OH,8
23,MN,7
20,MD,7
15,IN,7


In [43]:
gdf = gdf.to_crs(9311)
gdf.to_file("data/Citizen_Origin_Country_Per_County.gpkg")