In [1]:
import requests

In [2]:
import geopandas as gpd
import pandas as pd

## Open County data

In [3]:
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path)

## Get Pacific Island Data

In [4]:
r = requests.get("https://api.census.gov/data/2023/acs/acs5/groups/B02019.json")
columns_obj = r.json()

In [5]:
columns = [
    column
    for column in list(columns_obj["variables"].keys())
    if column.endswith("E") or column.endswith("M")
]
columns.append("GEO_ID")
columns_formatted = ",".join(columns)

In [6]:
response = requests.get(
    f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=county:*"
)

In [7]:
data = response.json()
columns = data[0]
rows = data[1:]
pi_df = pd.DataFrame(rows, columns=columns)

In [8]:
url = "https://api.census.gov/data/2023/acs/acs5/groups/B02019.json"
response = requests.get(url)
variables = response.json()["variables"]

In [9]:
rename_vars = {}
for k, v in variables.items():
    v_split = v["label"].split("!!")
    if v_split[0] == "Estimate":
        label = v_split[-1]
        rename_vars[k] = label

In [10]:
estimate_cols = [col for col in pi_df.columns if col.endswith("E")]
estimate_cols.remove("NAME")
estimate_cols.remove("B02019_001E")

In [11]:
pi_formatted_df = pi_df[["GEO_ID", *estimate_cols]]
pi_formatted_df[estimate_cols] = pi_formatted_df[estimate_cols].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_formatted_df[estimate_cols] = pi_formatted_df[estimate_cols].astype(int)


In [12]:
pi_formatted_df["most_common_ancestry_raw"] = pi_formatted_df[estimate_cols].idxmax(
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_formatted_df['most_common_ancestry_raw'] = pi_formatted_df[estimate_cols].idxmax(axis=1)


In [13]:
def check_margin_error(row) -> str:
    geo_id = row["GEO_ID"]
    ethnicity_col = row["most_common_ancestry_raw"]
    val = row[ethnicity_col]

    if not val:
        return None

    moe_col = ethnicity_col.replace("E", "M")
    moe_val = int(pi_df[pi_df["GEO_ID"] == geo_id][moe_col])

    rmoe_val = abs(moe_val / val)
    if rmoe_val < 0.50:
        return variables[ethnicity_col]["label"].split("!!")[-1]
    else:
        return None

In [14]:
pi_formatted_df["most_common_ancestry"] = pi_formatted_df.apply(
    lambda row: check_margin_error(row), axis=1
)

  moe_val = int(pi_df[pi_df['GEO_ID']==geo_id][moe_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pi_formatted_df['most_common_ancestry'] = pi_formatted_df.apply(lambda row: check_margin_error(row), axis=1)


In [20]:
renamed_vars = {k: v["label"].split("!!")[-1] for k, v in variables.items()}
renamed_vars["Geography"] = "GEOIDFQ"

In [21]:
pi_formatted_df = pi_formatted_df.rename(columns=renamed_vars)

## Merge Data

In [23]:
pi_gdf = counties_gdf.merge(pi_formatted_df, on="GEOIDFQ", how="inner")

In [24]:
pi_gdf = pi_gdf.to_crs(9311)
pi_gdf.to_file("data/Largest_Pacific_Islander_Subgroup_Per_County.gpkg")

In [27]:
pi_gdf.groupby("most_common_ancestry").size().reset_index(name="COUNT").sort_values(
    "COUNT", ascending=False
)

Unnamed: 0,most_common_ancestry,COUNT
4,Native Hawaiian,109
6,Other Native Hawaiian and Other Pacific Island...,53
5,Other Micronesian,17
7,Samoan,14
0,Chamorro,13
3,Marshallese,11
8,Tongan,4
2,Fijian,3
1,Chuukese,2
