In [1]:
import requests

In [2]:
import geopandas as gpd
import pandas as pd

## Open County data

In [3]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
state_gdf = gpd.read_file(file_path)

## Get Ethnic Data

In [4]:
r = requests.get("https://api.census.gov/data/2023/acs/acs5/groups/B04006.json")
columns_obj = r.json()

### Get columns to query and rename for later

In [None]:
balkan_cols = [
    "Albanian",
    "Bulgarian",
    "Croatian",
    "Macedonian",
    # "Greek",
    "Romanian",
    "Serbian",
    "Slovene",
    "Yugoslavian",
]

In [21]:
columns = []
rename_vars = {}
variables = columns_obj["variables"]
for name, variable in list(variables.items()):
    v_split = variable["label"].split("!!")
    if len(v_split) < 3:
        continue

    if v_split[0] == "Estimate":
        label = v_split[-1]
        rename_vars[name] = label

    if (name.endswith("E") or name.endswith("M")) and v_split[-1] in balkan_cols:
        columns.append(name)

In [22]:
columns

['B04006_003M',
 'B04006_003E',
 'B04006_024M',
 'B04006_024E',
 'B04006_029M',
 'B04006_029E',
 'B04006_055M',
 'B04006_055E',
 'B04006_071M',
 'B04006_071E',
 'B04006_063M',
 'B04006_063E',
 'B04006_068M',
 'B04006_068E',
 'B04006_107E',
 'B04006_107M']

In [23]:
columns.append("GEO_ID")
columns_formatted = ",".join(columns)

In [24]:
response = requests.get(
    f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=state:*"
)

In [25]:
data = response.json()
columns = data[0]
rows = data[1:]
df = pd.DataFrame(rows, columns=columns)

In [26]:
estimate_cols = [col for col in df.columns if col.endswith("E")]

In [27]:
formtted_df = df[["GEO_ID", *estimate_cols]]
formtted_df[estimate_cols] = formtted_df[estimate_cols].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df[estimate_cols] = formtted_df[estimate_cols].astype(int)


In [28]:
formtted_df["most_common_ancestry_raw"] = formtted_df[estimate_cols].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df["most_common_ancestry_raw"] = formtted_df[estimate_cols].idxmax(axis=1)


In [29]:
def check_margin_error(row) -> str:
    geo_id = row["GEO_ID"]
    ethnicity_col = row["most_common_ancestry_raw"]
    val = row[ethnicity_col]

    if not val:
        return None

    moe_col = ethnicity_col.replace("E", "M")
    moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])

    rmoe_val = abs(moe_val / val)
    return variables[ethnicity_col]["label"].split("!!")[-1]

In [30]:
formtted_df["most_common_ancestry"] = formtted_df.apply(
    lambda row: check_margin_error(row), axis=1
)

  moe_val = int(df[df["GEO_ID"] == geo_id][moe_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  formtted_df["most_common_ancestry"] = formtted_df.apply(


In [31]:
formtted_df.groupby("most_common_ancestry").size().reset_index(
    name="COUNT"
).sort_values("COUNT", ascending=False)

Unnamed: 0,most_common_ancestry,COUNT
2,Romanian,23
1,Croatian,15
4,Yugoslavian,8
0,Albanian,5
3,Slovene,1


In [33]:
rename_vars["GEO_ID"] = "AFFGEOID"
formtted_df = formtted_df.rename(columns=rename_vars)

## Merge Data

In [34]:
gdf = state_gdf.merge(formtted_df, on="AFFGEOID", how="inner")

In [35]:
gdf = gdf.to_crs(9311)
gdf.to_file("data/Balkan_Ancestry_Per_State.gpkg")