In [1]:
import requests

In [2]:
import geopandas as gpd
import pandas as pd

## Open State data

In [3]:
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path)

## Get Language Data

In [4]:
r = requests.get("https://api.census.gov/data/2023/acs/acs1/groups/C16001.json")
columns_obj = r.json()

### Get columns to query and rename for later

In [5]:
columns = []
rename_vars = {}
variables = columns_obj["variables"]
for name, variable in list(variables.items()):
    v_split = variable["label"].split("!!")
    if len(v_split) != 3:
        continue

    if v_split[0] == "Estimate":
        label = v_split[-1]
        rename_vars[name] = label

    if name.endswith("E") or name.endswith("M"):
        columns.append(name)

In [6]:
for column in ["C16001_003E", "C16001_003M", "C16001_002E", "C16001_002M"]:
    columns.remove(column)

In [7]:
columns.append("GEO_ID")
columns_formatted = ",".join(columns)

In [8]:
response = requests.get(
    f"https://api.census.gov/data/2023/acs/acs5?get={columns_formatted}&for=county:*"  # &in=state:{state_code}"
)

In [9]:
data = response.json()
columns = data[0]
rows = data[1:]
language_df = pd.DataFrame(rows, columns=columns)

In [10]:
estimate_cols = [
    col for col in language_df.columns if col.endswith("E") or col.endswith("M")
]

In [11]:
language_formtted_df = language_df[["GEO_ID", *estimate_cols]]
language_formtted_df[estimate_cols] = language_formtted_df[estimate_cols].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  language_formtted_df[estimate_cols] = language_formtted_df[estimate_cols].astype(int)


In [12]:
language_formtted_df["most_common_language_raw"] = language_formtted_df[
    estimate_cols
].idxmax(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  language_formtted_df["most_common_language_raw"] = language_formtted_df[


In [13]:
def check_margin_error(row) -> str:
    geo_id = row["GEO_ID"]
    language_col = row["most_common_language_raw"]
    val = row[language_col]

    if not val:
        return None

    moe_col = language_col.replace("E", "M")
    moe_val = int(language_df[language_df["GEO_ID"] == geo_id][moe_col])

    rmoe_val = abs(moe_val / val)
    if rmoe_val < 0.50:
        return variables[language_col]["label"].split("!!")[-1][:-1]
    else:
        return None

In [14]:
language_formtted_df["most_common_language"] = language_formtted_df.apply(
    lambda row: check_margin_error(row), axis=1
)

  moe_val = int(language_df[language_df["GEO_ID"] == geo_id][moe_col])


In [15]:
rename_vars["GEO_ID"] = "AFFGEOID"
language_formtted_df = language_formtted_df.rename(columns=rename_vars)

In [16]:
language_formtted_df.groupby("most_common_language").size().reset_index(
    name="COUNT"
).sort_values("COUNT", ascending=False)

Unnamed: 0,most_common_language,COUNT
6,Other Indo-European languages,362
3,German or other West Germanic languages,240
7,Other and unspecified languages,172
5,Other Asian and Pacific Island languages,142
2,"French, Haitian, or Cajun",84
8,"Russian, Polish, or other Slavic languages",39
1,"Chinese (incl. Mandarin, Cantonese)",39
9,Tagalog (incl. Filipino),33
10,Vietnamese,20
0,Arabic,9


In [17]:
language_formtted_df = language_formtted_df.rename(columns={"AFFGEOID": "GEOIDFQ"})

## Merge Data

In [18]:
language_gdf = counties_gdf.merge(language_formtted_df, on="GEOIDFQ", how="inner")

In [19]:
language_gdf = language_gdf.to_crs(9311)
language_gdf.to_file("data/Language_Spoken_At_Home_Per_County_Not_English_Spanish.gpkg")