In [44]:
import geopandas as gpd
import pandas as pd

## Open County data

In [45]:
# Read the county shapefile at `file_path` and reproject it to EPSG:4326 in a
# single chained expression.
file_path = "data/cb_2024_us_county_500k/cb_2024_us_county_500k.shp"
counties_gdf = gpd.read_file(file_path).to_crs(epsg=4326)

## Get Language Data

In [46]:
# Load the ACS S1601 data export. `low_memory=False` makes pandas parse the
# whole file at once instead of in chunks.
language_df = pd.read_csv(
    filepath_or_buffer="data/ACSST5Y2023.S1601_2025-06-22T151444/ACSST5Y2023.S1601-Data.csv",
    low_memory=False,
)

#### Get Column names

In [47]:
# Load the column-metadata file that accompanies the ACS S1601 data export.
acs_column_names = pd.read_csv(
    filepath_or_buffer="data/ACSST5Y2023.S1601_2025-06-22T151444/ACSST5Y2023.S1601-Column-Metadata.csv"
)

In [48]:
# Mapping from the descriptive ACS column labels to short column names.
# It is applied with `DataFrame.rename(columns=...)` after the label row has
# been promoted to the header, so the keys must match those labels exactly.
col_renames = {
    "Estimate!!Percent!!Population 5 years and over!!Speak only English": "PERCENT_ENGLISH",
    "Estimate!!Percent!!Population 5 years and over!!Speak a language other than English": "PERCENT_MORE_THAN_ENGLISH",
    # "GEOIDFQ" is the key used to merge this table onto `counties_gdf`.
    "Geography": "GEOIDFQ",
}

In [49]:
# Pick the two S1601_C02 column codes out of the metadata table, keeping the
# order in which they appear there.
language_cols = acs_column_names.loc[
    acs_column_names["Column Name"].isin(["S1601_C02_002E", "S1601_C02_003E"]),
    "Column Name",
].tolist()

#### Back to Data

In [50]:
# Narrow the table to the geography id plus the selected estimate columns,
# promote the first data row (which carries the descriptive labels) to the
# header, drop that row from the data, and apply the short column names.
language_df = (
    language_df.loc[:, ["GEO_ID", *language_cols]]
    .pipe(lambda frame: frame.set_axis(frame.iloc[0], axis=1))
    .iloc[1:]
    .rename(columns=col_renames)
)

In [52]:
# The percentage columns were read as text; convert both to floats in one
# call.
language_df = language_df.astype(
    {"PERCENT_ENGLISH": float, "PERCENT_MORE_THAN_ENGLISH": float}
)

## Merge Data

In [53]:
# Attach the language estimates to every county geometry; counties without a
# matching GEOIDFQ keep their row and get NaN in the language columns.
language_gdf = counties_gdf.merge(
    right=language_df,
    how="left",
    on="GEOIDFQ",
)

In [54]:
# Drop counties that received no language estimates from the left merge.
# The null check is limited to the two ACS columns so that a county is not
# discarded merely because an unrelated attribute column happens to be null.
language_gdf = language_gdf.dropna(
    subset=["PERCENT_ENGLISH", "PERCENT_MORE_THAN_ENGLISH"]
)

In [55]:
# Reproject the merged layer to EPSG:9311, then write it out as a GeoPackage.
language_gdf = language_gdf.to_crs(epsg=9311)
language_gdf.to_file("data/language.gpkg")