In [1]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [2]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [3]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2022.B04006-2024-11-26T060413.csv")

In [4]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [5]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [6]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [7]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [8]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [9]:
states_series = acs_ethnicity["group"]

In [10]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [11]:
acs_ethnicity["NAME"] = states_series

In [12]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Scotch-Irish Data

In [13]:
si_df = acs_ethnicity[["NAME", "Total:", "Scotch-Irish"]]

In [14]:
si_df["per_1000"] = si_df["Scotch-Irish"] / (si_df["Total:"] / 1000)
si_df["per_10k"] = si_df["Scotch-Irish"] / (si_df["Total:"] / 10_000)
si_df["per_100k"] = (
    (si_df["Scotch-Irish"] / (si_df["Total:"] / 100000)).round(decimals=0).astype(int)
)
si_df["per_500k"] = (
    (si_df["Scotch-Irish"] / (si_df["Total:"] / 500_000)).round(decimals=0).astype(int)
)
si_df["per_1m"] = (
    (si_df["Scotch-Irish"] / (si_df["Total:"] / 1_000_000))
    .round(decimals=0)
    .astype(int)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  si_df["per_1000"] = si_df["Scotch-Irish"] / (si_df["Total:"] / 1000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  si_df["per_10k"] = si_df["Scotch-Irish"] / (si_df["Total:"] / 10_000)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  si_df["per_100k"] = (
A value is trying to be set on a copy of a s

## Merge Data

In [17]:
si_per_state_df = states_df[["geometry", "NAME"]].merge(si_df, on="NAME", how="left")

In [18]:
si_per_state_df = si_per_state_df.to_crs(9311)

In [19]:
si_per_state_df.to_file("data/si.gpkg")