In [1]:
import geopandas as gpd
import pandas as pd

## Open GIS Data

In [2]:
file_path = "data/cb_2018_us_state_500k/cb_2018_us_state_500k.shp"
states_df = gpd.read_file(file_path)

## Get Ethnic Data

In [3]:
acs_ethnicity = pd.read_csv("data/ACSDT5Y2022.B04006-2024-11-26T060413.csv")

In [4]:
rename_columns = {}
for column in list(acs_ethnicity.columns):
    str_split = column.split("!!")
    if len(str_split) == 2:
        if str_split[1] == "Estimate":
            rename_columns[column] = str_split[0].strip()
    else:
        continue

In [5]:
states = list(rename_columns.values())
states.remove("Peoples township, Boone County, Iowa")

In [6]:
rename_columns["Label (Grouping)"] = "group"
states.append("group")
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

In [7]:
acs_ethnicity = acs_ethnicity[states]
states.remove("group")

#### Remove the last record as it's the titles, set then remove.

In [8]:
acs_ethnicity = acs_ethnicity.T.reset_index()
acs_ethnicity.columns = acs_ethnicity.iloc[52]
acs_ethnicity = acs_ethnicity.drop(acs_ethnicity.index[52])

In [9]:
states_series = acs_ethnicity["group"]

In [10]:
acs_ethnicity = acs_ethnicity.loc[:, acs_ethnicity.columns != "group"].apply(
    lambda col: pd.to_numeric(col.str.replace(",", ""), errors="coerce"), axis=1
)

In [11]:
acs_ethnicity["NAME"] = states_series

In [12]:
rename_columns = {col: col.strip() for col in list(acs_ethnicity.columns)}
acs_ethnicity = acs_ethnicity.rename(columns=rename_columns)

## Get Difference Data

In [31]:
df = acs_ethnicity[["NAME", "Total:", "Czech", "Slovak"]]

In [37]:
acs_ethnicity[acs_ethnicity["Czech"] > acs_ethnicity["Czechoslovakian"]]

52,Total:,Afghan,Albanian,Alsatian,American,Arab:,Egyptian,Iraqi,Jordanian,Lebanese,...,Haitian,Jamaican,Trinidadian and Tobagonian,U.S. Virgin Islander,West Indian,Other West Indian,Yugoslavian,Other groups,Unclassified or not reported,NAME
0,5028092,213,170,0,748204,13718,946,127,142,5199,...,2622,6082,682,223,1999,16,744,1701194,1364081,Alabama
1,734821,43,333,12,25100,905,116,5,0,290,...,355,504,102,0,115,0,296,287226,162298,Alaska
2,7172282,2930,2338,168,278571,37242,3893,6788,2059,9970,...,2828,7410,1306,201,2122,198,6026,2881683,1536729,Arizona
3,3018669,57,111,52,263371,5553,690,221,106,1463,...,737,805,417,43,918,136,259,979004,961058,Arkansas
4,39356104,61122,5268,758,1134635,324963,58380,24399,15752,63675,...,13930,38751,6615,650,9156,669,18769,21628661,7164954,California
5,5770790,1928,1063,281,206114,25914,1969,1997,825,7416,...,1450,4195,691,153,1175,24,3869,1678433,1359522,Colorado
6,3611317,2240,11763,220,130297,25093,2502,440,533,9225,...,18950,60824,3924,333,5114,176,4434,989507,648917,Connecticut
7,993635,237,159,22,48089,3898,574,6,64,799,...,5023,5594,1137,55,1060,0,354,294948,242117,Delaware
8,670587,141,579,63,18427,6041,613,432,73,1934,...,1484,6074,2117,98,1501,48,162,335012,110414,District of Columbia
9,21634529,1756,16814,316,1729810,137361,17814,4086,4686,43481,...,496400,309454,39566,8102,34477,1583,14834,7976728,4409481,Florida


In [32]:
def more(row) -> str:
    if row["Czech"] > row["Slovak"]:
        return "More Czech"
    else:
        return "More Slovak"

In [33]:
df["more"] = df.apply(lambda row: more(row), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["more"] = df.apply(lambda row: more(row), axis=1)


In [34]:
df[df["more"] == "More Slovak"]

52,NAME,Total:,Czech,Slovak,more
6,Connecticut,3611317,9904,16347,More Slovak
7,Delaware,993635,1681,1874,More Slovak
14,Indiana,6784403,14577,17775,More Slovak
30,New Jersey,9249063,20074,28811,More Slovak
35,Ohio,11774683,52918,104647,More Slovak
38,Pennsylvania,12989208,42110,167879,More Slovak
48,West Virginia,1792967,2762,3800,More Slovak


## Merge Data

In [24]:
diff_gdf = states_df[["geometry", "NAME"]].merge(df, on="NAME", how="left")

In [25]:
diff_gdf = diff_gdf.to_crs(9311)

In [26]:
diff_gdf.to_file("data/diff.gpkg")