<a href="https://colab.research.google.com/github/worldterminator/mess/blob/main/hungergame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install censusdata us requests pandas

import os, io, zipfile, requests, pandas as pd
import censusdata as cd
from us import states

# Get one: https://api.census.gov/data/key_signup.html
os.environ['CENSUS_KEY'] = ''
API_KEY = os.environ.get('CENSUS_KEY', None)

YEAR = 2022   # 2018–2022 5-year
DATASET_PROFILE = 'acs/acs5/profile'
DATASET_SUBJECT = 'acs/acs5/subject'
DATASET_DETAILED = 'acs/acs5'


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.9/355.9 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for censusdata (setup.py) ... [?25l[?25hdone


In [37]:
VARS_PROFILE = [
    # Demography
    "DP05_0019PE","DP05_0024PE","DP05_0071PE","DP05_0078PE","DP05_0079PE","DP05_0080PE",
    "DP02_0093PE","DP02_0113PE",
    # Socioeconomic
    "DP03_0119PE","DP03_0062E","DP03_0005PE","DP03_0002PE","DP02_0060PE","DP02_0068PE",
    "DP03_0074PE","DP03_0099PE","DP02_0072PE",
    # Household structure
    "DP02_0013PE","DP02_0011PE",
    # Housing / transport
    "DP04_0046PE","DP04_0047PE","DP04_0003PE","DP04_0078PE","DP04_0089PE","DP04_0090PE",
    "DP04_0134E","DP04_0139PE","DP04_0089E","DP04_0058PE",
    "DP03_0025E","DP03_0011PE","DP03_0012PE","DP03_0013PE","DP03_0016PE","DP03_0017PE","DP03_0024PE",
    "DP02_0154PE"  # broadband subscription (%)
]

In [38]:
VARS_DETAILED = [
    "B01003_001E",   # total population
    "B19083_001E"    # Gini index
]

In [39]:
RENAME_MAP = {
    "B01003_001E":"pop_total",
    "B19083_001E":"gini_index",
    "DP05_0019PE":"pct_under_18",
    "DP05_0024PE":"pct_65plus",
    "DP05_0071PE":"pct_hispanic",
    "DP05_0078PE":"pct_black",
    "DP05_0079PE":"pct_aian",
    "DP05_0080PE":"pct_asian",
    "DP02_0093PE":"pct_foreign_born",
    "DP02_0113PE":"pct_limited_english",
    "DP03_0119PE":"pct_poverty",
    "DP03_0062E":"med_hh_income",
    "DP03_0005PE":"pct_unemployed",
    "DP03_0002PE":"pct_in_labor_force",
    "DP02_0060PE":"pct_less_than_hs",
    "DP02_0068PE":"pct_ba_plus",
    "DP03_0074PE":"pct_snap_households",
    "DP03_0099PE":"pct_uninsured",
    "DP02_0072PE":"pct_disability",
    "DP02_0013PE":"pct_female_head_fam_with_children",
    "DP02_0011PE":"pct_single_parent_hh",
    "DP04_0046PE":"pct_owner_occupied",
    "DP04_0047PE":"pct_renter_occupied",
    "DP04_0003PE":"housing_vacancy_rate",
    "DP04_0078PE":"pct_overcrowded_gt1pproom",
    "DP04_0089PE":"pct_lacking_complete_kitchen",
    "DP04_0090PE":"pct_lacking_complete_plumbing",
    "DP04_0134E":"median_gross_rent",
    "DP04_0139PE":"pct_rent_ge_30pct_income",
    "DP04_0089E":"median_home_value",
    "DP04_0058PE":"pct_zero_vehicles",
    "DP03_0025E":"mean_travel_time_minutes",
    "DP03_0011PE":"pct_commute_drive_alone",
    "DP03_0012PE":"pct_commute_carpool",
    "DP03_0013PE":"pct_commute_public_transit",
    "DP03_0016PE":"pct_commute_walk",
    "DP03_0017PE":"pct_commute_other",
    "DP03_0024PE":"pct_work_from_home",
    "DP02_0154PE":"pct_broadband_sub"
}

In [40]:
frames = []
for st in states.STATES:
    geo = cd.censusgeo([('state', st.fips), ('county','*')])

    df_p = cd.download(DATASET_PROFILE, YEAR, geo, VARS_PROFILE, key=API_KEY)
    df_b = cd.download(DATASET_DETAILED, YEAR, geo, VARS_DETAILED, key=API_KEY)

    df = pd.concat([df_p, df_b], axis=1)

In [41]:
frames = []
for st in states.STATES:
    geo = cd.censusgeo([('state', st.fips), ('county','*')])
    df_p = cd.download(DATASET_PROFILE,  YEAR, geo, VARS_PROFILE,  key=API_KEY)
    df_b = cd.download(DATASET_DETAILED, YEAR, geo, VARS_DETAILED, key=API_KEY)
    df = pd.concat([df_p, df_b], axis=1)

    # get FIPS codes and names
    state_fips = [g.geo[0][1] for g in df.index]
    county_fips = [g.geo[1][1] for g in df.index]
    df = df.assign(
        state_fips=[s.zfill(2) for s in state_fips],
        county_fips=[c.zfill(3) for c in county_fips],
        geoid=[s.zfill(2) + c.zfill(3) for s,c in zip(state_fips, county_fips)],
        name=[g.name for g in df.index]
    ).reset_index(drop=True)

    frames.append(df)   # inside loop

acs_all = pd.concat(frames, ignore_index=True)  # outside loop

# check
print("Total counties:", len(acs_all))          # expect ~3143
print("Unique states:", acs_all['state_fips'].nunique())  # expect 51

Total counties: 3143
Unique states: 50


In [42]:
import pandas as pd

url = "https://github.com/worldterminator/worldterminator/raw/refs/heads/main/2022_Gaz_counties_national.txt"

gaz = pd.read_csv(
    url,
    sep="\t",
    dtype={"GEOID": str},
    low_memory=False
)

print("Gazetteer shape:", gaz.shape)
print("Columns:", list(gaz.columns)[:10])  # preview first 10 column names
print(gaz.head(3))

Gazetteer shape: (3222, 10)
Columns: ['USPS', 'GEOID', 'ANSICODE', 'NAME', 'ALAND', 'AWATER', 'ALAND_SQMI', 'AWATER_SQMI', 'INTPTLAT', 'INTPTLONG                                                                                                               ']
  USPS  GEOID  ANSICODE            NAME       ALAND      AWATER  ALAND_SQMI  \
0   AL  01001    161526  Autauga County  1539631461    25677536     594.455   
1   AL  01003    161527  Baldwin County  4117724893  1132887353    1589.863   
2   AL  01005    161528  Barbour County  2292160151    50523213     885.008   

   AWATER_SQMI   INTPTLAT  \
0        9.914  32.532237   
1      437.410  30.659218   
2       19.507  31.870253   

   INTPTLONG                                                                                                                 
0                                         -86.646440                                                                         
1                                         -87.746067   

In [43]:
# land area (sq mi), then we could have density
if "ALAND_SQMI" in gaz.columns:
    gaz["land_sqmi"] = gaz["ALAND_SQMI"]
else:
    gaz["land_sqmi"] = gaz["ALAND"] / (1609.344**2)

In [44]:
# merge with ACS
acs_all = acs_all.merge(
    gaz[["GEOID","land_sqmi"]],
    left_on="geoid",
    right_on="GEOID",
    how="left"
).drop(columns=["GEOID"])

# calculate density
acs_all["pop_total"] = pd.to_numeric(acs_all["B01003_001E"], errors="coerce")
acs_all["pop_density_per_sqmi"] = acs_all["pop_total"] / acs_all["land_sqmi"]

In [45]:
acs_all = acs_all.rename(columns=RENAME_MAP)

In [46]:
print(acs_all[["geoid","name","land_sqmi","pop_density_per_sqmi"]].head())

   geoid                     name  land_sqmi  pop_density_per_sqmi
0  01001  Autauga County, Alabama    594.455             98.848525
1  01003  Baldwin County, Alabama   1589.863            146.817682
2  01005  Barbour County, Alabama    885.008             28.109350
3  01007     Bibb County, Alabama    622.470             35.746301
4  01009   Blount County, Alabama    644.891             91.607729


In [47]:
print(acs_all.head())

   pct_under_18  pct_65plus  pct_hispanic  pct_black  pct_aian  pct_asian  \
0          23.4        15.6           2.2       96.8      72.6       19.6   
1          21.2        21.2           3.7       95.2      82.3        8.3   
2          20.7        19.8           5.2       95.2      44.6       46.9   
3          21.2        16.8           1.0       97.1      74.2       20.7   
4          23.0        18.3           4.6       90.3      85.7        1.2   

   pct_foreign_born  pct_limited_english  pct_poverty  med_hh_income  ...  \
0               1.9                 96.3          8.3          68315  ...   
1               1.0                 95.1          7.0          71039  ...   
2               0.4                 92.1         20.8          39712  ...   
3               1.1                 97.5         16.3          50669  ...   
4               0.5                 92.1         10.2          57440  ...   

   pct_broadband_sub  pop_total  gini_index  state_fips  county_fips  geoi

In [48]:
print("ncols:", len(acs_all.columns))
print(sorted(list(acs_all.columns))[:25])   # peek first 25 names alphabetically

ncols: 46
['county_fips', 'geoid', 'gini_index', 'housing_vacancy_rate', 'land_sqmi', 'mean_travel_time_minutes', 'med_hh_income', 'median_gross_rent', 'median_home_value', 'name', 'pct_65plus', 'pct_aian', 'pct_asian', 'pct_ba_plus', 'pct_black', 'pct_broadband_sub', 'pct_commute_carpool', 'pct_commute_drive_alone', 'pct_commute_other', 'pct_commute_public_transit', 'pct_commute_walk', 'pct_disability', 'pct_female_head_fam_with_children', 'pct_foreign_born', 'pct_hispanic']


In [49]:
# ensure, again, the IDs are strings and correct
acs_all["state_fips"]  = acs_all["state_fips"].astype(str).str.zfill(2)
acs_all["county_fips"] = acs_all["county_fips"].astype(str).str.zfill(3)
acs_all["geoid"]       = acs_all["geoid"].astype(str).str.zfill(5)

# row counts and join coverage
print("Rows (counties):", len(acs_all))
print("Missing land_sqmi:", acs_all["land_sqmi"].isna().sum())

# spot checks
for c in ["pct_poverty","pct_unemployed","pct_ba_plus","pct_broadband_sub"]:
    if c in acs_all.columns:
        print(c, "range:", float(acs_all[c].min()), "→", float(acs_all[c].max()))

Rows (counties): 3143
Missing land_sqmi: 0
pct_poverty range: 0.0 → 51.5
pct_unemployed range: 0.0 → 20.9
pct_ba_plus range: 0.0 → 78.9
pct_broadband_sub range: 36.0 → 100.0


In [50]:
acs_all["pct_broadband_sub"].describe()




Unnamed: 0,pct_broadband_sub
count,3143.0
mean,82.444257
std,7.22362
min,36.0
25%,78.7
50%,83.5
75%,87.4
max,100.0


In [51]:
acs_all.to_csv("acs_county_2022.csv", index=False)