<a href="https://colab.research.google.com/github/worldterminator/mess/blob/main/hungergame.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install censusdata us requests pandas

import os, io, zipfile, requests, pandas as pd
import censusdata as cd
from us import states

# Get one: https://api.census.gov/data/key_signup.html
os.environ['CENSUS_KEY'] = ''
API_KEY = os.environ.get('CENSUS_KEY', None)

YEAR = 2022   # 2018–2022 5-year
DATASET_PROFILE = 'acs/acs5/profile'
DATASET_SUBJECT = 'acs/acs5/subject'
DATASET_DETAILED = 'acs/acs5'


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m355.9/355.9 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for censusdata (setup.py) ... [?25l[?25hdone


In [None]:
VARS_PROFILE = [
    # Demography
    "DP05_0019PE","DP05_0024PE","DP05_0071PE","DP05_0078PE","DP05_0079PE","DP05_0080PE",
    "DP02_0093PE","DP02_0113PE",
    # Socioeconomic
    "DP03_0119PE","DP03_0062E","DP03_0005PE","DP03_0002PE","DP02_0060PE","DP02_0068PE",
    "DP03_0074PE","DP03_0099PE","DP02_0072PE",
    # Household structure
    "DP02_0013PE","DP02_0011PE",
    # Housing / transport
    "DP04_0046PE","DP04_0047PE","DP04_0003PE","DP04_0078PE","DP04_0089PE","DP04_0090PE",
    "DP04_0134E","DP04_0139PE","DP04_0089E","DP04_0058PE",
    "DP03_0025E","DP03_0011PE","DP03_0012PE","DP03_0013PE","DP03_0016PE","DP03_0017PE","DP03_0024PE",
    "DP02_0154PE"  # broadband subscription (%)
]

In [None]:
VARS_DETAILED = [
    "B01003_001E",   # total population
    "B19083_001E"    # Gini index
]

In [None]:
RENAME_MAP = {
    "B01003_001E":"pop_total",
    "B19083_001E":"gini_index",
    "DP05_0019PE":"pct_under_18",
    "DP05_0024PE":"pct_65plus",
    "DP05_0071PE":"pct_hispanic",
    "DP05_0078PE":"pct_black",
    "DP05_0079PE":"pct_aian",
    "DP05_0080PE":"pct_asian",
    "DP02_0093PE":"pct_foreign_born",
    "DP02_0113PE":"pct_limited_english",
    "DP03_0119PE":"pct_poverty",
    "DP03_0062E":"med_hh_income",
    "DP03_0005PE":"pct_unemployed",
    "DP03_0002PE":"pct_in_labor_force",
    "DP02_0060PE":"pct_less_than_hs",
    "DP02_0068PE":"pct_ba_plus",
    "DP03_0074PE":"pct_snap_households",
    "DP03_0099PE":"pct_uninsured",
    "DP02_0072PE":"pct_disability",
    "DP02_0013PE":"pct_female_head_fam_with_children",
    "DP02_0011PE":"pct_single_parent_hh",
    "DP04_0046PE":"pct_owner_occupied",
    "DP04_0047PE":"pct_renter_occupied",
    "DP04_0003PE":"housing_vacancy_rate",
    "DP04_0078PE":"pct_overcrowded_gt1pproom",
    "DP04_0089PE":"pct_lacking_complete_kitchen",
    "DP04_0090PE":"pct_lacking_complete_plumbing",
    "DP04_0134E":"median_gross_rent",
    "DP04_0139PE":"pct_rent_ge_30pct_income",
    "DP04_0089E":"median_home_value",
    "DP04_0058PE":"pct_zero_vehicles",
    "DP03_0025E":"mean_travel_time_minutes",
    "DP03_0011PE":"pct_commute_drive_alone",
    "DP03_0012PE":"pct_commute_carpool",
    "DP03_0013PE":"pct_commute_public_transit",
    "DP03_0016PE":"pct_commute_walk",
    "DP03_0017PE":"pct_commute_other",
    "DP03_0024PE":"pct_work_from_home",
    "DP02_0154PE":"pct_broadband_sub"
}

In [None]:
frames = []
for st in states.STATES:
    geo = cd.censusgeo([('state', st.fips), ('county','*')])

    df_p = cd.download(DATASET_PROFILE, YEAR, geo, VARS_PROFILE, key=API_KEY)
    df_b = cd.download(DATASET_DETAILED, YEAR, geo, VARS_DETAILED, key=API_KEY)

    df = pd.concat([df_p, df_b], axis=1)

In [None]:
frames = []
for st in states.STATES:
    geo = cd.censusgeo([('state', st.fips), ('county','*')])
    df_p = cd.download(DATASET_PROFILE,  YEAR, geo, VARS_PROFILE,  key=API_KEY)
    df_b = cd.download(DATASET_DETAILED, YEAR, geo, VARS_DETAILED, key=API_KEY)
    df = pd.concat([df_p, df_b], axis=1)

    # get FIPS codes and names
    state_fips = [g.geo[0][1] for g in df.index]
    county_fips = [g.geo[1][1] for g in df.index]
    df = df.assign(
        state_fips=[s.zfill(2) for s in state_fips],
        county_fips=[c.zfill(3) for c in county_fips],
        geoid=[s.zfill(2) + c.zfill(3) for s,c in zip(state_fips, county_fips)],
        name=[g.name for g in df.index]
    ).reset_index(drop=True)

    frames.append(df)   # inside loop

acs_all = pd.concat(frames, ignore_index=True)  # outside loop

# check
print("Total counties:", len(acs_all))          # expect ~3143
print("Unique states:", acs_all['state_fips'].nunique())  # expect 51

Total counties: 3143
Unique states: 50


In [None]:
import pandas as pd

url = "https://github.com/worldterminator/worldterminator/raw/refs/heads/main/2022_Gaz_counties_national.txt"

gaz = pd.read_csv(
    url,
    sep="\t",
    dtype={"GEOID": str},
    low_memory=False
)

print("Gazetteer shape:", gaz.shape)
print("Columns:", list(gaz.columns)[:10])  # preview first 10 column names
print(gaz.head(3))

Gazetteer shape: (3222, 10)
Columns: ['USPS', 'GEOID', 'ANSICODE', 'NAME', 'ALAND', 'AWATER', 'ALAND_SQMI', 'AWATER_SQMI', 'INTPTLAT', 'INTPTLONG                                                                                                               ']
  USPS  GEOID  ANSICODE            NAME       ALAND      AWATER  ALAND_SQMI  \
0   AL  01001    161526  Autauga County  1539631461    25677536     594.455   
1   AL  01003    161527  Baldwin County  4117724893  1132887353    1589.863   
2   AL  01005    161528  Barbour County  2292160151    50523213     885.008   

   AWATER_SQMI   INTPTLAT  \
0        9.914  32.532237   
1      437.410  30.659218   
2       19.507  31.870253   

   INTPTLONG                                                                                                                 
0                                         -86.646440                                                                         
1                                         -87.746067   

In [None]:
# land area (sq mi), then we could have density
if "ALAND_SQMI" in gaz.columns:
    gaz["land_sqmi"] = gaz["ALAND_SQMI"]
else:
    gaz["land_sqmi"] = gaz["ALAND"] / (1609.344**2)

In [None]:
# merge with ACS
acs_all = acs_all.merge(
    gaz[["GEOID","land_sqmi"]],
    left_on="geoid",
    right_on="GEOID",
    how="left"
).drop(columns=["GEOID"])

# calculate density
acs_all["pop_total"] = pd.to_numeric(acs_all["B01003_001E"], errors="coerce")
acs_all["pop_density_per_sqmi"] = acs_all["pop_total"] / acs_all["land_sqmi"]

In [None]:
acs_all = acs_all.rename(columns=RENAME_MAP)

In [None]:
print(acs_all[["geoid","name","land_sqmi","pop_density_per_sqmi"]].head())

   geoid                     name  land_sqmi  pop_density_per_sqmi
0  01001  Autauga County, Alabama    594.455             98.848525
1  01003  Baldwin County, Alabama   1589.863            146.817682
2  01005  Barbour County, Alabama    885.008             28.109350
3  01007     Bibb County, Alabama    622.470             35.746301
4  01009   Blount County, Alabama    644.891             91.607729


In [None]:
print(acs_all.head())

   pct_under_18  pct_65plus  pct_hispanic  pct_black  pct_aian  pct_asian  \
0          23.4        15.6           2.2       96.8      72.6       19.6   
1          21.2        21.2           3.7       95.2      82.3        8.3   
2          20.7        19.8           5.2       95.2      44.6       46.9   
3          21.2        16.8           1.0       97.1      74.2       20.7   
4          23.0        18.3           4.6       90.3      85.7        1.2   

   pct_foreign_born  pct_limited_english  pct_poverty  med_hh_income  ...  \
0               1.9                 96.3          8.3          68315  ...   
1               1.0                 95.1          7.0          71039  ...   
2               0.4                 92.1         20.8          39712  ...   
3               1.1                 97.5         16.3          50669  ...   
4               0.5                 92.1         10.2          57440  ...   

   pct_broadband_sub  pop_total  gini_index  state_fips  county_fips  geoi

In [None]:
print("ncols:", len(acs_all.columns))
print(sorted(list(acs_all.columns))[:25])   # peek first 25 names alphabetically

ncols: 46
['county_fips', 'geoid', 'gini_index', 'housing_vacancy_rate', 'land_sqmi', 'mean_travel_time_minutes', 'med_hh_income', 'median_gross_rent', 'median_home_value', 'name', 'pct_65plus', 'pct_aian', 'pct_asian', 'pct_ba_plus', 'pct_black', 'pct_broadband_sub', 'pct_commute_carpool', 'pct_commute_drive_alone', 'pct_commute_other', 'pct_commute_public_transit', 'pct_commute_walk', 'pct_disability', 'pct_female_head_fam_with_children', 'pct_foreign_born', 'pct_hispanic']


In [None]:
# ensure, again, the IDs are strings and correct
acs_all["state_fips"]  = acs_all["state_fips"].astype(str).str.zfill(2)
acs_all["county_fips"] = acs_all["county_fips"].astype(str).str.zfill(3)
acs_all["geoid"]       = acs_all["geoid"].astype(str).str.zfill(5)

# row counts and join coverage
print("Rows (counties):", len(acs_all))
print("Missing land_sqmi:", acs_all["land_sqmi"].isna().sum())

# spot checks
for c in ["pct_poverty","pct_unemployed","pct_ba_plus","pct_broadband_sub"]:
    if c in acs_all.columns:
        print(c, "range:", float(acs_all[c].min()), "→", float(acs_all[c].max()))

Rows (counties): 3143
Missing land_sqmi: 0
pct_poverty range: 0.0 → 51.5
pct_unemployed range: 0.0 → 20.9
pct_ba_plus range: 0.0 → 78.9
pct_broadband_sub range: 36.0 → 100.0


In [None]:
acs_all["pct_broadband_sub"].describe()




Unnamed: 0,pct_broadband_sub
count,3143.0
mean,82.444257
std,7.22362
min,36.0
25%,78.7
50%,83.5
75%,87.4
max,100.0


In [None]:
acs_all.to_csv("acs_county_2022.csv", index=False)

#merge with SVI and ADI

In [34]:
import pandas as pd
acs = pd.read_csv("/content/acs_county_2022.csv", low_memory=False)

print(acs.shape)
print("\nACS columns:", acs.columns.tolist()[:20])  # peek-
print("\nACS variable list:\n")
for i, col in enumerate(acs.columns, 1):
    print(f"{i:3}. {col}")


(3143, 46)

ACS columns: ['pct_under_18', 'pct_65plus', 'pct_hispanic', 'pct_black', 'pct_aian', 'pct_asian', 'pct_foreign_born', 'pct_limited_english', 'pct_poverty', 'med_hh_income', 'pct_unemployed', 'pct_in_labor_force', 'pct_less_than_hs', 'pct_ba_plus', 'pct_snap_households', 'pct_uninsured', 'pct_disability', 'pct_female_head_fam_with_children', 'pct_single_parent_hh', 'pct_owner_occupied']

ACS variable list:

  1. pct_under_18
  2. pct_65plus
  3. pct_hispanic
  4. pct_black
  5. pct_aian
  6. pct_asian
  7. pct_foreign_born
  8. pct_limited_english
  9. pct_poverty
 10. med_hh_income
 11. pct_unemployed
 12. pct_in_labor_force
 13. pct_less_than_hs
 14. pct_ba_plus
 15. pct_snap_households
 16. pct_uninsured
 17. pct_disability
 18. pct_female_head_fam_with_children
 19. pct_single_parent_hh
 20. pct_owner_occupied
 21. pct_renter_occupied
 22. housing_vacancy_rate
 23. pct_overcrowded_gt1pproom
 24. pct_lacking_complete_kitchen
 25. pct_lacking_complete_plumbing
 26. median_

In [35]:
svi = pd.read_csv("/content/SVI_2022_US_county.csv", low_memory=False)

print(svi.shape)
print("\nSVI variable list:\n")
for i, col in enumerate(svi.columns, 1):
    print(f"{i:3}. {col}")

(3144, 158)

SVI variable list:

  1. ST
  2. STATE
  3. ST_ABBR
  4. STCNTY
  5. COUNTY
  6. FIPS
  7. LOCATION
  8. AREA_SQMI
  9. E_TOTPOP
 10. M_TOTPOP
 11. E_HU
 12. M_HU
 13. E_HH
 14. M_HH
 15. E_POV150
 16. M_POV150
 17. E_UNEMP
 18. M_UNEMP
 19. E_HBURD
 20. M_HBURD
 21. E_NOHSDP
 22. M_NOHSDP
 23. E_UNINSUR
 24. M_UNINSUR
 25. E_AGE65
 26. M_AGE65
 27. E_AGE17
 28. M_AGE17
 29. E_DISABL
 30. M_DISABL
 31. E_SNGPNT
 32. M_SNGPNT
 33. E_LIMENG
 34. M_LIMENG
 35. E_MINRTY
 36. M_MINRTY
 37. E_MUNIT
 38. M_MUNIT
 39. E_MOBILE
 40. M_MOBILE
 41. E_CROWD
 42. M_CROWD
 43. E_NOVEH
 44. M_NOVEH
 45. E_GROUPQ
 46. M_GROUPQ
 47. EP_POV150
 48. MP_POV150
 49. EP_UNEMP
 50. MP_UNEMP
 51. EP_HBURD
 52. MP_HBURD
 53. EP_NOHSDP
 54. MP_NOHSDP
 55. EP_UNINSUR
 56. MP_UNINSUR
 57. EP_AGE65
 58. MP_AGE65
 59. EP_AGE17
 60. MP_AGE17
 61. EP_DISABL
 62. MP_DISABL
 63. EP_SNGPNT
 64. MP_SNGPNT
 65. EP_LIMENG
 66. MP_LIMENG
 67. EP_MINRTY
 68. MP_MINRTY
 69. EP_MUNIT
 70. MP_MUNIT
 71. EP_MOBILE
 

In [36]:
adi = pd.read_csv("/content/US_2023_ADI_Census_Block_Group_v4_0_1.csv", low_memory=False)

print(adi.shape)
print("\nADI variable list:\n")
for i, col in enumerate(adi.columns, 1):
    print(f"{i:3}. {col}")


(242336, 5)

ADI variable list:

  1. Unnamed: 0
  2. GISJOIN
  3. FIPS
  4. ADI_NATRANK
  5. ADI_STATERNK


In [5]:
adi.head(5)

Unnamed: 0.1,Unnamed: 0,GISJOIN,FIPS,ADI_NATRANK,ADI_STATERNK
0,1,G01000100201001,10010201001,71,4
1,2,G01000100201002,10010201002,79,5
2,3,G01000100202001,10010202001,87,7
3,4,G01000100202002,10010202002,84,6
4,5,G01000100203001,10010203001,76,5


## avi-acs merge first, then stage adi (aggregate)

In [37]:
acs["fips"] = acs["geoid"].astype(str).str.zfill(5)
svi["fips"] = svi["FIPS"].astype(str)
svi["fips"] = svi["fips"].str.extract(r"(\d+)")[0].str.zfill(5)

In [38]:
svi.head(8)

Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE,fips
0,1,Alabama,AL,1001,Autauga County,1001,"Autauga County, Alabama",594.454786,58761,0,...,0.4,0.1,0.1,0.0,0.1,3.3,1.0,0.2,0.3,1001
1,1,Alabama,AL,1003,Baldwin County,1003,"Baldwin County, Alabama",1589.861817,233420,0,...,0.1,0.2,0.1,0.0,0.1,3.1,0.4,0.4,0.3,1003
2,1,Alabama,AL,1005,Barbour County,1005,"Barbour County, Alabama",885.007619,24877,0,...,0.1,0.3,0.1,0.0,0.1,1.8,0.7,1.2,0.8,1005
3,1,Alabama,AL,1007,Bibb County,1007,"Bibb County, Alabama",622.469286,22251,0,...,0.4,0.1,0.1,0.0,0.2,1.7,1.0,0.1,0.1,1007
4,1,Alabama,AL,1009,Blount County,1009,"Blount County, Alabama",644.890376,59077,0,...,0.2,0.1,0.1,0.2,0.2,2.8,0.7,0.1,0.1,1009
5,1,Alabama,AL,1011,Bullock County,1011,"Bullock County, Alabama",622.814753,10328,0,...,0.5,0.0,0.4,0.0,0.4,1.3,1.1,0.0,0.4,1011
6,1,Alabama,AL,1013,Butler County,1013,"Butler County, Alabama",776.838208,18981,0,...,0.3,0.4,0.3,0.0,0.2,1.2,0.6,0.0,0.1,1013
7,1,Alabama,AL,1015,Calhoun County,1015,"Calhoun County, Alabama",605.889936,116162,0,...,0.1,0.1,0.1,0.2,0.2,2.5,0.3,0.4,0.2,1015


In [39]:
acs_svi = acs.merge(svi, left_on="fips", right_on="fips", how="left")

# checks
matched = acs_svi["fips"].notna().sum()
print(f"ACS rows: {len(acs):,} | Matched SVI rows: {matched:,}")

ACS rows: 3,143 | Matched SVI rows: 3,143


In [40]:
acs_svi.head(5)

Unnamed: 0,pct_under_18,pct_65plus,pct_hispanic,pct_black,pct_aian,pct_asian,pct_foreign_born,pct_limited_english,pct_poverty,med_hh_income,...,EP_ASIAN,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE
0,23.4,15.6,2.2,96.8,72.6,19.6,1.9,96.3,8.3,68315,...,1.1,0.4,0.1,0.1,0.0,0.1,3.3,1.0,0.2,0.3
1,21.2,21.2,3.7,95.2,82.3,8.3,1.0,95.1,7.0,71039,...,0.9,0.1,0.2,0.1,0.0,0.1,3.1,0.4,0.4,0.3
2,20.7,19.8,5.2,95.2,44.6,46.9,0.4,92.1,20.8,39712,...,0.5,0.1,0.3,0.1,0.0,0.1,1.8,0.7,1.2,0.8
3,21.2,16.8,1.0,97.1,74.2,20.7,1.1,97.5,16.3,50669,...,0.3,0.4,0.1,0.1,0.0,0.2,1.7,1.0,0.1,0.1
4,23.0,18.3,4.6,90.3,85.7,1.2,0.5,92.1,10.2,57440,...,0.2,0.2,0.1,0.1,0.2,0.2,2.8,0.7,0.1,0.1


In [41]:
svi.sort_values("fips").head()

Unnamed: 0,ST,STATE,ST_ABBR,STCNTY,COUNTY,FIPS,LOCATION,AREA_SQMI,E_TOTPOP,M_TOTPOP,...,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE,fips
0,1,Alabama,AL,1001,Autauga County,1001,"Autauga County, Alabama",594.454786,58761,0,...,0.4,0.1,0.1,0.0,0.1,3.3,1.0,0.2,0.3,1001
1,1,Alabama,AL,1003,Baldwin County,1003,"Baldwin County, Alabama",1589.861817,233420,0,...,0.1,0.2,0.1,0.0,0.1,3.1,0.4,0.4,0.3,1003
2,1,Alabama,AL,1005,Barbour County,1005,"Barbour County, Alabama",885.007619,24877,0,...,0.1,0.3,0.1,0.0,0.1,1.8,0.7,1.2,0.8,1005
3,1,Alabama,AL,1007,Bibb County,1007,"Bibb County, Alabama",622.469286,22251,0,...,0.4,0.1,0.1,0.0,0.2,1.7,1.0,0.1,0.1,1007
4,1,Alabama,AL,1009,Blount County,1009,"Blount County, Alabama",644.890376,59077,0,...,0.2,0.1,0.1,0.2,0.2,2.8,0.7,0.1,0.1,1009


In [42]:
acs["fips"] = acs["fips"].astype(str).str.zfill(5)
acs[acs["fips"] == '01003']

Unnamed: 0,pct_under_18,pct_65plus,pct_hispanic,pct_black,pct_aian,pct_asian,pct_foreign_born,pct_limited_english,pct_poverty,med_hh_income,...,pop_total,gini_index,state_fips,county_fips,geoid,name,land_sqmi,pop_total.1,pop_density_per_sqmi,fips
1,21.2,21.2,3.7,95.2,82.3,8.3,1.0,95.1,7.0,71039,...,233420,0.4648,1,3,1003,"Baldwin County, Alabama",1589.863,233420,146.817682,1003


In [43]:
acs_svi[acs_svi["fips"] == '01003']

Unnamed: 0,pct_under_18,pct_65plus,pct_hispanic,pct_black,pct_aian,pct_asian,pct_foreign_born,pct_limited_english,pct_poverty,med_hh_income,...,EP_ASIAN,MP_ASIAN,EP_AIAN,MP_AIAN,EP_NHPI,MP_NHPI,EP_TWOMORE,MP_TWOMORE,EP_OTHERRACE,MP_OTHERRACE
1,21.2,21.2,3.7,95.2,82.3,8.3,1.0,95.1,7.0,71039,...,0.9,0.1,0.2,0.1,0.0,0.1,3.1,0.4,0.4,0.3


In [44]:
out_path = "/acs_county_2022_with_svi.csv"
acs_svi.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: /acs_county_2022_with_svi.csv


In [45]:
adi["FIPS"] = adi["FIPS"].astype(str).str.extract(r"(\d+)")[0].str.zfill(12)  # block-group FIPS (12d)
adi["county_fips"] = adi["FIPS"].str[:5]  # helper only

In [46]:
# sanity is a good thing, maybe
print("FIPS lengths:", adi["FIPS"].str.len().value_counts().to_dict())
print("county_fips lengths:", adi["county_fips"].str.len().value_counts().to_dict())

FIPS lengths: {12: 242336}
county_fips lengths: {5: 242336}


In [47]:
adi.head(5)

Unnamed: 0.1,Unnamed: 0,GISJOIN,FIPS,ADI_NATRANK,ADI_STATERNK,county_fips
0,1,G01000100201001,10010201001,71,4,1001
1,2,G01000100201002,10010201002,79,5,1001
2,3,G01000100202001,10010202001,87,7,1001
3,4,G01000100202002,10010202002,84,6,1001
4,5,G01000100203001,10010203001,76,5,1001


In [48]:
# stage export
adi_out = "/adi_with_county_fips.csv"
adi.to_csv(adi_out, index=False)
print("Saved:", adi_out)

Saved: /adi_with_county_fips.csv
