In [4]:
import pandas as pd
from utilities.functions import *
from utilities.luts import *
import math

In [5]:
# Load the lookup table that maps each GVV place ID to its census geography (GEOIDFQ).
geoid_lu_df = pd.read_csv("tbl/NCRPlaces_Census_04192024.csv")

# For testing, limit the lookup table to a few locations that cover each geography type:
# an individual county, an incorporated place, a census designated place,
# and a 1-to-many entry (Eagle River) that has multiple census tracts per single GVV ID.
test_place_names = ["Eagle River", "Fairbanks", "Arctic Village", "Fort Yukon"]

# .copy() makes the subset an independent DataFrame rather than a slice of geoid_lu_df.
# NOTE(review): this is presumably the source of the SettingWithCopyWarning messages
# ("df.loc[len(df.index)] = ak_row") shown when the subset is passed to
# run_fetch_and_merge(); confirm against utilities.functions.
geoid_lu_df_test = geoid_lu_df[geoid_lu_df["name"].isin(test_place_names)].copy()

In [6]:
# display the test subset of the lookup table to confirm which places / geographies were selected
geoid_lu_df_test

Unnamed: 0,id,name,alt_name,region,country,latitude,longitude,type,GEOIDFQ,PLACENAME,AREATYPE,COMMENT
10,AK124,Fairbanks,,Alaska,US,64.8378,-147.716,community,0500000US02090,Fairbanks North Star Borough,County,Data represent information from nearest [COLUM...
45,AK103,Eagle River,,Alaska,US,61.3221,-149.567,community,1400000US02020000201,Census Tract 2.01,Census tract,Data for this place represent multiple merged ...
46,AK103,Eagle River,,Alaska,US,61.3221,-149.567,community,1400000US02020000202,Census Tract 2.02,Census tract,Data for this place represent multiple merged ...
47,AK103,Eagle River,,Alaska,US,61.3221,-149.567,community,1400000US02020000204,Census Tract 2.04,Census tract,Data for this place represent multiple merged ...
48,AK103,Eagle River,,Alaska,US,61.3221,-149.567,community,1400000US02020000205,Census Tract 2.05,Census tract,Data for this place represent multiple merged ...
49,AK103,Eagle River,,Alaska,US,61.3221,-149.567,community,1400000US02020000206,Census Tract 2.06,Census tract,Data for this place represent multiple merged ...
77,AK22,Arctic Village,Vashrąįį K'ǫǫ,Alaska,US,68.1269,-145.538,community,1600000US0203990,Arctic Village CDP,Census designated place,Data represent information from nearest [COLUM...
166,AK130,Fort Yukon,Gwichyaa Zheh,Alaska,US,66.5647,-145.274,community,1600000US0226760,Fort Yukon city,Incorporated place,Data represent information from nearest [COLUM...


In [7]:
# Use the individual fetch functions to get data for each GVV ID in the test lookup table.
# Follow the printed URLs to QC individual values from the results tables.
# NOTE: the printed request URLs include the API key as a query parameter, so clear or
# redact this cell's output before sharing or committing the notebook.
for gvv_id in geoid_lu_df_test["id"].unique():
    # Census products first (decennial DHC, then ACS 5-year), each printed as it arrives.
    for census_product in ("dhc", "acs5"):
        census_table = fetch_census_data_and_compute(
            census_product, gvv_id, geoid_lu_df_test, print_url=True
        )
        print(census_table)
    # Then the CDC data for the same place.
    cdc_table = fetch_cdc_data_and_compute(gvv_id, geoid_lu_df_test, print_url=True)
    print(cdc_table)

Requesting US Census data from: https://api.census.gov/data/2020/dec/dhc?get=P12_001N,P12_002N,P12_026N,P12_003N,P12_004N,P12_005N,P12_006N,P12_020N,P12_021N,P12_022N,P12_023N,P12_024N,P12_025N,P12_027N,P12_028N,P12_029N,P12_030N,P12_044N,P12_045N,P12_046N,P12_047N,P12_048N,P12_049N,P9_001N,P9_002N,P9_005N,P9_006N,P9_007N,P9_008N,P9_009N,P9_010N,P9_011N&for=county:090&in=state:02&key=[REDACTED]
  GEOID  total_population  pct_65_plus  pct_under_18  pct_under_5  \
0   090           95655.0        11.46         23.95          7.0   

   pct_hispanic_latino  pct_white  pct_african_american  \
0                 7.65      66.05                  3.98   

   pct_amer_indian_ak_native  pct_asian  pct_hawaiian_pacislander  pct_other  \
0                       7.63       3.17                      0.61       0.85   

   pct_multi  
0      10.06  
Requesting US Census data from: https://api.census.gov/data/2023/acs/acs5/subject?get=S1810_C03_001E,S1810_C03_001M,S2701_C

In [8]:
# Also exercise fetch_and_merge(), which adds the comments from the comment dict.
test_comment_dict = create_comment_dict(geoid_lu_df_test)
test_results = [
    fetch_and_merge(geoid_lu_df_test, gvv_id, test_comment_dict)
    for gvv_id in geoid_lu_df_test["id"].unique()
]
test_results_df = pd.concat(test_results)

# Show only the geographic info and comments; widen columns so comment text is not cut off.
# Note that this display option is set globally and stays in effect for later cells.
pd.set_option("display.max_colwidth", 999)
test_results_df[["id", "name", "areatype", "placename", "comment"]]

Unnamed: 0,id,name,areatype,placename,comment
0,AK124,Fairbanks,County,Fairbanks North Star Borough,"Data represent information from nearest county (Fairbanks North Star Borough), which includes Fairbanks."
0,AK103,Eagle River,Census tract,Census Tract 2.01,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
1,AK103,Eagle River,Census tract,Census Tract 2.02,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
2,AK103,Eagle River,Census tract,Census Tract 2.04,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
3,AK103,Eagle River,Census tract,Census Tract 2.05,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
4,AK103,Eagle River,Census tract,Census Tract 2.06,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
0,AK22,Arctic Village,Census designated place,Arctic Village CDP,"Data represent information from nearest census designated place (Arctic Village CDP), which includes Arctic Village."
0,AK130,Fort Yukon,Incorporated place,Fort Yukon city,"Data represent information from nearest incorporated place (Fort Yukon city), which includes Fort Yukon."


In [9]:
# Also test run_fetch_and_merge() on the test lookup table: in addition to the per-place rows,
# its result includes reference rows for the state of Alaska (AK0) and the United States (US0).
test_run_results_df = run_fetch_and_merge(geoid_lu_df_test)
# display the result for a visual check of the place rows plus the two reference rows
test_run_results_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df.index)] = ak_row
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[len(df.index)] = us_row


Unnamed: 0,id,name,areatype,placename,GEOID,total_population,pct_65_plus,pct_under_18,pct_under_5,pct_hispanic_latino,...,pct_unemployed,pct_no_bband_moe,pct_crowding_moe,pct_hcost_moe,pct_no_hsdiploma_moe,pct_below_150pov_moe,pct_minority_moe,pct_single_parent_moe,pct_unemployed_moe,comment
0,AK124,Fairbanks,County,Fairbanks North Star Borough,90,95655.0,11.46,23.95,7.0,7.65,...,6.3,2.5,1.0,2.3,0.8,1.4,1.3,1.1,1.4,"Data represent information from nearest county (Fairbanks North Star Borough), which includes Fairbanks."
0,AK103,Eagle River,Census tract,Census Tract 2.01,20000201,4318.0,7.74,26.98,7.5,9.59,...,3.8,10.5,4.0,10.0,3.7,6.1,9.7,1.3,2.3,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
1,AK103,Eagle River,Census tract,Census Tract 2.02,20000202,6384.0,12.97,25.5,6.74,7.5,...,1.3,14.3,0.8,7.4,2.9,3.4,7.3,6.2,1.2,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
2,AK103,Eagle River,Census tract,Census Tract 2.04,20000204,3582.0,12.65,21.3,4.91,5.3,...,2.8,15.9,3.2,7.4,1.2,4.7,7.6,2.9,2.0,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
3,AK103,Eagle River,Census tract,Census Tract 2.05,20000205,7421.0,7.16,30.64,8.09,9.03,...,3.7,13.5,1.2,11.4,1.2,2.5,4.8,2.3,2.5,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
4,AK103,Eagle River,Census tract,Census Tract 2.06,20000206,3413.0,13.45,28.13,6.39,6.36,...,2.5,8.6,3.1,6.2,1.3,2.4,8.0,3.7,2.5,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
0,AK22,Arctic Village,Census designated place,Arctic Village CDP,3990,151.0,7.28,31.13,3.31,0.0,...,11.3,26.3,12.9,24.0,9.7,16.0,14.4,17.7,9.6,"Data represent information from nearest census designated place (Arctic Village CDP), which includes Arctic Village."
0,AK130,Fort Yukon,Incorporated place,Fort Yukon city,26760,428.0,15.42,26.17,6.31,0.7,...,22.7,3.2,5.8,7.8,6.9,5.6,7.5,5.1,8.4,"Data represent information from nearest incorporated place (Fort Yukon city), which includes Fort Yukon."
0,AK0,Alaska,State,Alaska,2,733391.0,12.98,24.46,6.56,6.79,...,7.21,11.17,3.41,7.51,3.17,5.84,7.75,3.71,3.7,
0,US0,United States,Nation,United States,1,331449281.0,16.83,22.06,5.55,18.73,...,5.64,12.85,3.38,9.5,4.73,7.77,8.49,4.84,4.0,


In [10]:
# If the test results look good, run the fetch and merge with the full lookup table.
# Even with multiprocessing, this may take a while to complete the many calls to the various APIs;
# heavy API usage may be "throttled" by the service (roughly 2-3 minutes seemed to be the norm here).
# If you get any "no response" messages, data will be missing for some places
# and this cell needs to be run again.
results_df = run_fetch_and_merge(geoid_lu_df)

  return pd.concat(results)


In [11]:
# Then aggregate any rows that share a duplicate ID (places represented by multiple census tracts,
# e.g. Eagle River); aggregate_results() prints a message for each ID it aggregates.
aggregated_results_df = aggregate_results(results_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[measure_col + "_adult_population_variance"].iloc[
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[measure_col + "_adult_population_variance"].iloc[
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[measure_col + "_adult_population_variance"].iloc[
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[measur

Aggregating values for AK103: Eagle River
Aggregating values for AK439: Joint Base Elmendorf-Richardson


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df[pooled_sd_col_name] = pooled_sd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df[pooled_sd_col_name] = pooled_sd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df[pooled_sd_col_name] = pooled_sd
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [12]:
# display the aggregated results for a visual check
aggregated_results_df

Unnamed: 0,id,name,areatype,placename,GEOID,total_population,pct_65_plus,pct_under_18,pct_under_5,pct_hispanic_latino,...,pct_no_hsdiploma_high,pct_no_hsdiploma_low,pct_below_150pov_high,pct_below_150pov_low,pct_minority_high,pct_minority_low,pct_single_parent_high,pct_single_parent_low,pct_unemployed_high,pct_unemployed_low
0,BORO1,Aleutians East Borough,County,Aleutians East Borough,013,3420.0,6.87,8.77,2.05,19.71,...,17.80,12.80,25.90,19.50,95.30,79.10,10.10,5.70,6.10,2.90
1,BORO19,Municipality of Anchorage,County,Anchorage Municipality,020,291247.0,12.41,23.46,6.43,9.08,...,6.30,5.30,16.10,14.10,44.80,43.00,6.60,5.40,6.00,4.80
2,AK15,Anchorage,County,Anchorage Municipality,020,291247.0,12.41,23.46,6.43,9.08,...,6.30,5.30,16.10,14.10,44.80,43.00,6.60,5.40,6.00,4.80
3,CENS9,Bethel Census Area,County,Bethel Census Area,050,18666.0,8.24,35.02,9.61,1.11,...,19.80,16.20,47.40,40.40,91.70,89.90,10.90,6.70,21.10,15.10
4,BORO17,Bristol Bay Borough,County,Bristol Bay Borough,060,844.0,16.11,22.04,5.33,5.33,...,9.30,1.30,10.40,5.60,66.60,51.00,6.10,1.50,4.40,0.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
420,AK435,Yakutat,Census designated place,Yakutat CDP,86490,657.0,18.42,19.18,4.72,4.26,...,8.30,2.50,23.50,2.90,82.60,56.60,5.20,0.00,4.20,0.00
421,AK0,Alaska,State,Alaska,02,733391.0,12.98,24.46,6.56,6.79,...,10.09,3.75,23.38,11.70,48.71,33.21,9.83,2.41,10.91,3.51
422,US0,United States,Nation,United States,1,331449281.0,16.83,22.06,5.55,18.73,...,16.18,6.72,28.69,13.15,49.04,32.06,11.40,1.72,9.64,1.64
423,AK103,Eagle River,Census tract,"Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, Census Tract 2.06","020000201, 020000202, 020000204, 020000205, 020000206",25118.0,10.37,27.03,6.96,7.84,...,8.00,0.00,15.35,0.00,38.88,4.70,11.88,0.00,7.64,0.00


In [13]:
# Compare the original one-to-many results with the aggregated results.
# First collect the IDs that occur on more than one row of the unaggregated table.
dups = results_df.loc[results_df.duplicated(subset="id"), "id"].unique().tolist()

# Select every unaggregated row belonging to those IDs, save it to CSV for manual QC,
# and display the same rows.
unaggregated_dups_df = results_df[results_df["id"].isin(dups)]
unaggregated_dups_df.to_csv("qc/unaggregated_results.csv", index=False)
unaggregated_dups_df

Unnamed: 0,id,name,areatype,placename,GEOID,total_population,pct_65_plus,pct_under_18,pct_under_5,pct_hispanic_latino,...,pct_unemployed,pct_no_bband_moe,pct_crowding_moe,pct_hcost_moe,pct_no_hsdiploma_moe,pct_below_150pov_moe,pct_minority_moe,pct_single_parent_moe,pct_unemployed_moe,comment
0,AK103,Eagle River,Census tract,Census Tract 2.01,20000201,4318.0,7.74,26.98,7.5,9.59,...,3.8,10.5,4.0,10.0,3.7,6.1,9.7,1.3,2.3,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
1,AK103,Eagle River,Census tract,Census Tract 2.02,20000202,6384.0,12.97,25.5,6.74,7.5,...,1.3,14.3,0.8,7.4,2.9,3.4,7.3,6.2,1.2,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
2,AK103,Eagle River,Census tract,Census Tract 2.04,20000204,3582.0,12.65,21.3,4.91,5.3,...,2.8,15.9,3.2,7.4,1.2,4.7,7.6,2.9,2.0,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
3,AK103,Eagle River,Census tract,Census Tract 2.05,20000205,7421.0,7.16,30.64,8.09,9.03,...,3.7,13.5,1.2,11.4,1.2,2.5,4.8,2.3,2.5,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
4,AK103,Eagle River,Census tract,Census Tract 2.06,20000206,3413.0,13.45,28.13,6.39,6.36,...,2.5,8.6,3.1,6.2,1.3,2.4,8.0,3.7,2.5,"Data for this place represent multiple merged census tracts: Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, and Census Tract 2.06"
0,AK439,Joint Base Elmendorf-Richardson,Census tract,Census Tract 9801,20980100,4592.0,0.68,25.07,11.78,16.86,...,3.2,12.0,1.5,14.0,1.5,6.8,7.4,1.7,3.2,Data for this place represent multiple merged census tracts: Census Tract 9801 and Census Tract 9802
1,AK439,Joint Base Elmendorf-Richardson,Census tract,Census Tract 9802,20980200,6725.0,2.13,25.25,10.51,17.67,...,6.9,9.7,1.2,9.8,1.7,7.1,6.1,4.1,4.6,Data for this place represent multiple merged census tracts: Census Tract 9801 and Census Tract 9802


In [14]:
# Select the aggregated rows for the IDs that had duplicate rows before aggregation,
# save them to CSV for manual QC, and display the same rows.
aggregated_dups_df = aggregated_results_df[aggregated_results_df["id"].isin(dups)]
aggregated_dups_df.to_csv("qc/aggregated_results.csv", index=False)
aggregated_dups_df

Unnamed: 0,id,name,areatype,placename,GEOID,total_population,pct_65_plus,pct_under_18,pct_under_5,pct_hispanic_latino,...,pct_no_hsdiploma_high,pct_no_hsdiploma_low,pct_below_150pov_high,pct_below_150pov_low,pct_minority_high,pct_minority_low,pct_single_parent_high,pct_single_parent_low,pct_unemployed_high,pct_unemployed_low
423,AK103,Eagle River,Census tract,"Census Tract 2.01, Census Tract 2.02, Census Tract 2.04, Census Tract 2.05, Census Tract 2.06","020000201, 020000202, 020000204, 020000205, 020000206",25118.0,10.37,27.03,6.96,7.84,...,8.0,0.0,15.35,0.0,38.88,4.7,11.88,0.0,7.64,0.0
424,AK439,Joint Base Elmendorf-Richardson,Census tract,"Census Tract 9801, Census Tract 9802","020980100, 020980200",11317.0,1.54,25.18,11.03,17.34,...,4.4,0.0,24.68,5.02,54.84,35.66,8.19,0.0,11.0,0.0


In [15]:
# Save the full aggregated results table to CSV; index=False leaves the DataFrame index out of the file.
aggregated_results_df.to_csv("tbl/data_to_export.csv", index=False)