In [57]:
from dotenv import load_dotenv
import os
import requests
import pandas as pd
import re
import numpy as np

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Census API key taken from the environment; os.getenv returns None when
# CENSUS_API_KEY is not set.
census_key = os.getenv("CENSUS_API_KEY")

Call Census ACS5 API for (var) at Tract Level
- Total Population (B01003_001E)
- Uninsured Population Segment (B27001_004E)
- Median household income (B19013_001E)

In [58]:
# Survey year used to build the API endpoint below.
YEAR = 2019
# ACS 5-year endpoint for YEAR; read as a module-level global by the request helpers.
base_url = f"https://api.census.gov/data/{YEAR}/acs/acs5"

def call(state, county, tract):
    """Query the Census ACS5 endpoint (``base_url``) at tract level for one county.

    Requests B01003_001E (total population), B27001_004E (labelled
    "uninsured population segment" in this notebook -- TODO confirm against
    the ACS variable list), B19013_001E (median household income) and NAME.

    Parameters
    ----------
    state : str
        State code placed in the ``in`` clause (e.g. "06").
    county : str
        County code placed in the ``in`` clause.
    tract : str
        Tract code to request, or the literal 'ALL' to request every tract
        in the county.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    RuntimeError
        If the API answers with any other non-200 status.
    """
    for_val = 'tract:*' if tract == 'ALL' else f'tract:{tract}'
    params = {
        "get": "B01003_001E,B27001_004E,B19013_001E,NAME",
        "for": for_val,
        "in": f"state:{state}+county:{county}",
        "key": census_key
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=60)

    # Fail loudly instead of falling through to an unbound `data` variable.
    response.raise_for_status()
    if response.status_code != 200:
        # e.g. 204 No Content: there is no JSON body to decode.
        raise RuntimeError(
            f"Census API returned status {response.status_code} for {response.url}"
        )

    return response.json()

def code_to_tuple(census_tract: str) -> tuple[str, str, str]:
    """Split a tract FIPS code into its (state, county, tract) components.

    The first 11 characters must be digits: 2 for the state, 3 for the
    county and 6 for the tract. Anything after those 11 digits is ignored.
    Only this shape is checked; whether the code identifies a real tract
    is not verified.

    Parameters
    ----------
    census_tract : str
        FIPS code beginning with the 11-digit tract identifier.

    Returns
    -------
    tuple[str, str, str]
        ``(state, county, tract)`` as digit strings, leading zeros preserved.

    Raises
    ------
    ValueError
        If ``census_tract`` does not start with 11 digits.
    """
    match = re.match(r'(\d{2})(\d{3})(\d{6})', census_tract)
    if match is None:
        # Previously this surfaced as an AttributeError on None.group(...).
        raise ValueError(
            f"expected an 11-digit tract FIPS code, got {census_tract!r}"
        )
    return match.groups()


def get_pop_batched(census_tract: str):
    """Fetch the ACS5 variables for every tract in the county containing ``census_tract``.

    The state and county parts of the FIPS code select the county; the tract
    part is ignored because the request always asks for 'ALL' tracts.

    Parameters
    ----------
    census_tract : str
        Tract FIPS code, split with ``code_to_tuple``.

    Returns
    -------
    Whatever ``call`` returns for that county (the raw response data, not a
    DataFrame).
    """
    # The original also built a DataFrame from the response and then discarded
    # it; that dead work is removed, the returned value is unchanged.
    state, county, _tract = code_to_tuple(census_tract)
    return call(state, county, 'ALL')

def california_call():
    """Query the Census ACS5 endpoint (``base_url``) for every tract in state 06.

    Requests B01003_001E (total population), B27001_004E (labelled
    "uninsured population segment" in this notebook -- TODO confirm against
    the ACS variable list), B19013_001E (median household income) and NAME
    for all tracts in all counties of the state.

    Returns
    -------
    The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx/5xx status.
    RuntimeError
        If the API answers with any other non-200 status.
    """
    params = {
        "get": "B01003_001E,B27001_004E,B19013_001E,NAME",
        "for": 'tract:*',
        "in": "state:06+county:*",
        "key": census_key
    }

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(base_url, params=params, timeout=60)

    # Fail loudly instead of falling through to an unbound `data` variable.
    response.raise_for_status()
    if response.status_code != 200:
        # e.g. 204 No Content: there is no JSON body to decode.
        raise RuntimeError(
            f"Census API returned status {response.status_code} for {response.url}"
        )

    return response.json()


In [59]:

# Download the statewide table; its first row holds the column names and the
# remaining rows hold one record per tract.
acs_data = california_call()

acs_df = (
    pd.DataFrame(acs_data[1:], columns=acs_data[0])
    # Swap the ACS variable codes for readable column names.
    .rename(columns={
        "B01003_001E": "TotalPop",
        "B27001_004E": "UninsuredPop",
        "B19013_001E": "MedHouseInc",
    })
    # GEOID is the state, county and tract codes concatenated in that order.
    .assign(GEOID=lambda frame: frame['state'] + frame['county'] + frame['tract'])
    # The component columns and NAME are no longer needed once GEOID exists.
    .drop(columns=['state', 'county', 'tract', 'NAME'])
)

acs_df.head(3)


Unnamed: 0,TotalPop,UninsuredPop,MedHouseInc,GEOID
0,3195,39,206607,6085507904
1,8604,537,114300,6085508504
2,4871,218,152969,6085508505


Remove Census Null value marker from Median House Income 

In [55]:
# "-666666666" is the Census null marker in median household income; turn it
# into a real missing value.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on `acs_df.MedHouseInc`: with pandas Copy-on-Write
# that chained in-place call edits a temporary and leaves acs_df unchanged, and
# on older pandas an explicit value=None is ignored (the marker is
# forward-filled from the previous row instead of blanked).
# mask() sets matching rows to NaN, which to_csv writes as an empty field.
acs_df["MedHouseInc"] = acs_df["MedHouseInc"].mask(acs_df["MedHouseInc"] == "-666666666")

In [56]:
# Not the last expression in the cell, so the shape is evaluated but not displayed.
acs_df.shape
# Write the table to the raw-data folder. to_csv's default index=True also
# writes the row index as an unnamed first column.
acs_df.to_csv('../data/raw/ACS5 2019.csv')

Split USDA into 1/2, 1, 10, 20

Thoughts:
- Is there a real reason to split? If I want to regress on this data later, having them in different datasets would work against me.
- The health outcomes for a tract take into account all persons, not just those who are farther away from food sources.
- Do this in the exploratory analysis; for this phase just worry about collecting data, which means extracting tract data and referring to the API for the corresponding population estimates.
- Another idea: get the FARA for another year and see whether there is a correlation between more or less restricted food access and the progression of health outcomes over time?
