In [1]:

#The variables come from here:
# https://api.census.gov/data/2019/acs/acs1/variables.html

#importing packages
import pandas as pd
import censusdata
import gc
import re
from unittest.mock import inplace
import json
import os
import itertools
from datetime import date



In [2]:
def get_geoid_reference_df(geographies: dict) -> pd.DataFrame:
    """Build a GEOID lookup table from a geographies mapping.

    Every value of ``geographies`` is rendered with ``str()``, stripped of its
    level labels and punctuation, and split on ``>`` into four codes. The
    GEOID is the concatenation of those codes in state, county, tract,
    block-group order.

    Args:
        geographies: Mapping whose values stringify to something like
            ``'Summary level: 150, state:37> county:147> tract:000602> block group:2'``.
            The keys are not used.

    Returns:
        DataFrame indexed by ``GEOID`` with the string columns ``STATE``,
        ``COUNTY``, ``TRACT`` and ``BLOCK_GROUP``; one row per mapping value.
    """
    # Everything that is not a code or a '>' separator gets removed.
    noise = re.compile('|'.join(['Summary level: 150', 'state', 'county', 'tract', 'block group', ' ', ':', ',']))

    states, counties, tracts, block_groups, full_ids = [], [], [], [], []

    # Only the values matter; the mapping keys are ignored.
    for geo in geographies.values():
        codes = noise.sub('', str(geo)).split('>')
        state, county, tract, block_group = codes[0], codes[1], codes[2], codes[3]

        states.append(state)
        counties.append(county)
        tracts.append(tract)
        block_groups.append(block_group)
        # The GEOID is simply the four codes glued together.
        full_ids.append(state + county + tract + block_group)

    reference = pd.DataFrame({
        'GEOID': full_ids,
        'STATE': states,
        'COUNTY': counties,
        'TRACT': tracts,
        'BLOCK_GROUP': block_groups,
    })

    return reference.set_index('GEOID')

In [3]:
def geoid_from_df(df: pd.DataFrame) -> pd.DataFrame:
    """Re-index a downloaded ACS frame by GEOID.

    The index labels of ``df`` are rendered with ``str()`` and are expected to
    end with ``state:<code>> county:<code>> tract:<code>> block group:<code>``.
    The four codes are concatenated in that order to form the GEOID.

    Args:
        df: Frame whose index labels stringify to the format above. Any text
            before the ``state:`` field (such as a geography name) is ignored.
            The frame itself is not modified.

    Returns:
        A copy of ``df`` with the same columns and an index named ``GEOID``
        holding the concatenated codes as strings.

    Raises:
        ValueError: If any index label does not end with the expected
            state/county/tract/block-group fields.
    """
    # Anchored at the end of the label, so colons or commas inside the leading
    # geography name cannot shift which fields are captured.
    code_pattern = r'state:([^>:]+)> county:([^>:]+)> tract:([^>:]+)> block group:([^>:]+)$'

    # Stringify the labels directly from the index; this works whether or not
    # the index carries a name.
    labels = pd.Series([str(label) for label in df.index], dtype=object)
    codes = labels.str.extract(code_pattern)

    # Fail loudly instead of silently producing a NaN / garbage GEOID.
    unparsed = codes.isna().any(axis=1)
    if unparsed.any():
        raise ValueError(
            f"Cannot derive a GEOID from index label: {labels[unparsed].iloc[0]!r}"
        )

    geoid = codes[0] + codes[1] + codes[2] + codes[3]

    result = df.copy()
    result.index = pd.Index(geoid.to_numpy(), name='GEOID')

    return result


In [5]:
# download ACS block-group data for one state, keyed by GEOID

def get_acs_data(acs_data_dict: dict, state_code:str, year:str) -> pd.DataFrame:
    """Download ACS 5-year block-group data for one state, keyed by GEOID.

    Args:
        acs_data_dict: Mapping of ACS variable code -> output column name.
            The keys are requested from the API one at a time and the mapping
            is used to rename the downloaded columns.
        state_code: Value for the ``state`` geography level (e.g. ``'37'``);
            county, tract and block group are all wildcarded.
        year: Data year, passed through unchanged to ``censusdata``.

    Returns:
        DataFrame indexed by ``GEOID`` holding the ``STATE``, ``COUNTY``,
        ``TRACT`` and ``BLOCK_GROUP`` reference columns followed by one column
        per entry of ``acs_data_dict``, in the mapping's order. When the
        mapping is empty only the reference columns are returned.
    """
    # Every block group in the state; built once and reused for all requests.
    block_groups = censusdata.censusgeo(
        [('state', state_code), ('county', '*'), ('tract', '*'), ('block group', '*')]
    )

    # Reference frame (one row per block group) that the variables are joined onto.
    boundaries = censusdata.geographies(block_groups, 'acs5', year)
    geoid_df = get_geoid_reference_df(boundaries)

    census_df_list = []

    # One request per variable. Iterate the mapping that was passed in rather
    # than a notebook-level global, so the function works on its own.
    for var in acs_data_dict:
        census_data_df = censusdata.download(
            'acs5', year, block_groups, [var]
        ).rename(columns=acs_data_dict)

        # Swap the censusgeo index for a GEOID index so the frames align.
        census_df_list.append(geoid_from_df(census_data_df))

    # pd.concat raises on an empty list, so handle "no variables" explicitly.
    if not census_df_list:
        return geoid_df

    # Column-wise join of the reference columns and every variable on GEOID.
    census_df = pd.concat([geoid_df, pd.concat(census_df_list, axis=1)], axis=1)

    return census_df
        


Unnamed: 0_level_0,STATE,COUNTY,TRACT,BLOCK_GROUP,Total_Population,Leave_Home5-5:30am,Had_Baby_Last_Year,HS_Diploma,$150-200k_Household_Income,Median_Num_Rooms,Male_30-34,Female_40-44,Female_Never_Married,Male_Local_Gov,Female_>=75_With_Disability,Male_5_17_Cognitive_Difficulty,Female_5_17_Self_Care_Difficulty,Male_55_64_Health_Ins,Male_6_18_Private_Ins,Male_>=75_Public_Ins
GEOID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
371470006022,37,147,000602,2,2344,27,,501,0,3.3,57,151,650,54,,,,,,
371470002013,37,147,000201,3,989,0,,104,9,4.6,63,0,393,0,,,,,,
371470002014,37,147,000201,4,806,0,,64,0,4.5,20,13,302,27,,,,,,
371190060071,37,119,006007,1,4005,109,,835,84,6.5,136,284,713,73,,,,,,
371190060073,37,119,006007,3,2624,0,,225,102,6.2,112,145,302,66,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370839308001,37,083,930800,1,642,0,,160,6,5.7,21,0,68,0,,,,,,
370839307004,37,083,930700,4,1345,39,,395,40,5.4,30,49,69,0,,,,,,
370839307002,37,083,930700,2,890,7,,199,11,5.2,0,21,59,0,,,,,,
370839309006,37,083,930900,6,981,0,,192,0,5.2,11,0,207,39,,,,,,


In [None]:
# Project root used for reading and writing: the parent of the current
# working directory.
abspath = os.path.dirname(os.getcwd())

filename = 'acs_dd.json'

print(abspath)

# Read the ACS data dictionary (variable code -> formatted column name) from
# the source config folder. The context manager closes the file handle.
with open(os.path.join(abspath, 'src', 'config', filename), encoding='utf-8') as config_file:
    acs_dd = json.load(config_file)

# Today's date; the ACS year below is intentionally NOT derived from it.
todays_date = date.today()

# Hard-coded: per the original note, 2019 is the only year available.
year = '2019'

# Download the ACS data for every variable listed in the config (state '37').
raw_df = get_acs_data(acs_dd, '37', year)

# Export the data to csv on the raw path, creating the folder when missing.
raw_dir = os.path.join(abspath, 'data', 'raw')
os.makedirs(raw_dir, exist_ok=True)

raw_df.to_csv(os.path.join(raw_dir, f'acs_{year}_raw.csv'))
