In [1]:
import re
from urllib.parse import quote_plus

import censusdata
import numpy as np
import ohio.ext.pandas
import pandas as pd
import us
import yaml
from sqlalchemy import create_engine
from tqdm import tqdm
### Display settings: keep wide frames on one line and show floats with 2 digits
pd.options.display.expand_frame_repr = False
pd.options.display.precision = 2

### Load secrets for database and census api key

In [2]:
import yaml

### Parse secrets.yaml (database settings and census api key) into a python dictionary
with open('secrets.yaml', 'r') as secrets_file:
    secret_config = yaml.safe_load(secrets_file)

In [3]:
# secret_config

In [4]:
### Set db engine
### The user name and password are percent-encoded before being placed in the
### connection URL, so characters such as '@', ':' or '/' inside the credentials
### cannot be mistaken for URL delimiters. The values in secrets.yaml must
### therefore be stored raw (not already percent-encoded).
db_settings = secret_config['db']
engine = create_engine(
    "postgresql://{user}:{pw}@{host}:{port}/{db}".format(
        # str() because YAML may parse an all-digit credential as a number
        user=quote_plus(str(db_settings['user'])),
        pw=quote_plus(str(db_settings['password'])),
        host=db_settings['host'],
        port=db_settings['port'],
        db=db_settings['dbname'],
    ),
    pool_pre_ping=True,
)

## Since we focus on FL turnout, we first find FL fips

In [5]:
### Look up the FIPS code of Florida through the `us` package and display it
state_fips = us.states.lookup('FL').fips
state_fips

'12'

In [6]:
# censusdata.search('acs5', 2015, 'label', 'unemploy')[160:170]
# censusdata.search('acs5', 2018, 'label', 'household income')

## Search for all county fips in FL

In [7]:
### A function to parse county fips from censusgeo object
def get_county_fips(s):
    """Return the county FIPS code embedded in the text form of ``s``.

    The object is converted with ``str()``; the piece after the last comma is
    taken, and from that piece everything after the last colon is returned
    unchanged (as a string, so any leading zeros are kept).
    """
    last_field = str(s).rpartition(',')[2]
    county_fips = last_field.rpartition(':')[2]
    return county_fips

In [8]:
### Request every county geography ('*') inside the selected state from the
### 2018 ACS 5-year dataset
all_counties_geo = censusdata.censusgeo([('state', state_fips), ('county', '*')])
tmp_county_in_state_dict = censusdata.geographies(
    all_counties_geo, 'acs5', 2018,
    key=secret_config['web_resource']['api_key'])

In [9]:
### Turn the geography mapping into a one-column DataFrame: the mapping's keys
### become the row labels and its values end up in column 0
tmp_county_in_state_dict = pd.DataFrame(pd.Series(tmp_county_in_state_dict))

In [10]:
### Tag every county row with the state fips found earlier (same value on all rows)
tmp_county_in_state_dict['State_fips'] = state_fips

In [11]:
### Parse the county fips out of each geography object stored in column 0
tmp_county_in_state_dict['County_fips'] = [
    get_county_fips(geo) for geo in tmp_county_in_state_dict[0]
]

In [12]:
### Column 0 (the raw geography objects) is no longer needed once the fips are parsed
tmp_county_in_state_dict = tmp_county_in_state_dict.drop(columns=[0])

In [13]:
### Sanity check: preview the first five counties with their state and county fips
tmp_county_in_state_dict.head(5)

Unnamed: 0,State_fips,County_fips
"Okaloosa County, Florida",12,91
"Taylor County, Florida",12,123
"Washington County, Florida",12,133
"Duval County, Florida",12,31
"Bradford County, Florida",12,7


## Get corresponding info for each block group

### A few functions to extract fips, county and state name from censusgeo object

In [14]:
### Extract fips from censusgeo object
def extract_fips(s):
    """Return all digits found after the last comma of ``str(s)`` as one int.

    The digits are concatenated in the order they appear. Because the result is
    an ``int``, leading zeros of the concatenated code are not preserved.
    Raises ``ValueError`` when that part of the text contains no digit.
    """
    code_str = str(s).rpartition(',')[2]
    digits_only = ''.join(filter(str.isdigit, code_str))
    return int(digits_only)

In [15]:
### Extract county and state name from censusgeo object
def geo_info(s):
    """Return ``(county, state)`` parsed from the text form of ``s``.

    Only the text before the first colon is used; its last two comma-separated
    pieces are taken as county and state, each stripped of surrounding
    whitespace. Raises ``ValueError`` when that text has fewer than two pieces.
    """
    name_part = str(s).partition(':')[0]
    county, state = (piece.strip() for piece in name_part.split(',')[-2:])
    return (county, state)

### First we create a list of the attributes we would like to collect
The unique ids below were looked up in https://www2.census.gov/programs-surveys/acs/summary_file/2018/documentation/user_tools/ACS2018_Table_Shells.xlsx

In [16]:
### Unique ids for sex by age: B01001_001E through B01001_049E
tmp_list_for_sex_by_age = ["B01001_{:03d}E".format(n) for n in range(1, 50)]
tmp_list_for_sex_by_age[0:5]

['B01001_001E', 'B01001_002E', 'B01001_003E', 'B01001_004E', 'B01001_005E']

In [17]:
### Unique id for total population (a single id, kept in a list so it can be
### passed around the same way as the other lists of ids)
tmp_list_for_total_population = ['B01003_001E']

In [18]:
### Unique ids for race: B02001_001E through B02001_009E
tmp_list_for_race = ["B02001_{:03d}E".format(n) for n in range(1, 10)]
tmp_list_for_race[0:5]

['B02001_001E', 'B02001_002E', 'B02001_003E', 'B02001_004E', 'B02001_005E']

In [19]:
### Unique ids for insurance by age: B27010_001E through B27010_066E
tmp_list_for_insurance_by_age = ["B27010_{:03d}E".format(n) for n in range(1, 67)]
tmp_list_for_insurance_by_age[0:5]

['B27010_001E', 'B27010_002E', 'B27010_003E', 'B27010_004E', 'B27010_005E']

In [20]:
### Unique ids for education: B15003_001E through B15003_025E
tmp_list_for_education = ["B15003_{:03d}E".format(n) for n in range(1, 26)]
tmp_list_for_education[0:5]

['B15003_001E', 'B15003_002E', 'B15003_003E', 'B15003_004E', 'B15003_005E']

In [21]:
### Unique ids for employment status: B23025_001E through B23025_007E
tmp_list_for_employment = ["B23025_{:03d}E".format(n) for n in range(1, 8)]
tmp_list_for_employment[0:5]

['B23025_001E', 'B23025_002E', 'B23025_003E', 'B23025_004E', 'B23025_005E']

In [29]:
### Unique ids for latino: B03003_001E through B03003_003E
tmp_list_for_latino = ["B03003_{:03d}E".format(n) for n in range(1, 4)]
tmp_list_for_latino

['B03003_001E', 'B03003_002E', 'B03003_003E']

In [30]:
### Unique ids for household income: B19001_001E through B19001_017E
tmp_list_for_household_income = ["B19001_{:03d}E".format(n) for n in range(1, 18)]
tmp_list_for_household_income[0:5]

['B19001_001E', 'B19001_002E', 'B19001_003E', 'B19001_004E', 'B19001_005E']

### Make a list of variables we would like to find

In [31]:
### Names of the attributes to download, in the order they will be processed
list_of_attribute_to_collect = [
    'sex_by_age',
    'total_population',
    'race',
    'insurance_by_age',
    'education',
    'employment',
    'latino',
    'household_income',
]

In [32]:
# cookbg = censusdata.download('acs5', 2018,
#                              censusdata.censusgeo([('state', state_fips), ('county', '091'), ('block group', '*')]),
#                              tmp_list_for_placeofbirth_by_sex, key=secret_config['web_resource']['api_key'])

# cookbg.head()

In [33]:
### Number of attributes that will be collected
len(list_of_attribute_to_collect)

8

### Then we go over the list of attributes we would like to collect

In [None]:
### For every attribute: download its columns for the block groups of each
### county in the state, combine the counties into one frame and copy that
### frame to its own table in the database.

### Attribute name -> (column ids to download, name of the destination table)
attribute_specs = {
    'sex_by_age': (tmp_list_for_sex_by_age, 'acs_sex_by_age_b01001'),
    'total_population': (tmp_list_for_total_population, 'acs_total_population_b01003'),
    'race': (tmp_list_for_race, 'acs_race_b02001'),
    'insurance_by_age': (tmp_list_for_insurance_by_age, 'acs_insurance_by_age_b27010'),
    'education': (tmp_list_for_education, 'acs_education_b15003'),
    'employment': (tmp_list_for_employment, 'acs_employment_b23025'),
    'latino': (tmp_list_for_latino, 'acs_latino_b03003'),
    'household_income': (tmp_list_for_household_income, 'acs_household_income_b19001'),
}

### Fail before any download starts if an attribute has no column list / table,
### instead of silently reusing the columns of the previous attribute
unknown_attributes = [name for name in list_of_attribute_to_collect
                      if name not in attribute_specs]
if unknown_attributes:
    raise KeyError('No column list / table configured for: {}'.format(unknown_attributes))

census_api_key = secret_config['web_resource']['api_key']
target_schema = secret_config['db']['schema']

for tmp_attribute_to_collect in tqdm(list_of_attribute_to_collect):
    ### First set which attribute we would like to find
    tmp_list_of_columns, tmp_table_name = attribute_specs[tmp_attribute_to_collect]

    #### Then go over each county in specific state to find this attribute
    ### Iterate over the values themselves: the frame is labelled by county
    ### name, so looking rows up with an integer key would depend on the
    ### deprecated positional fallback of Series indexing.
    county_frames = []
    for tmp_county_fips in tqdm(list(tmp_county_in_state_dict['County_fips'])):

        ### cookbg is a df which contains corresponding info in each county
        cookbg = censusdata.download(
            'acs5', 2018,
            censusdata.censusgeo([('state', state_fips), ('county', tmp_county_fips), ('block group', '*')]),
            tmp_list_of_columns, key=census_api_key)

        ### The index holds one geography object per block group
        block_group_geos = list(cookbg.index)
        place_names = [geo_info(geo) for geo in block_group_geos]

        cookbg['fips'] = [extract_fips(geo) for geo in block_group_geos]

        ### extract_fips returns an int, so a leading zero of the state fips is
        ### lost; pad back to the 12 digits (state 2 + county 3 + tract 6 +
        ### block group 1) implied by the [5:-1] / [-1] slices before cutting
        cookbg['Block_fips'] = cookbg['fips'].map(lambda x: str(x).zfill(12)[-1])
        cookbg['Tract_fips'] = cookbg['fips'].map(lambda x: str(x).zfill(12)[5:-1])
        cookbg['County_fips'] = tmp_county_fips
        cookbg['State_fips'] = state_fips
        cookbg['County_Name'] = [county for county, _ in place_names]
        cookbg['State_Name'] = [state for _, state in place_names]

        cookbg = cookbg.set_index("fips")
        county_frames.append(cookbg)

    if not county_frames:
        raise ValueError('No county found for state {}; nothing to upload for {}'
                         .format(state_fips, tmp_attribute_to_collect))

    ### Combine all counties once (instead of re-concatenating on every iteration)
    df_for_all_county_in_state = pd.concat(county_frames)

    ### Some cleaning
    df_for_all_county_in_state.columns = df_for_all_county_in_state.columns.str.lower()
    df_for_all_county_in_state = df_for_all_county_in_state.reset_index()

    ### Use ohio to output df to specific schema in given database
    df_for_all_county_in_state.pg_copy_to(tmp_table_name, schema=target_schema, con=engine)