In [17]:
import ast
import json
from collections import OrderedDict

import pandas as pd
import requests
from dotenv import dotenv_values
from tqdm.notebook import tqdm

In [18]:
# Mapping of the key-value pairs (secrets) found in the project's .env file
config = dotenv_values("../.env")
CENSUS_API_KEY = config.get("CENSUS_API_KEY")
if not CENSUS_API_KEY:
    # Fail here with a clear message instead of later building request URLs
    # that contain "key=None" or an empty key.
    raise KeyError("CENSUS_API_KEY is missing or empty in ../.env")

In [19]:
def get_variable_table_df(year):
    """Download the ACS 1-year profile variable table for ``year``.

    Reads the first HTML table found on the Census ``variables.html`` page
    and cleans its ``Label`` column by turning ``!!`` separators into spaces
    and dropping colons.

    Parameters
    ----------
    year : int or str
        Survey year, interpolated into the Census API URL.

    Returns
    -------
    pandas.DataFrame
        The variable table with a cleaned ``Label`` column.
    """
    variable_table_url = f'https://api.census.gov/data/{year}/acs/acs1/profile/variables.html'
    v_table = pd.read_html(variable_table_url)
    variable_df = pd.DataFrame(v_table[0])
    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the chained in-place form operates on a temporary
    # Series and does not update the frame when copy-on-write is active.
    variable_df['Label'] = variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True)
    return variable_df

In [21]:
def get_query_url(year, variables, state_code="*"):
    """Assemble the Census API URL for a county-level ACS 1-year profile query.

    Parameters
    ----------
    year : int or str
        Survey year placed in the URL path.
    variables : str
        Comma-separated variable codes placed after ``?get=``.
    state_code : str, optional
        State code used in the ``in=state:`` clause; ``"*"`` (the default)
        selects every state.

    Returns
    -------
    str
        The full query URL, including the module-level ``CENSUS_API_KEY``.
    """
    # API Reference: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html
    # Data Dictionary: https://api.census.gov/data.html
    url_parts = [
        'https://api.census.gov/data',              # host
        f'/{year}',                                 # survey year
        '/acs/acs1/profile',                        # dataset acronym
        '?get=',
        f"{variables}",
        f"&for=county:*&in=state:{state_code}",     # all counties in the state(s)
        f"&key={CENSUS_API_KEY}",
    ]
    return "".join(url_parts)

In [22]:
def get_query_text(query_url, timeout=30):
    """Fetch ``query_url`` and return the response body as text.

    Parameters
    ----------
    query_url : str
        Fully built request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    str
        The response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(query_url, timeout=timeout)
    if not response.ok:
        # Raise a hand-written message rather than calling raise_for_status():
        # the default message embeds the full URL, which contains the API key
        # and would end up in the notebook output.
        raise requests.HTTPError(
            f"Request failed with status {response.status_code}: {response.text[:200]}",
            response=response,
        )
    return response.text

In [23]:
def get_values_from_response(response_text):
    """Return the second row of a list-of-lists response as integers.

    ``response_text`` is parsed as a Python literal; the element at index 1
    (the row following the header row) is converted item by item with ``int``.
    """
    parsed_rows = ast.literal_eval(response_text)
    first_data_row = parsed_rows[1]
    return list(map(int, first_data_row))

In [24]:
def get_labels(variable_df, indeces):
    """Return cleaned ``Label`` strings for a positional row range.

    Parameters
    ----------
    variable_df : pandas.DataFrame
        Frame with a ``Label`` column of strings.
    indeces : sequence
        ``indeces[0]`` is the first row position, ``indeces[1]`` the
        (exclusive) end position of the slice.

    Returns
    -------
    list of str
        Labels with ``!!`` replaced by a space and ``:`` removed.
    """
    start, stop = indeces[0], indeces[1]
    raw_labels = variable_df.iloc[start:stop]['Label'].values
    cleaned = []
    for raw in raw_labels:
        cleaned.append(raw.replace("!!", " ").replace(":", ""))
    return cleaned

In [25]:
# Census variable code -> readable column name used in the output table.
# NOTE(review): the saved output for "Percent_EduAttain_25y_over" shows values
# such as 56029.0 and 1091107.0, which look like counts rather than
# percentages - confirm the variable code / column name.
variable_dict = OrderedDict(
    [
        ("NAME", "County Name"),
        ("DP02_0058PE", "Percent_EduAttain_25y_over"),
        ("DP02_0088PE", "Percent_BornInUS"),
        ("DP03_0002PE", "Percent_InLaborForce"),
        ("DP03_0015PE", "Percent_AllParentsInLaborForce"),
    ]
)

In [12]:
def get_data(year, state_code, variables):
    """Query the Census API for one state and return the result as a DataFrame.

    Parameters
    ----------
    year : int or str
        Survey year.
    state_code : str
        State code passed to ``get_query_url``.
    variables : str
        Comma-separated variable codes to request.

    Returns
    -------
    pandas.DataFrame
        One row per returned county. The first response row supplies the
        column names; every column except the first is cast to float, and
        columns whose code appears in ``variable_dict`` are renamed to the
        readable name.

    Raises
    ------
    ValueError
        If the response body is empty or is not valid JSON.
    """
    query_url = get_query_url(year, variables=variables, state_code=state_code)
    response_text = get_query_text(query_url)
    if not response_text.strip():
        raise ValueError(
            f"Empty response for year={year}, state_code={state_code}"
        )
    # Parse with json.loads rather than eval(): the body is untrusted network
    # data, and eval() would execute whatever it contains. JSON null maps to
    # None natively, so no text substitution (which could also corrupt a value
    # that merely contains "null") is needed.
    response_data = json.loads(response_text)
    header, rows = response_data[0], response_data[1:]
    df = pd.DataFrame(rows, columns=header)
    # Everything after the first column is cast to float; None becomes NaN.
    # NOTE(review): this also turns the trailing state/county code columns
    # into floats (e.g. 48.0, 5.0), dropping leading zeros - confirm intended.
    df = df.astype({col: float for col in df.columns[1:]})
    # rename() ignores mapping keys that are not present in the frame.
    return df.rename(columns=variable_dict)

In [13]:
# Show the readable column names in query order
[*variable_dict.values()]

['County Name',
 'Percent_EduAttain_25y_over',
 'Percent_BornInUS',
 'Percent_InLaborForce',
 'Percent_AllParentsInLaborForce']

In [30]:
# Lookup table linking county and state names to their abbreviations and codes;
# the first CSV column is used as the index.
merged_index = pd.read_csv(
    "merged_index.csv",
    index_col=[0],
)

In [32]:
# Display the lookup table to inspect its columns
merged_index

Unnamed: 0,fips,name,state,stname,st
0,1001,Autauga County,AL,Alabama,1
1,1003,Baldwin County,AL,Alabama,1
2,1005,Barbour County,AL,Alabama,1
3,1007,Bibb County,AL,Alabama,1
4,1009,Blount County,AL,Alabama,1
...,...,...,...,...,...
3138,56037,Sweetwater County,WY,Wyoming,56
3139,56039,Teton County,WY,Wyoming,56
3140,56041,Uinta County,WY,Wyoming,56
3141,56043,Washakie County,WY,Wyoming,56


In [35]:
def get_statecodes_from_acronyms(acronyms):
    """Translate state abbreviations into the numeric codes in ``merged_index``.

    Rows of the module-level ``merged_index`` whose ``state`` value is in
    ``acronyms`` are selected, and the distinct ``st`` values are returned in
    the order produced by ``value_counts`` (most frequent code first by
    default).

    Parameters
    ----------
    acronyms : list-like
        State abbreviations such as ``['CA', 'OR']``.

    Returns
    -------
    list
        Distinct ``st`` codes for the requested states.
    """
    in_requested_states = merged_index.state.isin(acronyms)
    rows_per_code = merged_index[in_requested_states].st.value_counts()
    return list(rows_per_code.keys())

[48, 13, 17, 37, 39, 26, 12, 42, 36, 6, 41, 32, 4]

In [49]:
# get data for counties across multiple states in question
years = [2011]
state_codes = get_statecodes_from_acronyms(['CA', 'OR', 'AZ', 'NV', 'TX', 'FL', 'NY', 'PA', 'IL', 'OH', 'GA', 'NC', 'MI'])

# Build the query string from variable_dict so the requested variables and the
# renamed output columns cannot drift apart.
variables = ",".join(variable_dict)
output_columns = list(variable_dict.values()) + ["state", "county"]

# Collect one frame per (year, state) and concatenate once at the end instead
# of growing `output` inside the loop: repeated concat re-copies all previous
# rows, and concatenating onto an empty frame has deprecated dtype behaviour.
# NOTE(review): the year is not stored in the result, so rows from different
# years would be indistinguishable if `years` ever holds more than one entry.
state_frames = []
for year in years:
    for state_code in state_codes:
        # Codes are queried as two-digit, zero-padded strings (6 -> "06")
        padded_state_code = f"{int(state_code):02d}"
        state_frames.append(get_data(year, padded_state_code, variables))

if state_frames:
    output = pd.concat(state_frames, axis=0)
else:
    # Nothing requested: keep an empty frame with the expected columns
    output = pd.DataFrame(columns=output_columns)


State Code: 48
Counties: 53
State Code: 13
Counties: 35
State Code: 17
Counties: 23
State Code: 37
Counties: 39
State Code: 39
Counties: 38
State Code: 26
Counties: 29
State Code: 12
Counties: 40
State Code: 42
Counties: 40
State Code: 36
Counties: 39
State Code: 06
Counties: 40
State Code: 41
Counties: 15
State Code: 32
Counties: 2
State Code: 04
Counties: 10


In [50]:
# Display the combined county-level results
output

Unnamed: 0,County Name,Percent_EduAttain_25y_over,Percent_BornInUS,Percent_InLaborForce,Percent_AllParentsInLaborForce,state,county
0,"Angelina County, Texas",56029.0,91.2,62.6,55.5,48.0,5.0
1,"Bastrop County, Texas",51810.0,88.1,65.4,56.5,48.0,21.0
2,"Bell County, Texas",187920.0,87.8,67.5,57.8,48.0,27.0
3,"Bexar County, Texas",1091107.0,84.7,64.6,61.7,48.0,29.0
4,"Bowie County, Texas",61351.0,97.2,54.9,68.8,48.0,37.0
...,...,...,...,...,...,...,...
5,"Navajo County, Arizona",64505.0,97.3,51.0,49.4,4.0,17.0
6,"Pima County, Arizona",657636.0,85.5,58.8,61.2,4.0,19.0
7,"Pinal County, Arizona",250514.0,89.4,49.6,52.0,4.0,21.0
8,"Yavapai County, Arizona",157899.0,92.4,50.5,59.0,4.0,25.0


In [51]:
# Save the combined results; the DataFrame index is written as the first CSV column
output.to_csv("data/demographic_data.csv")