In [1]:
import ast
import json
from collections import OrderedDict

import pandas as pd
import requests
from dotenv import dotenv_values
from tqdm.notebook import tqdm

In [2]:
# Key-value pairs of secrets read from the .env file located one directory
# above the notebook's working directory.
config = dotenv_values("../.env")
# Census API key; get_query_url appends it to every request as the `key`
# query parameter. The lookup fails here, early, if the entry is missing.
CENSUS_API_KEY = config['CENSUS_API_KEY']

In [3]:
def get_variable_table_df(year):
    """Download the ACS 1-year profile variable table for ``year``.

    The table is scraped from the first HTML table on the Census
    ``variables.html`` page for the given year.

    Parameters
    ----------
    year : int or str
        Survey year, interpolated into the URL.

    Returns
    -------
    pandas.DataFrame
        The variable table with its ``Label`` column cleaned: every ``!!``
        becomes a single space and every ``:`` is removed.
    """
    variable_table_url = f'https://api.census.gov/data/{year}/acs/acs1/profile/variables.html'
    tables = pd.read_html(variable_table_url)
    # Work on a copy so the frame returned by read_html is left untouched.
    variable_df = tables[0].copy()
    # Assign the cleaned column back instead of calling replace(inplace=True)
    # on the column selection: that chained in-place form does not update the
    # parent frame when pandas Copy-on-Write is active.
    variable_df['Label'] = variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True)
    return variable_df

In [4]:
def get_male_by_age_index(variable_table):
    """Locate the block of 'Estimate Total Male ...' rows in the variable table.

    Parameters
    ----------
    variable_table : pandas.DataFrame
        Table with a ``Label`` column whose labels have already been cleaned.

    Returns
    -------
    tuple
        ``(start, stop)`` where ``start`` is the index label of the first row
        labelled 'Estimate Total Male' and ``stop`` is one past the index
        label of the first row labelled
        'Estimate Total Male 85 years and over'.

    Raises
    ------
    IndexError
        If either label is absent from the table.
    """
    def _first_index_of(label):
        # Index label of the first row whose Label equals `label`.
        matching_rows = variable_table.loc[variable_table['Label'] == label]
        return matching_rows.index[0]

    first_row = _first_index_of('Estimate Total Male')
    last_row = _first_index_of('Estimate Total Male 85 years and over')
    # +1 turns the inclusive last row into an exclusive slice bound.
    return first_row, last_row + 1

In [5]:
def get_variable_names(variable_table, indeces):
    """Return the variable names of a positional row range as one comma-separated string.

    Parameters
    ----------
    variable_table : pandas.DataFrame
        Table with a ``Name`` column.
    indeces : sequence
        ``indeces[0]`` and ``indeces[1]`` are the positional start and
        (exclusive) stop of the rows to include.

    Returns
    -------
    str
        The selected ``Name`` values joined with ``","``.
    """
    start, stop = indeces[0], indeces[1]
    selected_rows = variable_table.iloc[start:stop]
    names = selected_rows['Name'].values
    return ",".join(names)

In [6]:
def get_query_url(year, variables, state_code="*"):
    """Assemble the Census API URL for county-level ACS 1-year profile data.

    API Reference: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html
    Data Dictionary: https://api.census.gov/data.html

    Parameters
    ----------
    year : int or str
        Survey year; becomes a path segment of the URL.
    variables : str
        Comma-separated variable names placed after ``?get=``.
    state_code : str, optional
        State code for the ``in=state:`` clause; the default ``"*"`` is
        passed through unchanged.

    Returns
    -------
    str
        The complete query URL, including the module-level CENSUS_API_KEY
        as the ``key`` parameter.
    """
    url_parts = [
        'https://api.census.gov/data',                # host
        f'/{year}',                                   # survey year
        '/acs/acs1/profile',                          # dataset acronym
        '?get=',
        f"{variables}",
        f"&for=county:*&in=state:{state_code}",       # geography: all counties in the state
        f"&key={CENSUS_API_KEY}",
    ]
    return "".join(url_parts)

In [7]:
def get_query_text(query_url, timeout=30):
    """Fetch ``query_url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    query_url : str
        Fully assembled URL to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Defaults to 30 so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.exceptions.RequestException
        For connection problems and timeouts.
    """
    response = requests.get(query_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the
    # parsing code downstream.
    response.raise_for_status()
    return response.text

In [8]:
def get_values_from_response(response_text):
    """Parse a response body and return its first data row as integers.

    Parameters
    ----------
    response_text : str
        Text of a list-of-lists literal; element 0 is skipped and element 1
        is the row that gets converted.

    Returns
    -------
    list of int
        Every entry of the second row converted with ``int``.
    """
    parsed_rows = ast.literal_eval(response_text)
    first_data_row = parsed_rows[1]
    return list(map(int, first_data_row))

In [9]:
def get_labels(variable_df, indeces):
    """Return cleaned labels for a positional row range of the variable table.

    Parameters
    ----------
    variable_df : pandas.DataFrame
        Table with a ``Label`` column of strings.
    indeces : sequence
        ``indeces[0]`` and ``indeces[1]`` are the positional start and
        (exclusive) stop of the rows to include.

    Returns
    -------
    list of str
        Each selected label with ``"!!"`` replaced by a space and ``":"``
        removed.
    """
    selected_rows = variable_df.iloc[indeces[0]:indeces[1]]
    cleaned_labels = []
    for raw_label in selected_rows['Label'].values:
        without_separators = raw_label.replace("!!", " ")
        cleaned_labels.append(without_separators.replace(":", ""))
    return cleaned_labels

In [10]:
def create_year_pop_dataframe(year, labels, values):
    """Build a single-column frame of ``values`` indexed by ``labels``.

    Parameters
    ----------
    year : hashable
        Used as the name of the only column.
    labels : sequence
        Row labels; the returned frame's rows follow this order.
    values : sequence
        Values aligned positionally with ``labels`` (must be at least as
        long as ``labels``).

    Returns
    -------
    pandas.DataFrame
        One column named ``year``, reindexed to ``labels``.
    """
    value_by_label = {label: values[position] for position, label in enumerate(labels)}
    year_frame = pd.DataFrame({year: value_by_label})
    # Reindex so the rows follow the order of `labels`.
    return year_frame.reindex(labels)

In [11]:
# Maps each Census variable code requested from the API to the column name
# used in the output frame. get_data renames response columns through it.
# NOTE(review): the output displayed later in this notebook shows values in
# the hundreds of thousands under "Percent_EduAttain_25y_over", so that column
# appears to hold a count rather than a percentage — confirm the variable code.
variable_dict = OrderedDict(
    [
        ("NAME", "County Name"),
        ("DP02_0058PE", "Percent_EduAttain_25y_over"),
        ("DP02_0088PE", "Percent_BornInUS"),
        ("DP03_0002PE", "Percent_InLaborForce"),
        ("DP03_0015PE", "Percent_AllParentsInLaborForce"),
    ]
)

In [12]:
def get_data(year, state_code, variables):
    """Fetch county-level values for ``variables`` in one state and year.

    Parameters
    ----------
    year : int or str
        Survey year passed to get_query_url.
    state_code : str
        State code passed to get_query_url.
    variables : str
        Comma-separated variable names. The first one must be a text column
        (e.g. ``NAME``) because every later column is cast to float.

    Returns
    -------
    pandas.DataFrame
        One row per county. Columns whose names appear in ``variable_dict``
        are renamed to their friendly names; all columns after the first are
        floats, with missing values as NaN.

    Notes
    -----
    The float cast also applies to the trailing ``state`` and ``county``
    code columns, so a code such as "06" comes back as 6.0.
    """
    query_url = get_query_url(year, variables=variables, state_code=state_code)
    # The body is expected to be a JSON array of arrays: a header row followed
    # by data rows. json.loads turns JSON null into None directly and, unlike
    # eval, never executes the downloaded text as Python code.
    response_data = json.loads(get_query_text(query_url))
    header, rows = response_data[0], response_data[1:]
    df = pd.DataFrame(rows, columns=header)
    df = df.astype({col: float for col in df.columns[1:]})
    # rename ignores mapping keys that are not present among the columns.
    return df.rename(columns=variable_dict)

In [13]:
# Show the friendly column names that variable_dict maps the Census codes to.
list(variable_dict.values())

['County Name',
 'Percent_EduAttain_25y_over',
 'Percent_BornInUS',
 'Percent_InLaborForce',
 'Percent_AllParentsInLaborForce']

In [14]:
# get data for counties across multiple states in question
years = [2011]
# State codes for California, Oregon, Arizona, Nevada (same order as the list)
state_codes = ["06", "41", "04", "32"]

# Request exactly the variables variable_dict knows how to rename, so the
# query string and the rename mapping cannot drift apart.
requested_variables = ",".join(variable_dict.keys())

# Collect one frame per (year, state) and concatenate once at the end.
# Growing a DataFrame with concat inside the loop re-copies all rows on each
# iteration, and seeding it with an empty frame relies on deprecated dtype
# handling.
state_frames = []
for year in years:
    for state_code in state_codes:
        state_frames.append(get_data(year, state_code, requested_variables))

if state_frames:
    # Each state's own 0-based index is kept, as before.
    output = pd.concat(state_frames, axis=0)
else:
    # Nothing was requested: fall back to an empty frame with the expected columns.
    output = pd.DataFrame(columns=list(variable_dict.values()) + ["state", "county"])


In [15]:
# Display the combined frame. Each state's rows keep their own 0-based index,
# so index labels repeat across states.
output

Unnamed: 0,County Name,Percent_EduAttain_25y_over,Percent_BornInUS,Percent_InLaborForce,Percent_AllParentsInLaborForce,state,county
0,"Alameda County, California",1039030.0,67.7,65.9,64.5,6.0,1.0
1,"Butte County, California",140947.0,91.3,55.8,59.5,6.0,7.0
2,"Contra Costa County, California",716292.0,75.5,65.4,63.5,6.0,13.0
3,"El Dorado County, California",126849.0,89.4,62.8,62.7,6.0,17.0
4,"Fresno County, California",555181.0,76.6,61.4,59.3,6.0,19.0
...,...,...,...,...,...,...,...
7,"Pinal County, Arizona",250514.0,89.4,49.6,52.0,4.0,21.0
8,"Yavapai County, Arizona",157899.0,92.4,50.5,59.0,4.0,25.0
9,"Yuma County, Arizona",122782.0,74.7,52.0,61.8,4.0,27.0
0,"Clark County, Nevada",1300342.0,76.5,65.9,63.9,32.0,3.0


In [16]:
# Persist the combined frame. The path is relative to the kernel's working
# directory, and no `index=` argument is passed, so pandas' default index
# handling applies.
output.to_csv("data/demographic_data.csv")