In [1]:
import ast
import json

import pandas as pd
import requests
from dotenv import dotenv_values
from tqdm.notebook import tqdm

In [2]:
# Mapping of key-value pairs read from the project's .env file (secret keys).
config = dotenv_values("../.env")

# Fail fast with an actionable message: a missing file/key (or a key declared
# without a value) would otherwise surface later as a bare KeyError or as a
# literal "key=None" inside the request URL.
CENSUS_API_KEY = config.get('CENSUS_API_KEY')
if CENSUS_API_KEY is None:
    raise KeyError("CENSUS_API_KEY is not set in ../.env")

In [3]:
def get_variable_table_df(year):
    """Download the ACS 1-year variable listing for ``year`` as a DataFrame.

    Reads the first HTML table found at
    ``https://api.census.gov/data/{year}/acs/acs1/variables.html`` and
    normalises its ``Label`` column so labels are comparable across years:
    every ``!!`` separator becomes a single space and every ``:`` is removed.

    Parameters
    ----------
    year : int or str
        Survey year, interpolated into the URL.

    Returns
    -------
    pandas.DataFrame
        The variable table with the cleaned ``Label`` column.
    """
    variable_table_url = f'https://api.census.gov/data/{year}/acs/acs1/variables.html'
    variable_df = pd.read_html(variable_table_url)[0]

    # Assign the result back instead of calling replace(..., inplace=True) on
    # the selected column: the chained in-place form is deprecated and does
    # not modify the frame when pandas Copy-on-Write is active.
    variable_df['Label'] = variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True)
    return variable_df

In [4]:
def _first_true_position(mask):
    """Return the 0-based row position of the first True in a boolean Series.

    Raises IndexError when the mask contains no True value.
    """
    return int(mask.to_numpy(dtype=bool, na_value=False).nonzero()[0][0])


def get_male_by_age_index(variable_table):
    """Locate the block of "male by age" rows in the variable table.

    The block starts at the first row whose ``Label`` equals
    ``'Estimate Total Male'`` and ends at the first row at or after it whose
    ``Label`` equals ``'Estimate Total Male 85 years and over'``.

    Parameters
    ----------
    variable_table : pandas.DataFrame
        Table with a ``Label`` column.

    Returns
    -------
    tuple of (int, int)
        ``(start, stop)`` row positions, with ``stop`` exclusive, suitable
        for ``variable_table.iloc[start:stop]``.

    Raises
    ------
    IndexError
        If either label is not present.
    """
    labels = variable_table['Label']

    # Positions (not index labels) are returned because the callers slice
    # with .iloc; the two only coincide for a default RangeIndex.
    start_index = _first_true_position(labels == 'Estimate Total Male')

    # Look for the closing label from the start row onward so the range can
    # never end before it begins.
    remaining = labels.iloc[start_index:]
    end_index = start_index + _first_true_position(
        remaining == 'Estimate Total Male 85 years and over'
    )
    return start_index, end_index + 1

In [5]:
def get_variable_names(variable_table, indeces):
    """Join the ``Name`` values of a row range into one comma-separated string.

    Parameters
    ----------
    variable_table : pandas.DataFrame
        Table with a ``Name`` column.
    indeces : sequence
        ``(start, stop)`` positions; rows ``start`` up to, but not including,
        ``stop`` are selected with ``.iloc``.

    Returns
    -------
    str
        The selected names separated by commas (empty string for an empty
        range).
    """
    first_row, stop_row = indeces[0], indeces[1]
    selected_rows = variable_table.iloc[first_row:stop_row]
    return ",".join(selected_rows['Name'].values)

In [15]:
def get_query_url(year, variables):
    """Build the ACS 1-year query URL for ``variables`` across all states.

    The URL is the concatenation of the API host, the year, the
    ``/acs/acs1`` dataset path, the ``get`` variable list, the fixed
    ``for=state:*`` geography and the module-level ``CENSUS_API_KEY``.
    Because the key is embedded in the returned string, avoid printing or
    logging the URL.

    Parameters
    ----------
    year : int or str
        Survey year placed in the path.
    variables : str
        Comma-separated variable names for the ``get`` parameter.

    Returns
    -------
    str
        The complete request URL.
    """
    # API Reference: https://www.census.gov/data/developers/guidance/api-user-guide.Example_API_Queries.html
    # Data Dictionary: https://api.census.gov/data.html
    url_parts = [
        'https://api.census.gov/data',  # host
        f'/{year}',                     # survey year
        '/acs/acs1',                    # dataset acronym
        '?get=',
        f"{variables}",
        '&for=state:*',                 # geography
        f"&key={CENSUS_API_KEY}",
    ]
    return "".join(url_parts)

In [16]:
def get_query_text(query_url, timeout=30):
    """Fetch ``query_url`` with an HTTP GET and return the response body.

    Parameters
    ----------
    query_url : str
        Fully built request URL.
    timeout : float or tuple, optional
        Passed to ``requests.get``; without it a stalled connection would
        block the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The decoded response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(query_url, timeout=timeout)

    # Stop on error statuses instead of handing an error page to the parser.
    # The message deliberately omits the URL, which may carry the API key.
    if not response.ok:
        raise requests.HTTPError(
            f"Request failed with HTTP status {response.status_code}",
            response=response,
        )
    return response.text

In [18]:
def get_values_from_response(response_text):
    """Parse a JSON response body and return its second row as integers.

    The body is decoded with ``json.loads`` (it is JSON, not a Python
    literal, so ``null``/``true``/``false`` anywhere in it must parse), and
    every cell of the row at index 1 is converted with ``int``.

    NOTE(review): only the row at index 1 is used (row 0 is presumably the
    header row - confirm against the API guide). The query built in this
    file asks for ``for=state:*``, so that row is a single geography, and
    its trailing geography code is converted and returned along with the
    estimates. Confirm this is the intended population.

    Parameters
    ----------
    response_text : str
        JSON text encoding a list of rows.

    Returns
    -------
    list of int
        The integer values of the row at index 1.

    Raises
    ------
    ValueError
        If the text is not valid JSON or a cell is not an integer string.
    IndexError
        If the response has fewer than two rows.
    TypeError
        If a cell in the row at index 1 is ``null``.
    """
    rows = json.loads(response_text)
    first_data_row = rows[1]
    return [int(cell) for cell in first_data_row]

In [19]:
def get_labels(variable_df, indeces):
    """Return cleaned ``Label`` strings for a row range of the variable table.

    Each label has ``!!`` replaced by a single space and ``:`` removed.

    Parameters
    ----------
    variable_df : pandas.DataFrame
        Table with a ``Label`` column of strings.
    indeces : sequence
        ``(start, stop)`` positions; rows ``start`` up to, but not including,
        ``stop`` are selected with ``.iloc``.

    Returns
    -------
    list of str
        Cleaned labels in table order.
    """
    first_row, stop_row = indeces[0], indeces[1]
    raw_labels = variable_df.iloc[first_row:stop_row]['Label'].values

    cleaned = []
    for raw in raw_labels:
        without_separators = raw.replace("!!", " ")
        cleaned.append(without_separators.replace(":", ""))
    return cleaned

In [20]:
def create_year_pop_dataframe(year, labels, values):
    """Build a single-column DataFrame of ``values`` indexed by ``labels``.

    ``values`` is read by position for each entry of ``labels``; any extra
    trailing values are ignored, and a shorter ``values`` raises
    ``IndexError``.

    Parameters
    ----------
    year : hashable
        Used as the name of the only column.
    labels : sequence of str
        Row labels, in the order the rows should appear.
    values : sequence
        Values aligned by position with ``labels``.

    Returns
    -------
    pandas.DataFrame
        One column named ``year``, rows ordered as in ``labels``.
    """
    value_by_label = {}
    for position, label in enumerate(labels):
        value_by_label[label] = values[position]

    frame = pd.DataFrame({year: value_by_label})
    # Reindex so the row order follows ``labels``.
    return frame.reindex(labels)

In [31]:
def create_male_pop_by_age_df(year):
    """Run the whole pipeline for one ``year`` and return its DataFrame.

    Steps, in order: download the variable table, locate the male-by-age
    row range, build the query URL from the variable names in that range,
    fetch the response, convert it to integer values, collect the matching
    labels, and assemble a one-column DataFrame named after ``year``.

    Parameters
    ----------
    year : int or str
        Survey year to query.

    Returns
    -------
    pandas.DataFrame
        Result of ``create_year_pop_dataframe`` for this year.
    """
    variable_table = get_variable_table_df(year)
    row_range = get_male_by_age_index(variable_table)

    url = get_query_url(year, get_variable_names(variable_table, row_range))
    counts = get_values_from_response(get_query_text(url))

    row_labels = get_labels(variable_table, row_range)
    return create_year_pop_dataframe(year, row_labels, counts)

In [32]:
# Run the pipeline for a single year (2019) and display the resulting DataFrame.
create_male_pop_by_age_df(2019)

Unnamed: 0,2019
Estimate Total Male,6217305
Estimate Total Male Under 5 years,378778
Estimate Total Male 5 to 9 years,383859
Estimate Total Male 10 to 14 years,420715
Estimate Total Male 15 to 17 years,249124
Estimate Total Male 18 and 19 years,168973
Estimate Total Male 20 years,84744
Estimate Total Male 21 years,86507
Estimate Total Male 22 to 24 years,252191
Estimate Total Male 25 to 29 years,450372


In [22]:
# Survey years to pull; range() excludes its upper bound.
years = list(range(2018, 2020))

# Start from an empty frame holding only the join key so every year can be
# outer-merged onto it as a new column.
male_pop_by_age_df = pd.DataFrame(columns=['Population Label'])
skipped_years = []
for year in tqdm(years):
    try:
        y_df = create_male_pop_by_age_df(year).reset_index().rename({'index': 'Population Label'}, axis=1)
        male_pop_by_age_df = pd.merge(male_pop_by_age_df, y_df, how='outer', on='Population Label')
    except IndexError:
        # Best effort: leave this year out (for example when the expected
        # labels are not found in its variable table), but keep a record so
        # the gap is visible.
        skipped_years.append(year)
        continue

if skipped_years:
    print(f"Skipped years (IndexError while building the table): {skipped_years}")

  0%|          | 0/2 [00:00<?, ?it/s]

In [23]:
# Display the merged table: one row per population label, one column per year.
male_pop_by_age_df

Unnamed: 0,Population Label,2018,2019
0,Estimate Total Male,2369271,6217305
1,Estimate Total Male Under 5 years,149436,378778
2,Estimate Total Male 5 to 9 years,151958,383859
3,Estimate Total Male 10 to 14 years,164696,420715
4,Estimate Total Male 15 to 17 years,97945,249124
5,Estimate Total Male 18 and 19 years,72354,168973
6,Estimate Total Male 20 years,38027,84744
7,Estimate Total Male 21 years,35913,86507
8,Estimate Total Male 22 to 24 years,87578,252191
9,Estimate Total Male 25 to 29 years,162850,450372


In [13]:
# Name the extract after the years actually requested, so a run over a
# narrower range cannot overwrite (or be mistaken for) a wider extract.
output_path = f'data_extract/male_pop_by_age_{years[0]}-{years[-1]}.csv'
male_pop_by_age_df.to_csv(output_path)