In [170]:
import pandas as pd
import numpy as np

## Reading and Merging Data

In [171]:
# load the three raw population files, one per year
raw_paths = ("./data/2012.csv", "./data/2016.csv", "./data/2020.csv")
asr_12, asr_16, asr_20 = map(pd.read_csv, raw_paths)

In [172]:
# get projected data for only 2020
# to_numeric lets the filter match 2020 whether the column holds numbers or
# the '2020' string written below, so re-running this cell keeps the rows
is_2020 = pd.to_numeric(asr_20['year'], errors = 'coerce') == 2020
# .copy() so the assignment below writes to an independent frame rather than
# a slice of the raw data (avoids SettingWithCopyWarning)
asr_20 = asr_20[is_2020].copy()
asr_20['year'] = '2020'

# add year
asr_12['year'] = '2012'
asr_16['year'] = '2016'

In [173]:
# give asr_20 the same headers as the other dataframes so they can be merged;
# names are assigned by position, one demographic group per line
asr_20_headers = [
    'year', 'FIPS', 'County', 'Age_num', 'Age',
    'Total', 'Total Male', 'Total Female',
    'Anglo Total', 'Anglo Male', 'Anglo Female',
    'Black Total', 'Black Male', 'Black Female',
    'Hispanic Total', 'Hispanic Male', 'Hispanic Female',
    'Asian Total', 'Asian Male', 'Asian Female',
    'Other Total', 'Other Male', 'Other Female',
]
asr_20.columns = asr_20_headers

We must fold the Asian columns of `asr_20` into its Other category, because the other dataframes do not have separate data on Asian populations.

In [174]:
# add each Asian column into the matching Other column
column_pairs = [
    ('Other Total', 'Asian Total'),
    ('Other Male', 'Asian Male'),
    ('Other Female', 'Asian Female'),
]
for other_col, asian_col in column_pairs:
    asr_20[other_col] = asr_20[other_col].add(asr_20[asian_col])

In [175]:
# these columns have no counterpart in the other dataframes
asr_20 = asr_20.drop(['Age_num', 'Asian Total', 'Asian Male', 'Asian Female'], axis = 'columns')

In [176]:
# combine dataframes
asr_all = pd.concat([asr_12, asr_16, asr_20], axis = 0, ignore_index = True)

## Data Cleaning and Reformatting

In [177]:
# change column name formatting to convention (lowercase and underscores)
cols = [col.lower().replace(" ", "_") for col in asr_all.columns]
asr_all.columns = cols

# change county info to lower case for easier index merging
# (.str.lower() leaves a missing county as NaN instead of raising)
asr_all['county'] = asr_all['county'].str.lower()

# remove unnecessary columns - fips
asr_all = asr_all.drop(columns = ['fips'])

# remove total state population data and all age total population data
mask_all_tx = asr_all['county'] == "state of texas"
mask_all_ages = asr_all['age'] == 'All Ages'
# .copy() so later cells can assign columns on an independent frame rather
# than on a filtered slice (avoids SettingWithCopyWarning)
asr_all = asr_all[~mask_all_tx & ~mask_all_ages].copy()

In [178]:
def get_age_number(age_str):
    """
    Extract integer age number n from string with partial format "n year(s)" or
    "n yr(s)"; an open-ended token such as "85+" yields 85

    Input
    age_str - string - to extract integer age number from

    Output
    int age number from age_str or NaN if not found (this includes a
    non-string age_str and a non-numeric token before "year"/"yr")
    """
    # missing values (NaN/None) have no .split(); report them as "not found"
    if not isinstance(age_str, str):
        return np.nan

    words = age_str.split()
    for i, word in enumerate(words):
        lowered = word.lower()
        if "year" not in lowered and "yr" not in lowered:
            continue
        # a unit word in first position has no number before it; without this
        # guard words[i-1] would wrap around to the last word of the string
        if i == 0:
            continue
        candidate = words[i-1]
        try:
            return int(candidate)
        except ValueError:
            pass
        try:
            # open-ended bucket such as "85+": keep the part before the plus
            return int(candidate.split("+")[0])
        except ValueError:
            # not a number after all; keep scanning for another unit word
            continue
    return np.nan

In [179]:
# replace the age label with its integer value; assign() returns a new frame,
# so nothing is written into a filtered slice of an earlier frame
asr_all = asr_all.assign(age = asr_all['age'].apply(get_age_number))

# keep only rows at or above voting age; rows whose age is NaN compare False
# and are dropped as well
voting_age = 18
# .copy() so the next cell can add a column without SettingWithCopyWarning
asr_all = asr_all[asr_all['age'] >= voting_age].copy()

In [180]:
def assign_age_group(age, start_age = 18, increment_yr = 10):
    """
    Label the fixed-width age bucket that an age falls into

    Input
    age - number - age to place in a bucket
    start_age - number - lower bound of the first bucket (default 18)
    increment_yr - number - width of each bucket in years (default 10)

    Output
    string "<low> to <high>" with inclusive bounds, e.g. "18 to 27" for ages
    18 through 27 with the defaults
    """
    bucket = (age - start_age) // increment_yr
    # a float age makes bucket a float and the label "18.0 to 27.0"; a floor
    # division result is whole, so cast it back to int. NaN and inf are not
    # integers and fall through unchanged.
    if isinstance(bucket, float) and bucket.is_integer():
        bucket = int(bucket)
    low = start_age + increment_yr * bucket
    high = low + increment_yr - 1
    return f"{low} to {high}"

In [181]:
# label every row with the age bucket its age falls into
asr_all['age_group'] = asr_all['age'].map(assign_age_group)

In [182]:
# age is replaced by age_group for the aggregation; errors='ignore' keeps the
# cell re-runnable once the column is gone, without a bare except that would
# also hide unrelated failures
asr_all = asr_all.drop(columns = 'age', errors = 'ignore')

# sum per county/year/age group, then move year and age group from the row
# index into the columns so each county ends up on a single row
grouped = asr_all.groupby(['county', 'year', 'age_group']).sum()
grouped = grouped.unstack(level = 'year')
grouped = grouped.unstack(level = 'age_group').reset_index()
grouped

Unnamed: 0_level_0,county,total,total,total,total,total,total,total,total,total,...,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female,hispanic_female
year,Unnamed: 1_level_1,2012,2012,2012,2012,2012,2012,2012,2012,2016,...,2016,2016,2020,2020,2020,2020,2020,2020,2020,2020
age_group,Unnamed: 1_level_2,18 to 27,28 to 37,38 to 47,48 to 57,58 to 67,68 to 77,78 to 87,88 to 97,18 to 27,...,78 to 87,88 to 97,18 to 27,28 to 37,38 to 47,48 to 57,58 to 67,68 to 77,78 to 87,88 to 97
0,anderson county,7387.0,9358.0,9419.0,8915.0,6285.0,3832.0,2313.0,,7062.0,...,36.0,,633.0,466.0,460.0,360.0,207.0,153.0,40.0,20.0
1,andrews county,2209.0,2050.0,1909.0,2251.0,1473.0,884.0,621.0,,2450.0,...,99.0,,1054.0,1131.0,809.0,541.0,362.0,213.0,92.0,30.0
2,angelina county,11619.0,10862.0,11243.0,11881.0,9576.0,6062.0,4098.0,,12110.0,...,108.0,,1654.0,1335.0,1337.0,971.0,666.0,342.0,123.0,30.0
3,aransas county,2353.0,2019.0,2457.0,3557.0,3908.0,3248.0,1753.0,,2458.0,...,130.0,,621.0,596.0,454.0,410.0,362.0,243.0,121.0,31.0
4,archer county,1022.0,832.0,1156.0,1587.0,1111.0,776.0,462.0,,1213.0,...,11.0,,67.0,49.0,42.0,43.0,25.0,7.0,12.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,wood county,4415.0,3698.0,4444.0,5854.0,6627.0,5734.0,3067.0,,4751.0,...,35.0,,370.0,304.0,242.0,188.0,181.0,65.0,49.0,12.0
250,yoakum county,1040.0,1005.0,941.0,1070.0,746.0,457.0,309.0,,1184.0,...,52.0,,465.0,436.0,309.0,309.0,218.0,134.0,71.0,20.0
251,young county,2122.0,1982.0,2185.0,2796.0,2360.0,1601.0,1333.0,,2105.0,...,27.0,,190.0,296.0,219.0,172.0,163.0,56.0,27.0,13.0
252,zapata county,2287.0,1850.0,1614.0,1429.0,1182.0,791.0,471.0,,2208.0,...,178.0,,951.0,940.0,838.0,757.0,604.0,489.0,179.0,50.0


In [184]:
# flatten the multi-level column index into single underscore-joined strings;
# the guard stops a re-run from joining the characters of already-flat names
# (e.g. "county" -> "c_o_u_n_t_y")
if isinstance(grouped.columns, pd.MultiIndex):
    grouped.columns = ["_".join(col_tuple) for col_tuple in grouped.columns]

In [187]:
# joining the county column's empty lower levels leaves trailing underscores
grouped = grouped.rename(columns = {'county__' : 'county'})

In [188]:
# preview the first rows to check the flattened column names
grouped.head(n = 5)

Unnamed: 0,county,total_2012_18 to 27,total_2012_28 to 37,total_2012_38 to 47,total_2012_48 to 57,total_2012_58 to 67,total_2012_68 to 77,total_2012_78 to 87,total_2012_88 to 97,total_2016_18 to 27,...,hispanic_female_2016_78 to 87,hispanic_female_2016_88 to 97,hispanic_female_2020_18 to 27,hispanic_female_2020_28 to 37,hispanic_female_2020_38 to 47,hispanic_female_2020_48 to 57,hispanic_female_2020_58 to 67,hispanic_female_2020_68 to 77,hispanic_female_2020_78 to 87,hispanic_female_2020_88 to 97
0,anderson county,7387.0,9358.0,9419.0,8915.0,6285.0,3832.0,2313.0,,7062.0,...,36.0,,633.0,466.0,460.0,360.0,207.0,153.0,40.0,20.0
1,andrews county,2209.0,2050.0,1909.0,2251.0,1473.0,884.0,621.0,,2450.0,...,99.0,,1054.0,1131.0,809.0,541.0,362.0,213.0,92.0,30.0
2,angelina county,11619.0,10862.0,11243.0,11881.0,9576.0,6062.0,4098.0,,12110.0,...,108.0,,1654.0,1335.0,1337.0,971.0,666.0,342.0,123.0,30.0
3,aransas county,2353.0,2019.0,2457.0,3557.0,3908.0,3248.0,1753.0,,2458.0,...,130.0,,621.0,596.0,454.0,410.0,362.0,243.0,121.0,31.0
4,archer county,1022.0,832.0,1156.0,1587.0,1111.0,776.0,462.0,,1213.0,...,11.0,,67.0,49.0,42.0,43.0,25.0,7.0,12.0,5.0


In [189]:
# write the county-level table to disk without the positional index
output_path = "./data/asr_12_16_20.csv"
grouped.to_csv(output_path, index = False)