In [1]:
import pandas as pd
import os

In [2]:
'''
Explanation of data variables:

    ACS - American Community Survey 5-year estimates
    DEC - Decennial Census 

    income:    Household Income (Brackets)
        B19001 (ACS 2009 - 2016)
        P052   (DEC 2000)
        
    education: Education Attained (Brackets - Percentages)
        S1501  (ACS 2009 - 2016)
        QTP20  (DEC 2000)
        
    race:      Percent White
        B02001 (ACS 2009 - 2016)
        P007   (DEC 2000)
        
    household: Percent Families (approximation for population density)
        B11001 (ACS 2009 - 2016)
        QTP10  (DEC 2000)
        
    rent:      Gross Rent (Brackets)
        B25063 (ACS 2009 - 2016)
        H062   (DEC 2000)

    value:     Owner Occupied Home value
        B25075 (ACS 2009 - 2016)
        H084   (DEC 2000)

    ### age is not used below; its tables are listed for reference only ###
    age:       Age of Structure (year built)
        B25034 (ACS 2009 - 2016)
        H034   (DEC 2000)
'''

# Folder holding the raw census CSV downloads, resolved against the directory
# the notebook is run from.
raw_data_dir = os.path.join(os.getcwd(), 'data', 'raw')

# Census table id -> short variable name, for the ACS tables ...
acs_data_vars = {
    'B19001': 'income',
    'S1501': 'education',
    'B02001': 'race',
    'B11001': 'household',
    'B25063': 'rent',
    'B25075': 'value',
}

# ... and for the Decennial Census tables.
dec_data_vars = {
    'P052': 'income',
    'QTP20': 'education',
    'P007': 'race',
    'QTP10': 'household',
    'H062': 'rent',
    'H084': 'value',
}

# Template with one blank file-path slot per variable.
empty_vars = {
    variable: ''
    for variable in ('income', 'education', 'race', 'household', 'rent', 'value')
}

# year -> {variable -> file path}; every year gets its own copy of the
# template so that filling one year never touches another.
data_dict = {
    survey_year: empty_vars.copy()
    for survey_year in ['2000'] + [str(y) for y in range(2009, 2017)]
}

# Register every recognised raw file under its (year, variable) slot in data_dict.
# A file name is split on '_': the second token supplies the two-digit year and
# the fourth token the census table id.
# Names are visited in sorted order because os.listdir returns them in arbitrary
# order; sorting makes the outcome repeatable.
# NOTE(review): two files sharing the same year and table id overwrite each
# other (the later name wins) - confirm data/raw holds one file per table.
for fil in sorted(os.listdir(raw_data_dir)):
    fil_spl = fil.split('_')
    if len(fil_spl) < 4:
        # Stray entries (hidden files, checkpoints, ...) do not follow the
        # naming scheme and previously crashed the loop with an IndexError.
        print('Skipping unrecognised file in raw data directory: ' + fil)
        continue

    file_year = '20' + fil_spl[1]
    table_id = fil_spl[3]
    if table_id in acs_data_vars:
        variable = acs_data_vars[table_id]
    elif table_id in dec_data_vars:
        variable = dec_data_vars[table_id]
    else:
        variable = None

    if variable is None or file_year not in data_dict:
        # Unknown table or year: skip it instead of raising a KeyError.
        print('Skipping unrecognised file in raw data directory: ' + fil)
        continue

    data_dict[file_year][variable] = os.path.join(raw_data_dir, fil)

In [13]:
def read_census_data():
    """Load the six Decennial Census 2000 tables listed in ``data_dict['2000']``.

    Each CSV is read with its first line skipped and only the needed column
    positions kept. Rows are indexed by the tract number parsed from the
    'Geography' column (index name 'tract') and columns get short names.

    Returns:
        tuple: ``(income, education, race, household, rent, value)`` DataFrames.
    """
    paths = data_dict['2000']

    def tract_number(label):
        # Third whitespace-separated token of the text before the first comma.
        return label.split(',')[0].split()[2]

    def load(variable, positions):
        # Read the chosen column positions and index the rows by tract number.
        table = pd.read_csv(paths[variable], skiprows=1, usecols=positions)
        table.index = table['Geography'].apply(tract_number)
        table.index.name = 'tract'
        return table

    # --- income: household counts per income bracket ---
    income = load('income', list(range(1, 20)))
    income.columns = [
        'id2', 'geography', 'total',
        '<10k', '[10k-15k)', '[15k-20k)', '[20k-25k)', '[25k-30k)', '[30k-35k)',
        '[35k-40k)', '[40k-45k)', '[45k-50k)', '[50k-60k)', '[60k-75k)',
        '[75k-100k)', '[100k-125k)', '[125k-150k)', '[150k-200k)', '>200k',
    ]

    # --- education: every third column from position 20 through 53 ---
    education = load('education', [3, 4] + list(range(20, 54, 3)))
    education.columns = [
        'id2', 'geography', 'total', '<5', '5-8', '9-12', 'high_school',
        'some_college_1', 'some_college_2', 'associate', 'bachelor',
        'master', 'professional', 'doctorate',
    ]
    # Fold the finer brackets into the coarser ones that are returned.
    education = education.assign(**{
        '<9': education['<5'] + education['5-8'],
        'some_college': education['some_college_1'] + education['some_college_2'],
        'graduate': (education['master'] + education['professional']
                     + education['doctorate']),
    })
    education = education[[
        'id2', 'geography', 'total', '<9', '9-12', 'high_school',
        'some_college', 'associate', 'bachelor', 'graduate',
    ]]

    # --- race: share of 'White alone' in the total, as a percentage ---
    race = load('race', [1, 2, 3, 4])
    white_share = race['White alone'] / race['Total:'] * 100
    race = race[['Id2', 'Geography', 'Total:']].assign(percentage_white=white_share)
    race = race.rename(columns={'Id2': 'id2', 'Geography': 'geography',
                                'Total:': 'total'})

    # --- household: family households as a percentage of all households ---
    household = load('household', [3, 4, 5, 7])
    household.columns = ['id2', 'geography', 'total_households', 'family_households']
    family_share = household['family_households'] / household['total_households'] * 100
    household = household[['id2', 'geography', 'total_households']].assign(
        percent_family=family_share)

    # --- rent: counts per gross-rent bracket (column position 3 is not read) ---
    rent = load('rent', [1, 2] + list(range(4, 26)))
    rent.columns = [
        'id2', 'geography', 'total',
        '<100', '[100-150)', '[150-200)', '[200-250)', '[250-300)', '[300-350)',
        '[350-400)', '[400-450)', '[450-500)', '[500-550)', '[550-600)',
        '[600-650)', '[650-700)', '[700-750)', '[750-800)', '[800-900)',
        '[900-1000)', '[1000-1250)', '[1250-1500)', '[1500-2000)', '>2000',
    ]

    # --- value: counts per owner-occupied home value bracket ---
    value = load('value', list(range(1, 28)))
    value.columns = [
        'id2', 'geography', 'total',
        '<10k', '[10k-15k)', '[15k-20k)', '[20k-25k)', '[25k-30k)', '[30k-35k)',
        '[35k-40k)', '[40k-50k)', '[50k-60k)', '[60k-70k)', '[70k-80k)',
        '[80k-90k)', '[90k-100k)', '[100k-125k)', '[125k-150k)', '[150k-175k)',
        '[175k-200k)', '[200k-250k)', '[250k-300k)', '[300k-400k)', '[400k-500k)',
        '[500k-750k)', '[750k-1M)', '>1M',
    ]

    return income, education, race, household, rent, value

In [4]:
def read_acs_data(year):
    """Load the six ACS tables registered in ``data_dict`` for one survey year.

    Every table is read from the file of the requested ``year``. Previously
    income and race always came from the 2016 files, and the education
    fallback branch (used for '2015') also read the 2016 file.

    Each CSV is read with its first line skipped and only the needed column
    positions kept. Rows are indexed by the tract number parsed from the
    'Geography' column (index name 'tract') and columns get short names.

    Args:
        year: Survey year used as the key into ``data_dict`` (e.g. ``'2012'``).
            Non-string values such as ``2012`` are converted with ``str``.

    Returns:
        tuple: ``(income, education, race, household, rent, value)`` DataFrames.

    Raises:
        KeyError: If ``year`` is not a key of ``data_dict``.
    """
    year = str(year)
    paths = data_dict[year]

    def tract_number(label):
        # Third whitespace-separated token of the text before the first comma.
        return label.split(',')[0].split()[2]

    def load(variable, positions):
        # Read only the requested column positions of this year's file.
        return pd.read_csv(paths[variable], skiprows=1, usecols=positions)

    def finish(table, names):
        # Index by tract number, then swap the file headers for short names.
        table.index = table['Geography'].apply(tract_number)
        table.index.name = 'tract'
        table.columns = names
        return table

    def collapse_tail(table, start, label):
        # Sum every column from position `start` onwards into a single column
        # called `label`, so newer files end up with the older bracket set.
        tail = table.columns[start:]
        table[label] = table[tail].sum(axis=1)
        return table.drop(columns=tail)

    # Years whose rent and value files already end at the '>2000' / '>1M' bracket.
    short_brackets = year in ('2009', '2010', '2011', '2012', '2013', '2014')

    # --- income ---
    # Every other column from position 3 on; presumably the estimate columns,
    # skipping the interleaved margins of error - TODO confirm.
    income = finish(
        load('income', [1, 2] + list(range(3, 36, 2))),
        ['id2', 'geography', 'total', '<10k', '[10k-15k)', '[15k-20k)',
         '[20k-25k)', '[25k-30k)', '[30k-35k)', '[35k-40k)', '[40k-45k)',
         '[45k-50k)', '[50k-60k)', '[60k-75k)', '[75k-100k)', '[100k-125k)',
         '[125k-150k)', '[150k-200k)', '>200k'])

    # --- education: the column positions differ between file vintages ---
    if year in ('2009', '2010', '2011', '2012', '2013'):
        education_positions = [1, 2, 33, 39, 45, 51, 57, 63, 69, 75]
    elif year == '2014':
        education_positions = [1, 2, 8, 9, 10, 11, 12, 13, 14, 15]
    else:
        # NOTE(review): assumes every remaining year (2015, 2016) shares this
        # layout, as the original branch structure implies - confirm.
        education_positions = [1, 2, 13, 15, 17, 19, 21, 23, 25, 27]
    education = finish(
        load('education', education_positions),
        ['id2', 'geography', 'total', '<9', '9-12', 'high_school', 'some_college',
         'associate', 'bachelor', 'graduate'])

    # --- race: share of 'white_alone' in the total, as a percentage ---
    race = finish(load('race', [1, 2, 3, 5]),
                  ['id2', 'geography', 'total', 'white_alone'])
    race['percentage_white'] = race['white_alone'] / race['total'] * 100
    race = race[['id2', 'geography', 'total', 'percentage_white']]

    # --- household: family households as a percentage of all households ---
    household = finish(load('household', [1, 2, 3, 5]),
                       ['id2', 'geography', 'total_households', 'family_households'])
    household['percent_family'] = (household['family_households']
                                   / household['total_households'] * 100)
    household = household[['id2', 'geography', 'total_households', 'percent_family']]

    # --- rent ---
    if short_brackets:
        rent = load('rent', [1, 2] + list(range(5, 48, 2)))
    else:
        # Later files carry four columns where older ones have one; add them
        # up so the result keeps a single '>2000' column.
        rent = load('rent', [1, 2] + list(range(5, 54, 2)))
        rent = collapse_tail(rent, 23, '>2000')
    rent = finish(
        rent,
        ['id2', 'geography', 'total', '<100', '[100-150)', '[150-200)', '[200-250)',
         '[250-300)', '[300-350)', '[350-400)', '[400-450)', '[450-500)', '[500-550)',
         '[550-600)', '[600-650)', '[650-700)', '[700-750)', '[750-800)', '[800-900)',
         '[900-1000)', '[1000-1250)', '[1250-1500)', '[1500-2000)', '>2000'])

    # --- value ---
    if short_brackets:
        value = load('value', [1, 2] + list(range(3, 52, 2)))
    else:
        # Same idea as rent: three trailing columns are summed into '>1M'.
        value = load('value', [1, 2] + list(range(3, 56, 2)))
        value = collapse_tail(value, 26, '>1M')
    value = finish(
        value,
        ['id2', 'geography', 'total', '<10k', '[10k-15k)', '[15k-20k)', '[20k-25k)',
         '[25k-30k)', '[30k-35k)', '[35k-40k)', '[40k-50k)', '[50k-60k)', '[60k-70k)',
         '[70k-80k)', '[80k-90k)', '[90k-100k)', '[100k-125k)', '[125k-150k)',
         '[150k-175k)', '[175k-200k)', '[200k-250k)', '[250k-300k)', '[300k-400k)',
         '[400k-500k)', '[500k-750k)', '[750k-1M)', '>1M'])

    return income, education, race, household, rent, value

In [5]:
# 'Id2' identifiers listed in the 2000 income table and in the 2016 income table.
# NOTE(review): the second series is named geoid_2010 although it is read from
# the 2016 file - presumably because that file uses 2010 tract boundaries; confirm.
geoid_2000, geoid_2010 = (
    pd.read_csv(data_dict[vintage]['income'], skiprows=1)['Id2']
    for vintage in ('2000', '2016')
)

ids_2000, ids_2010 = set(geoid_2000), set(geoid_2010)

# Identifiers found in both files would be ids_2000 & ids_2010 (not needed here).
# removed: identifiers only in the 2000 file; added: identifiers only in the 2016 file.
removed = list(ids_2000 - ids_2010)
added = list(ids_2010 - ids_2000)

In [16]:
shapefile_dir = os.path.join(os.getcwd(), 'census_tract_shapefile')

# Tract relation file, trimmed to the columns used below and to rows whose
# 'county00' is one of the listed codes.
# NOTE(review): presumably these are the county codes of the study area - confirm.
tract_relations = pd.read_csv(os.path.join(shapefile_dir, 'census_tract_relation_file.csv'),
                              usecols=[1, 2, 3, 6, 11, 12, 15, 25, 26])
in_listed_counties = tract_relations['county00'].isin([21, 55, 209, 453, 491])
tract_relations = tract_relations[in_listed_counties]

# Keep relation rows that link a removed 2000 identifier to an added 2010
# identifier and pass both 'poppct' thresholds.
links_changed_tracts = (
    tract_relations['geoid00'].isin(removed)
    & tract_relations['geoid10'].isin(added)
    & (tract_relations['poppct00'] > 0)
    & (tract_relations['poppct10'] > 1)
)
need_to_change = tract_relations[links_changed_tracts]

# Save the selection next to the relation file.
need_to_change.to_csv(os.path.join(shapefile_dir, 'need_to_change.csv'))

In [40]:
# Merged: tracts of the 2000 census that were combined into a single 2010 tract;
# selected as the rows whose 'poppct00' equals 100.
merged = need_to_change.loc[need_to_change['poppct00'].eq(100)]
# Split: a 2000 tract that was divided into several 2010 tracts;
# selected as the rows whose 'poppct10' exceeds 95.
split = need_to_change.loc[need_to_change['poppct10'].gt(95)]