In [1]:
import pandas as pd
import os

In [2]:
# Folder holding the raw CSV downloads, resolved against the current working
# directory (so the notebook has to be started from the project root).
raw_data_dir = os.path.join(os.getcwd(), 'data', 'raw')

# Template for one year's entry: variable name -> path of its raw CSV.
# '' means "no matching file found".
empty_vars = {'income': '', 'education': '', 'race': '',
              'household': '', 'rent': '', 'value': ''}

# Table code (4th '_'-separated token of a file name) -> variable name.
# NOTE(review): the dict names suggest ACS vs. decennial-census table codes --
# confirm against the data source.
acs_data_vars = {'B19001': 'income', 'S1501': 'education', 'B02001': 'race',
                 'B11001': 'household', 'B25063': 'rent', 'B25075': 'value'}

dec_data_vars = {'P052': 'income', 'QTP20': 'education', 'P007': 'race',
                 'QTP10': 'household', 'H062': 'rent', 'H084': 'value'}

# year -> {variable -> path}; every year gets its own copy of the template.
data_dict = {
    year: empty_vars.copy()
    for year in ('2000', '2009', '2010', '2011', '2012',
                 '2013', '2014', '2015', '2016')
}

for fil in os.listdir(raw_data_dir):
    fil_spl = fil.split('_')

    # The year is read from token 1 (two digits) and the table code from
    # token 3. Entries with fewer tokens (e.g. '.DS_Store',
    # '.ipynb_checkpoints') used to raise IndexError here; report them instead.
    if len(fil_spl) < 4:
        print('Unexpected file not processed: {}'.format(fil))
        continue

    fil_year = '20' + fil_spl[1]
    # Same lookup priority as before: acs_data_vars first, then dec_data_vars.
    fil_var = acs_data_vars.get(fil_spl[3], dec_data_vars.get(fil_spl[3]))

    # An unknown table code or a year that is not tracked is reported rather
    # than raising KeyError.
    if fil_var is None or fil_year not in data_dict:
        print('Unexpected file not processed: {}'.format(fil))
        continue

    data_dict[fil_year][fil_var] = os.path.join(raw_data_dir, fil)

# `data` starts out with the same year -> variable -> path contents as
# data_dict, but with its own inner dicts. The previous
# `data = data_dict.copy()` was a shallow copy that shared the per-year dicts,
# so assigning data[year][var] would also have overwritten the path stored in
# data_dict (which the read_* functions rely on).
data = {year: paths.copy() for year, paths in data_dict.items()}

In [7]:
# Tract ids ('Id2' column) listed in the 2000 and the 2016 income tables.
# NOTE(review): geoid_2010 is loaded from the 2016 file -- presumably because
# that file uses 2010 tract boundaries; confirm.
geoid_2000 = pd.read_csv(data_dict['2000']['income'], skiprows=1)['Id2']
geoid_2010 = pd.read_csv(data_dict['2016']['income'], skiprows=1)['Id2']

# Ids that appear in only one of the two tables.
removed = list(set(geoid_2000) - set(geoid_2010))
added = list(set(geoid_2010) - set(geoid_2000))

# Relationship file, restricted to the selected columns and to the five
# county00 codes of interest.
tract_relations = (
    pd.read_csv(
        os.path.join(os.getcwd(), 'census_tract_shapefile', 'census_tract_relation_file.csv'),
        usecols=[1, 2, 3, 6, 11, 12, 15, 25, 26],
    )
    .loc[lambda rel: rel['county00'].isin([21, 55, 209, 453, 491])]
)

# Keep relations that link a removed 2000 id to an added 2010 id and pass
# both population-percentage thresholds.
need_to_change = (
    tract_relations
    .loc[lambda rel: rel['geoid00'].isin(removed)]
    .loc[lambda rel: rel['geoid10'].isin(added)]
    .loc[lambda rel: rel['poppct00'] > 0]
    .loc[lambda rel: rel['poppct10'] > 1]
)

need_to_change.to_csv(os.path.join(os.getcwd(), 'census_tract_shapefile', 'need_to_change.csv'))

# Relations with poppct00 above 95 and with poppct10 above 95, respectively.
merged = need_to_change.loc[need_to_change['poppct00'] > 95]
split = need_to_change.loc[need_to_change['poppct10'] > 95]

In [8]:
def fix_tracts(df):
    '''
    Placeholder that has not been implemented yet.

    df is accepted but ignored; the function always returns None.
    '''
    return None

In [None]:
def fill_missing(data):
    '''
    Stub without an implementation; always returns None.

    data is expected to be a dictionary of dataframes.
    '''
    return None

In [9]:
def read_income(year):
    '''
    Load the income table for one year.

    year is the string key into data_dict (e.g. '2000'); the CSV path is
    taken from data_dict[year]['income']. The first line of the file is
    skipped. Returns a DataFrame indexed by 'geoid' (the file's 'Id2'
    column) with a 'total' column followed by one column per income bracket.
    '''
    bracket_labels = [
        'total', '<10k', '[10k-15k)', '[15k-20k)',
        '[20k-25k)', '[25k-30k)', '[30k-35k)', '[35k-40k)', '[40k-45k)',
        '[45k-50k)', '[50k-60k)', '[60k-75k)', '[75k-100k)', '[100k-125k)',
        '[125k-150k)', '[150k-200k)', '>200k',
    ]

    if year == '2000':
        # id column plus the contiguous block of columns 3..19
        wanted = [1] + list(range(3, 20))
    else:
        # id column plus every second column from 3 to 35
        # (presumably skipping interleaved margin-of-error columns -- confirm)
        wanted = [1] + list(range(3, 36, 2))

    table = pd.read_csv(data_dict[year]['income'], skiprows=1, usecols=wanted)
    table = table.set_index('Id2')
    table.index.name = 'geoid'
    table.columns = bracket_labels
    return table

In [55]:
def read_education(year):
    '''
    Load the education table for one year as counts per attainment level.

    year is the string key into data_dict; the CSV path is taken from
    data_dict[year]['education']. The first line of the file is skipped and
    '-' is read as missing. The result is indexed by 'geoid' (the file's
    'Id2' column) and always has the columns
    total, <9, 9-12, high_school, some_college, associate, bachelor, graduate.

    - '2000': twelve finer-grained columns are summed into those eight.
    - '2015' / '2016': the selected columns are used as they are.
    - any other year: missing values become 0 and each non-total column is
      treated as a percentage of 'total' and converted to a rounded int.
    '''
    final_cols = ['total', '<9', '9-12', 'high_school', 'some_college',
                  'associate', 'bachelor', 'graduate']

    # Column positions to read; the layout differs between file vintages.
    if year == '2000':
        wanted = [3] + list(range(20, 54, 3))      # 3, 20, 23, ..., 53
    elif year in ('2009', '2010', '2011', '2012', '2013'):
        wanted = [1] + list(range(33, 76, 6))      # 1, 33, 39, ..., 75
    elif year == '2014':
        wanted = [1] + list(range(8, 16))          # 1, 8, 9, ..., 15
    else:
        wanted = [1] + list(range(13, 28, 2))      # 1, 13, 15, ..., 27

    table = pd.read_csv(data_dict[year]['education'], skiprows=1,
                        usecols=wanted, na_values=['-'])
    table = table.set_index('Id2')
    table.index.name = 'geoid'

    if year == '2000':
        table.columns = ['total', '<5', '5-8', '9-12', 'high_school', 'some_college_1',
                         'some_college_2', 'associate', 'bachelor', 'master',
                         'professional', 'doctorate']
        # Collapse the finer categories; the source columns are dropped by the
        # final column selection.
        table['<9'] = table['<5'] + table['5-8']
        table['some_college'] = table['some_college_1'] + table['some_college_2']
        table['graduate'] = table['master'] + table['professional'] + table['doctorate']
        return table[final_cols]

    table.columns = final_cols
    if year in ('2015', '2016'):
        return table

    # not actually missing - Austin-Bergstrom Intl. Airport
    table = table.fillna(0)
    for share in final_cols[1:]:
        table[share] = (table[share] / 100 * table['total']).round(0).astype('int')

    return table

In [56]:
# Sanity check: print the column / dtype / non-null summary of the 2010 table.
read_education('2010').info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350 entries, 48021950100 to 48491021603
Data columns (total 8 columns):
total           350 non-null int64
<9              350 non-null int64
9-12            350 non-null int64
high_school     350 non-null int64
some_college    350 non-null int64
associate       350 non-null int64
bachelor        350 non-null int64
graduate        350 non-null int64
dtypes: int64(8)
memory usage: 24.6 KB
