In [136]:
import pandas as pd
import os
import matplotlib.pyplot as plt

%matplotlib inline

In [210]:
#build the path to the raw data directory (system agnostic)
data_dir = os.path.join(os.getcwd(), 'data', 'raw')

#pair each metric with the table code that appears in its raw file names;
#the codes are tested in this order and the first one found in a name wins
metric_codes = (('median_home_value', 'B25077'),
                ('median_income', 'S1903'),
                ('education_attained', 'S1501'))

#dictionary of raw file paths keyed by metric (every metric starts with an empty list)
data_dict = {metric: [] for metric, code in metric_codes}

#file each raw data file under the first metric whose table code is in its name
for file_name in os.listdir(data_dir):
    for metric, code in metric_codes:
        if code in file_name:
            data_dict[metric].append(os.path.join(data_dir, file_name))
            break
    else:
        #no table code matched, so the file is reported and left out
        print('Unexpected data file {} - skipping'.format(file_name))

#the paths should already be sorted, but sort each list just in case
for paths in data_dict.values():
    paths.sort()

In [268]:
#all the zip codes in the Austin metroplex, in ascending order, minus 78712:
#the area in 78712 is all part of the University of Texas (aka "The 40 Acres") and
#there is no data for median home value or median income in that zip code, so it is
#filtered out of this study
austin_zips = [
    zip_code
    for zip_code in (
        78610, 78613, 78617, 78641, 78652, 78653, 78660, 78664, 78681, 78701,
        78702, 78703, 78704, 78705, 78712, 78717, 78719, 78721, 78722, 78723,
        78724, 78725, 78726, 78727, 78728, 78729, 78730, 78731, 78732, 78733,
        78734, 78735, 78736, 78737, 78738, 78739, 78741, 78742, 78744, 78745,
        78746, 78747, 78748, 78749, 78750, 78751, 78752, 78753, 78754, 78756,
        78757, 78758, 78759,
    )
    if zip_code != 78712
]

#the years (2011 through 2016) which will be used to parse the data
years = list(range(2011, 2017))

#the education attained dataframe will have multi-indexed columns:
#the columns are the years and the subcolumns are the attainment levels listed here
sub_col = [
    '<9th',
    '9th-12th',
    'high_school',
    'some_college',
    'associate',
    'bachelor',
    'graduate',
]

In [274]:
#create blank dataframes for each dataset (rows are zip codes, columns are years;
#the education dataframe has one subcolumn per attainment level under each year)
median_home_value = pd.DataFrame(index=austin_zips, columns=years)
median_income = pd.DataFrame(index=austin_zips, columns=years)
education_attained = pd.DataFrame(index=austin_zips,
                                  columns=pd.MultiIndex.from_product([years, sub_col]))

#iterate through enumerated years; the i-th path in each (sorted) data_dict list is read
#as the raw file for the i-th year
for i, year in enumerate(years):
    #read the appropriate median home value and median income raw data and assign it to the proper year
    #NOTE(review): no `names` is passed to these two reads, so pandas takes the first row after
    #the two skipped rows as the header - confirm that row is not a needed data row
    median_home_value[year] = pd.read_csv(data_dict['median_home_value'][i], skiprows=2, usecols=[1, 3],
                                          index_col=0).loc[austin_zips].astype('float')
    median_income[year] = pd.read_csv(data_dict['median_income'][i], skiprows=2, usecols=[1, 5],
                                      index_col=0).loc[austin_zips].astype('float')
    #the education attained raw data for 2015 and 2016 is given as bulk counts whereas the raw
    #data for 2011-2014 are given in percentages. I'll convert the 2015 and 2016 data to
    #percentages so all the data has the same format
    if year in [2015, 2016]:
        #read the education attained raw counts (including total) from this year's own file;
        #indexing with [i] instead of [-1] keeps 2015 from being filled with the 2016 data
        counts = pd.read_csv(data_dict['education_attained'][i], skiprows=2,
                             usecols=[1, 13, 15, 17, 19, 21, 23, 25, 27],
                             index_col=0, names=['zip', 'total']+sub_col).loc[austin_zips]
        #convert each count column to a percentage by dividing by the total and multiplying by 100
        #(rounded to the first decimal place for consistency with the 2011-2014 data);
        #selecting only sub_col leaves the 'total' column out of the result
        education_attained[year] = counts[sub_col].div(counts['total'], axis=0).mul(100).round(1)
    else:
        #if years 2011-2014, read the percentages and assign them to the appropriate year
        education_attained[year] = pd.read_csv(data_dict['education_attained'][i], skiprows=2,
                                               usecols=[1, 15, 17, 19, 21, 23, 25, 27], index_col=0,
                                               names=['zip']+sub_col).loc[austin_zips]

In [280]:
#set path for saving processed data
processed_data_dir = os.path.join(os.getcwd(), 'data', 'processed')
#create the output directory if it does not exist yet: to_csv does not create missing
#parent directories, so the saves below would otherwise fail on a fresh checkout
os.makedirs(processed_data_dir, exist_ok=True)
#save each of the dataframes as a csv file named after its metric
median_home_value.to_csv(os.path.join(processed_data_dir, 'median_home_value.csv'))
median_income.to_csv(os.path.join(processed_data_dir, 'median_income.csv'))
education_attained.to_csv(os.path.join(processed_data_dir, 'education_attained.csv'))