In [1]:
import pandas as pd
import os

In [22]:
'''
Explanation of data variables:

    ACS - American Community Survey 5-year estimates
    DEC - Decennial Census 

    income:    Household Income (Brackets)
        B19001 (ACS 2009 - 2016)
        P052   (DEC 2000)
        
    education: Education Attained (Brackets - Percentages)
        S1501  (ACS 2009 - 2016)
        QTP20  (DEC 2000)
        
    race:      Percent White
        B02001 (ACS 2009 - 2016)
        P007   (DEC 2000)
        
    household: Percent Families (approximation for population density)
        B11001 (ACS 2009 - 2016)
        QTP10  (DEC 2000)
        
    rent:      Gross Rent (Brackets)
        B25063 (ACS 2009 - 2016)
        H062   (DEC 2000)
        
    age:       Age of Structure (year built)
        B25034 (ACS 2009 - 2016)
        H034   (DEC 2000)
'''

# Directory that holds the raw census downloads (relative to the working dir).
raw_data_dir = os.path.join(os.getcwd(), 'data', 'raw')

# Variable name -> census table code, one mapping per survey.
acs_data_vars = {'income': 'B19001', 'education': 'S1501', 'race': 'B02001',
                 'household': 'B11001', 'rent': 'B25063', 'age': 'B25034'}
# The DEC 2000 income/education codes were mistyped as 'P025' / 'PTP20';
# the tables listed in the explanation above are P052 and QTP20.
dec_data_vars = {'income': 'P052', 'education': 'QTP20', 'race': 'P007',
                 'household': 'QTP10', 'rent': 'H062', 'age': 'H034'}

# One independent copy of the table-code mapping per year: 2000 uses the
# Decennial Census codes, 2009-2016 use the ACS codes.
data_dict = {'2000': dec_data_vars.copy()}
data_dict.update({str(acs_year): acs_data_vars.copy()
                  for acs_year in range(2009, 2017)})

# The former loop over os.listdir(raw_data_dir) only evaluated
# `data_dict[key]` without using the result, so it had no effect on
# data_dict (and could raise IndexError on a file name without '_').
# It has been removed.

In [37]:
# Directory that holds the raw census downloads (relative to the working dir).
raw_data_dir = os.path.join(os.getcwd(), 'data', 'raw')

# Template for one year's entry: variable name -> path of its raw file.
# A value stays '' when no matching file is found for that year.
empty_vars = {'income': '', 'education': '', 'race': '',
              'household': '', 'rent': '', 'age': ''}

# Census table code -> variable name, one mapping per survey.
acs_data_vars = {'B19001': 'income', 'S1501': 'education', 'B02001': 'race',
                 'B11001': 'household', 'B25063': 'rent', 'B25034': 'age'}

dec_data_vars = {'P052': 'income', 'QTP20': 'education', 'P007': 'race',
                 'QTP10': 'household', 'H062': 'rent', 'H034': 'age'}

# Each year gets its own copy so filling one year never touches another.
data_dict = {'2000': empty_vars.copy()}
data_dict.update({str(acs_year): empty_vars.copy()
                  for acs_year in range(2009, 2017)})

# File names are split on '_': field 1 is read as the two-digit year and
# field 3 as the table code. os.listdir returns names in arbitrary order, so
# the listing is sorted to make the result reproducible when several files
# map to the same (year, variable) slot - the last one in sorted order wins.
for fil in sorted(os.listdir(raw_data_dir)):
    fil_spl = fil.split('_')
    if len(fil_spl) < 4:
        # Not enough fields to carry a year and a table code; skip instead
        # of raising IndexError.
        continue

    fil_year = '20' + fil_spl[1]
    fil_table = fil_spl[3]

    if fil_table in acs_data_vars:
        fil_var = acs_data_vars[fil_table]
    elif fil_table in dec_data_vars:
        fil_var = dec_data_vars[fil_table]
    else:
        # Unknown table code: previously a KeyError from the DEC lookup.
        continue

    if fil_year not in data_dict:
        # Year outside the ones tracked above; skip instead of raising KeyError.
        continue

    data_dict[fil_year][fil_var] = os.path.join(raw_data_dir, fil)

In [40]:
def read_data(year):
    """Load the raw income table for one year.

    Parameters
    ----------
    year : str
        Key into the module-level ``data_dict`` (e.g. ``'2000'``).

    Returns
    -------
    pandas.DataFrame
        The CSV at ``data_dict[year]['income']`` parsed with the default
        ``pd.read_csv`` options. Previously the frame was loaded and then
        discarded, so the function always returned ``None``.

    Notes
    -----
    Only the income table is read so far.
    NOTE(review): the exploratory read of the 2000 income file elsewhere in
    this notebook passes ``skiprows=1`` and a column subset - confirm whether
    the same options should be applied here.
    """
    income = pd.read_csv(data_dict[year]['income'])
    return income

In [48]:
# Exploratory attempt on the 2000 income file: drop the first two lines of
# the file and keep every other column (positions 1, 3, ..., 35).
# pandas rejects this column selection with the ValueError recorded in this
# cell's output; the read in cell In [75] uses skiprows=1 and columns 1-19.
pd.read_csv(
    data_dict['2000']['income'],
    skiprows=2,
    usecols=list(range(1, 36, 2)),
)

ValueError: Usecols do not match names.

In [75]:
# Read the 2000 income file: skip its first line so the following line is
# used as the header, and keep columns 1 through 19 (column 0 is dropped).
data = pd.read_csv(
    data_dict['2000']['income'],
    skiprows=1,
    usecols=list(range(1, 20)),
)

# Preview the first rows.
data.head()

Unnamed: 0,Id2,Geography,Total:,"Less than $10,000","$10,000 to $14,999","$15,000 to $19,999","$20,000 to $24,999","$25,000 to $29,999","$30,000 to $34,999","$35,000 to $39,999","$40,000 to $44,999","$45,000 to $49,999","$50,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","$100,000 to $124,999","$125,000 to $149,999","$150,000 to $199,999","$200,000 or more"
0,48021950100,"Census Tract 9501, Bastrop County, Texas",2577,219,75,154,227,142,89,155,174,172,297,326,343,133,19,28,24
1,48021950200,"Census Tract 9502, Bastrop County, Texas",2053,286,131,137,141,120,108,160,71,100,252,236,181,30,22,40,38
2,48021950300,"Census Tract 9503, Bastrop County, Texas",2755,168,106,198,143,131,218,167,167,160,253,447,267,198,46,55,31
3,48021950400,"Census Tract 9504, Bastrop County, Texas",2760,262,216,129,145,155,172,183,138,119,200,440,326,131,69,35,40
4,48021950500,"Census Tract 9505, Bastrop County, Texas",3602,285,205,149,258,238,242,100,223,241,446,521,361,139,83,79,32


In [78]:
# Split each Geography label on a space or comma, at most twice, giving
# three parts per row (e.g. 'Census', 'Tract', '9501, Bastrop County, Texas').
# `n` is passed by keyword: pandas 2.0+ makes every argument after `pat`
# keyword-only, so the positional form raises TypeError there.
data['Geography'].astype('str').str.split(' |,', n=2)

0           [Census, Tract, 9501, Bastrop County, Texas]
1           [Census, Tract, 9502, Bastrop County, Texas]
2           [Census, Tract, 9503, Bastrop County, Texas]
3           [Census, Tract, 9504, Bastrop County, Texas]
4           [Census, Tract, 9505, Bastrop County, Texas]
5           [Census, Tract, 9506, Bastrop County, Texas]
6           [Census, Tract, 9507, Bastrop County, Texas]
7           [Census, Tract, 9508, Bastrop County, Texas]
8          [Census, Tract, 9601, Caldwell County, Texas]
9          [Census, Tract, 9602, Caldwell County, Texas]
10         [Census, Tract, 9603, Caldwell County, Texas]
11         [Census, Tract, 9604, Caldwell County, Texas]
12         [Census, Tract, 9605, Caldwell County, Texas]
13         [Census, Tract, 9606, Caldwell County, Texas]
14         [Census, Tract, 9607, Caldwell County, Texas]
15              [Census, Tract, 101, Hays County, Texas]
16              [Census, Tract, 102, Hays County, Texas]
17           [Census, Tract, 10