## Processing the Census Data By Subject Table at Texas Tract Level

In [1]:
import os
import pandas as pd

### First table is S0101 - Age and Sex

In [2]:
data_dir = 'data/subject_tables/unzipped_files/ACSST5Y2019.S0101_2021-01-12T120250'

In [3]:
S0101_data = pd.read_csv(os.path.join(data_dir, 'ACSST5Y2019.S0101_data_with_overlays_2021-01-08T174020.csv'), low_memory=False)
S0101_metadata = pd.read_csv(os.path.join(data_dir, 'ACSST5Y2019.S0101_metadata_2021-01-08T174020.csv'))

In [4]:
print(f'Shape of data table proper is {S0101_data.shape} \n'
    + f'Shape of metadata table is {S0101_metadata.shape}')

Shape of data table proper is (5266, 458) 
Shape of metadata table is (457, 2)


In [5]:
S0101_data.head(2)

Unnamed: 0,GEO_ID,NAME,S0101_C01_001E,S0101_C01_001M,S0101_C01_002E,S0101_C01_002M,S0101_C01_003E,S0101_C01_003M,S0101_C01_004E,S0101_C01_004M,...,S0101_C06_034E,S0101_C06_034M,S0101_C06_035E,S0101_C06_035M,S0101_C06_036E,S0101_C06_036M,S0101_C06_037E,S0101_C06_037M,S0101_C06_038E,S0101_C06_038M
0,id,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under ...,Margin of Error!!Total!!Total population!!AGE!...,Estimate!!Total!!Total population!!AGE!!5 to 9...,Margin of Error!!Total!!Total population!!AGE!...,Estimate!!Total!!Total population!!AGE!!10 to ...,Margin of Error!!Total!!Total population!!AGE!...,...,Estimate!!Percent Female!!Total population!!SU...,Margin of Error!!Percent Female!!Total populat...,Estimate!!Percent Female!!Total population!!SU...,Margin of Error!!Percent Female!!Total populat...,Estimate!!Percent Female!!Total population!!SU...,Margin of Error!!Percent Female!!Total populat...,Estimate!!Percent Female!!Total population!!PE...,Margin of Error!!Percent Female!!Total populat...,Estimate!!Percent Female!!Total population!!PE...,Margin of Error!!Percent Female!!Total populat...
1,1400000US48001950100,"Census Tract 9501, Anderson County, Texas",4844,524,349,131,269,119,372,137,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


The name column is pretty irrelevant to our purposes and we can drop it so there there is an exact correspondence between
our columns in the data table and the rows in the metadata table.

In [6]:
S0101_data.drop(columns=['NAME'], inplace=True)
assert (S0101_data.shape[1] == S0101_metadata.shape[0]), print('mismatch in metadata and data correspondence')

In [7]:
int_locations = [2, *(list(range(14,28,2))), 42, 46, 56, 230, 306] 
int_locations = [x-1 for x in int_locations]

In [8]:
S0101_metadata.iloc[int_locations]

Unnamed: 0,GEO_ID,id
1,S0101_C01_001E,Estimate!!Total!!Total population
13,S0101_C01_007E,Estimate!!Total!!Total population!!AGE!!25 to ...
15,S0101_C01_008E,Estimate!!Total!!Total population!!AGE!!30 to ...
17,S0101_C01_009E,Estimate!!Total!!Total population!!AGE!!35 to ...
19,S0101_C01_010E,Estimate!!Total!!Total population!!AGE!!40 to ...
21,S0101_C01_011E,Estimate!!Total!!Total population!!AGE!!45 to ...
23,S0101_C01_012E,Estimate!!Total!!Total population!!AGE!!50 to ...
25,S0101_C01_013E,Estimate!!Total!!Total population!!AGE!!55 to ...
41,S0101_C01_021E,Estimate!!Total!!Total population!!SELECTED AG...
45,S0101_C01_023E,Estimate!!Total!!Total population!!SELECTED AG...


In [9]:
wanted_columns = S0101_metadata.GEO_ID.to_list()

In [10]:
wanted_columns

['NAME',
 'S0101_C01_001E',
 'S0101_C01_001M',
 'S0101_C01_002E',
 'S0101_C01_002M',
 'S0101_C01_003E',
 'S0101_C01_003M',
 'S0101_C01_004E',
 'S0101_C01_004M',
 'S0101_C01_005E',
 'S0101_C01_005M',
 'S0101_C01_006E',
 'S0101_C01_006M',
 'S0101_C01_007E',
 'S0101_C01_007M',
 'S0101_C01_008E',
 'S0101_C01_008M',
 'S0101_C01_009E',
 'S0101_C01_009M',
 'S0101_C01_010E',
 'S0101_C01_010M',
 'S0101_C01_011E',
 'S0101_C01_011M',
 'S0101_C01_012E',
 'S0101_C01_012M',
 'S0101_C01_013E',
 'S0101_C01_013M',
 'S0101_C01_014E',
 'S0101_C01_014M',
 'S0101_C01_015E',
 'S0101_C01_015M',
 'S0101_C01_016E',
 'S0101_C01_016M',
 'S0101_C01_017E',
 'S0101_C01_017M',
 'S0101_C01_018E',
 'S0101_C01_018M',
 'S0101_C01_019E',
 'S0101_C01_019M',
 'S0101_C01_020E',
 'S0101_C01_020M',
 'S0101_C01_021E',
 'S0101_C01_021M',
 'S0101_C01_022E',
 'S0101_C01_022M',
 'S0101_C01_023E',
 'S0101_C01_023M',
 'S0101_C01_024E',
 'S0101_C01_024M',
 'S0101_C01_025E',
 'S0101_C01_025M',
 'S0101_C01_026E',
 'S0101_C01_026M',
 'S

In [11]:
S0101_data.columns

Index(['GEO_ID', 'S0101_C01_001E', 'S0101_C01_001M', 'S0101_C01_002E',
       'S0101_C01_002M', 'S0101_C01_003E', 'S0101_C01_003M', 'S0101_C01_004E',
       'S0101_C01_004M', 'S0101_C01_005E',
       ...
       'S0101_C06_034E', 'S0101_C06_034M', 'S0101_C06_035E', 'S0101_C06_035M',
       'S0101_C06_036E', 'S0101_C06_036M', 'S0101_C06_037E', 'S0101_C06_037M',
       'S0101_C06_038E', 'S0101_C06_038M'],
      dtype='object', length=457)

In [12]:
S0101_data = S0101_data[[wanted_columns]]

KeyError: "None of [Index([('NAME', 'S0101_C01_001E', 'S0101_C01_001M', 'S0101_C01_002E', 'S0101_C01_002M', 'S0101_C01_003E', 'S0101_C01_003M', 'S0101_C01_004E', 'S0101_C01_004M', 'S0101_C01_005E', 'S0101_C01_005M', 'S0101_C01_006E', 'S0101_C01_006M', 'S0101_C01_007E', 'S0101_C01_007M', 'S0101_C01_008E', 'S0101_C01_008M', 'S0101_C01_009E', 'S0101_C01_009M', 'S0101_C01_010E', 'S0101_C01_010M', 'S0101_C01_011E', 'S0101_C01_011M', 'S0101_C01_012E', 'S0101_C01_012M', 'S0101_C01_013E', 'S0101_C01_013M', 'S0101_C01_014E', 'S0101_C01_014M', 'S0101_C01_015E', 'S0101_C01_015M', 'S0101_C01_016E', 'S0101_C01_016M', 'S0101_C01_017E', 'S0101_C01_017M', 'S0101_C01_018E', 'S0101_C01_018M', 'S0101_C01_019E', 'S0101_C01_019M', 'S0101_C01_020E', 'S0101_C01_020M', 'S0101_C01_021E', 'S0101_C01_021M', 'S0101_C01_022E', 'S0101_C01_022M', 'S0101_C01_023E', 'S0101_C01_023M', 'S0101_C01_024E', 'S0101_C01_024M', 'S0101_C01_025E', 'S0101_C01_025M', 'S0101_C01_026E', 'S0101_C01_026M', 'S0101_C01_027E', 'S0101_C01_027M', 'S0101_C01_028E', 'S0101_C01_028M', 'S0101_C01_029E', 'S0101_C01_029M', 'S0101_C01_030E', 'S0101_C01_030M', 'S0101_C01_031E', 'S0101_C01_031M', 'S0101_C01_032E', 'S0101_C01_032M', 'S0101_C01_033E', 'S0101_C01_033M', 'S0101_C01_034E', 'S0101_C01_034M', 'S0101_C01_035E', 'S0101_C01_035M', 'S0101_C01_036E', 'S0101_C01_036M', 'S0101_C01_037E', 'S0101_C01_037M', 'S0101_C01_038E', 'S0101_C01_038M', 'S0101_C02_001E', 'S0101_C02_001M', 'S0101_C02_002E', 'S0101_C02_002M', 'S0101_C02_003E', 'S0101_C02_003M', 'S0101_C02_004E', 'S0101_C02_004M', 'S0101_C02_005E', 'S0101_C02_005M', 'S0101_C02_006E', 'S0101_C02_006M', 'S0101_C02_007E', 'S0101_C02_007M', 'S0101_C02_008E', 'S0101_C02_008M', 'S0101_C02_009E', 'S0101_C02_009M', 'S0101_C02_010E', 'S0101_C02_010M', 'S0101_C02_011E', 'S0101_C02_011M', 'S0101_C02_012E', ...)], dtype='object')] are in the [columns]"