In [1]:
import os
import numpy as np
import pandas as pd
# Do not truncate cell contents when displaying frames: the column descriptions in
# these tables are long strings that would otherwise be cut off.
pd.set_option('display.max_colwidth', None)

### First table is S0101 - Age and Sex

In [2]:
# Folder containing the unzipped S0101 (Age and Sex) subject-table CSV files.
# This is a relative path, so it is resolved against the current working directory.
data_dir = 'data/subject_tables/unzipped_files/ACSST5Y2019.S0101_2021-01-12T120250'

In [3]:
# Build the full paths to the two CSV files that make up the S0101 download.
data_csv_path = os.path.join(data_dir, 'ACSST5Y2019.S0101_data_with_overlays_2021-01-08T174020.csv')
metadata_csv_path = os.path.join(data_dir, 'ACSST5Y2019.S0101_metadata_2021-01-08T174020.csv')

# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
S0101_data = pd.read_csv(data_csv_path, low_memory=False)
S0101_metadata = pd.read_csv(metadata_csv_path)

In [4]:
# Report the (rows, columns) of both tables, one per line.
print(
    f'Shape of data table proper is {S0101_data.shape} \n'
    f'Shape of metadata table is {S0101_metadata.shape}'
)

Shape of data table proper is (5266, 458) 
Shape of metadata table is (457, 2)


In [5]:
# Peek at the first two rows. Row 0 is not data: it holds the text description of each column.
S0101_data.head(2)

Unnamed: 0,GEO_ID,NAME,S0101_C01_001E,S0101_C01_001M,S0101_C01_002E,S0101_C01_002M,S0101_C01_003E,S0101_C01_003M,S0101_C01_004E,S0101_C01_004M,...,S0101_C06_034E,S0101_C06_034M,S0101_C06_035E,S0101_C06_035M,S0101_C06_036E,S0101_C06_036M,S0101_C06_037E,S0101_C06_037M,S0101_C06_038E,S0101_C06_038M
0,id,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Estimate!!Total!!Total population!!AGE!!5 to 9 years,Margin of Error!!Total!!Total population!!AGE!!5 to 9 years,Estimate!!Total!!Total population!!AGE!!10 to 14 years,Margin of Error!!Total!!Total population!!AGE!!10 to 14 years,...,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age
1,1400000US48001950100,"Census Tract 9501, Anderson County, Texas",4844,524,349,131,269,119,372,137,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


In [6]:
# Peek at the metadata: each row pairs a column code (under 'GEO_ID') with its description (under 'id').
S0101_metadata.head(2)

Unnamed: 0,GEO_ID,id
0,NAME,Geographic Area Name
1,S0101_C01_001E,Estimate!!Total!!Total population


In [7]:
# Remember what each column code means: row 0 of the data table holds the text
# description of every column, so map column name -> description from that row.
original_column_dictionary = S0101_data.iloc[0].to_dict()

We should break apart the `GEO_ID` field into its FIPS components to make matching against other datasets easier.

Census-tract FIPS codes are 11 digits long: 2 state digits, 3 county digits, and 6 tract digits.

In the `GEO_ID` column, the FIPS code is everything that comes after the `US` characters (e.g. `1400000US48001950100` contains the FIPS code `48001950100`).

In [8]:
# Break the GEO_ID column into its FIPS components and store them as new string columns.
# The FIPS code is whatever follows the 'US' marker. Using the vectorized .str accessor
# means values without a 'US' marker (such as the 'id' label in row 0, which we keep for
# now) and missing values come out as NaN, without relying on a string-length check.
fips_code = S0101_data['GEO_ID'].str.split('US').str[1]
S0101_data['STATE'] = fips_code.str[:2]       # first 2 characters: state
S0101_data['COUNTYFP'] = fips_code.str[2:5]   # next 3 characters: county
S0101_data['TRACTCE'] = fips_code.str[5:]     # remaining characters: tract

In [9]:
# Checkpoint: the three FIPS component columns must be the last columns of the table,
# in this order, because later cells pick them up by position.
assert list(S0101_data.columns[-3:]) == ['STATE', 'COUNTYFP', 'TRACTCE'], 'FIPS component columns are missing or out of order'

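Note: these will have (0-based) iloc indices of 458, 459, and 460 until the `GEO_ID` and `NAME` columns are dropped below, after which they shift to 456, 457, and 458. Important to remember for later.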

In [10]:
# Visual check that the three FIPS component columns are properly formatted.
# Row 0 (the 'id' label row) is expected to be NaN in all three.
S0101_data[['STATE', 'COUNTYFP', 'TRACTCE']].head(5)

Unnamed: 0,STATE,COUNTYFP,TRACTCE
0,,,
1,48.0,1.0,950100.0
2,48.0,1.0,950401.0
3,48.0,1.0,950402.0
4,48.0,1.0,950500.0


Each row in the metadata table describes one column of the data table, but a couple of small adjustments are needed
before their positions line up (i.e. so that row 1 of the metadata table corresponds to column 1 of the data table).

If we drop the `GEO_ID` and `NAME` columns from the data table and drop the first row of the metadata table, then the two will be aligned.

In [None]:
# Align the two tables positionally so that metadata row i describes data column i.
# GEO_ID and NAME are removed from the data table, and the metadata row describing NAME
# is removed as well. Both steps build new objects and tolerate already-removed
# columns/rows, so re-running this cell is harmless.
S0101_data = S0101_data.drop(columns=['GEO_ID', 'NAME'], errors='ignore')
S0101_metadata = S0101_metadata[S0101_metadata['GEO_ID'] != 'NAME'].reset_index(drop=True)

# Subtract 3 from the data column count: the STATE, COUNTYFP and TRACTCE columns we
# created have no row in the metadata table.
assert S0101_data.shape[1] - 3 == S0101_metadata.shape[0], 'mismatch in metadata and data correspondence'

In [None]:
# Row numbers of the wanted columns as they appear when the metadata file is viewed in Excel.
excel_row_numbers = [3, *range(15, 30, 2), 43, 47, 61, 155, 307]

# Shift by 3 to turn those row numbers into 0-based positions in the trimmed
# metadata table (accounts for the header, the dropped row and 0-based indexing).
EXCEL_ROW_OFFSET = 3
int_locations = [row_number - EXCEL_ROW_OFFSET for row_number in excel_row_numbers]

In [None]:
# Display the metadata rows at the chosen positions to confirm that int_locations
# points at the columns we actually want.
S0101_metadata.iloc[int_locations]

In [None]:
# Collect the column codes (stored under 'GEO_ID' in the metadata table) found at the
# chosen row positions; this list is used to filter the data table.
chosen_metadata_rows = S0101_metadata.iloc[int_locations]
wanted_columns = chosen_metadata_rows['GEO_ID'].tolist()

In [None]:
# The three FIPS component columns created from GEO_ID have no row in the metadata
# table, so add them to the selection by name. Only names not already in the list are
# appended, so re-running this cell does not produce duplicate columns.
additional_columns = ['STATE', 'COUNTYFP', 'TRACTCE']
wanted_columns.extend([col for col in additional_columns if col not in wanted_columns])

In [None]:
# Show the final list of column names that will be kept.
wanted_columns

In [None]:
# Restrict the data table to the columns named in wanted_columns (all rows are kept).
S0101_data = S0101_data.loc[:, wanted_columns]

In [None]:
# Preview the filtered table; row 0 is still the descriptive label row at this point.
S0101_data.head(5)