In [1]:
import os
import numpy as np
import pandas as pd
import geopandas as gpd
from ld_script import *
pd.set_option('display.max_colwidth', None)

### First table is S0101 - Age and Sex

In [2]:
# Path to the folder holding the unzipped S0101 subject table files.
# NOTE(review): the timestamp in this folder name (2021-01-12T120250) differs from the
# timestamp in the CSV file names read from it (2021-01-08T174020) — confirm both match
# what is actually on disk.
data_dir = 'data/subject_tables/unzipped_files/ACSST5Y2019.S0101_2021-01-12T120250'

In [3]:
# Import a file that contains the full tract-level FIPS codes for Texas, to be used as a
# reference for checking data integrity.
# NOTE(review): read_csv infers dtypes here, so the codes are presumably parsed as
# integers, while the FIPS columns derived from GEO_ID are displayed as text (e.g. '001').
# Confirm the dtypes (or pass dtype=str) before comparing the two.
tract_checker = pd.read_csv('data/tx_tract_fips.csv')

In [4]:
# Load the metadata lookup (column code -> description) and the S0101 data table itself.
# low_memory=False has pandas parse the data file in one pass instead of in chunks.
S0101_metadata = pd.read_csv(
    os.path.join(data_dir, 'ACSST5Y2019.S0101_metadata_2021-01-08T174020.csv')
)
S0101_data = pd.read_csv(
    os.path.join(data_dir, 'ACSST5Y2019.S0101_data_with_overlays_2021-01-08T174020.csv'),
    low_memory=False,
)

In [5]:
# Report the dimensions of both tables as a quick sanity check on the load.
print(
    'Shape of data table proper is {} \nShape of metadata table is {}'.format(
        S0101_data.shape, S0101_metadata.shape
    )
)

Shape of data table proper is (5266, 458) 
Shape of metadata table is (457, 2)


In [6]:
S0101_data.head(2)

Unnamed: 0,GEO_ID,NAME,S0101_C01_001E,S0101_C01_001M,S0101_C01_002E,S0101_C01_002M,S0101_C01_003E,S0101_C01_003M,S0101_C01_004E,S0101_C01_004M,...,S0101_C06_034E,S0101_C06_034M,S0101_C06_035E,S0101_C06_035M,S0101_C06_036E,S0101_C06_036M,S0101_C06_037E,S0101_C06_037M,S0101_C06_038E,S0101_C06_038M
0,id,Geographic Area Name,Estimate!!Total!!Total population,Margin of Error!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!Under 5 years,Margin of Error!!Total!!Total population!!AGE!!Under 5 years,Estimate!!Total!!Total population!!AGE!!5 to 9 years,Margin of Error!!Total!!Total population!!AGE!!5 to 9 years,Estimate!!Total!!Total population!!AGE!!10 to 14 years,Margin of Error!!Total!!Total population!!AGE!!10 to 14 years,...,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Old-age dependency ratio,Estimate!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Margin of Error!!Percent Female!!Total population!!SUMMARY INDICATORS!!Child dependency ratio,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Sex,Estimate!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age,Margin of Error!!Percent Female!!Total population!!PERCENT ALLOCATED!!Age
1,1400000US48001950100,"Census Tract 9501, Anderson County, Texas",4844,524,349,131,269,119,372,137,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


In [7]:
S0101_metadata.head(2)

Unnamed: 0,GEO_ID,id
0,NAME,Geographic Area Name
1,S0101_C01_001E,Estimate!!Total!!Total population


We should break apart the GEO_ID field into its FIPS components to make it easier to match datasets.

FIPS codes are 11 digits long. 2 state digits, 3 county digits, 6 tract digits.

In the GEO_ID column the FIPS code comes after the `US` characters.

In [8]:
# parse_geo_id is a custom project function that is not defined in this notebook; it is
# presumably brought in by `from ld_script import *` (i.e. ld_script.py in the project
# folder — confirm the module name). Judging by the columns inspected afterwards, it adds
# FIPS, STATEFP, COUNTYFP and TRACTCE columns derived from GEO_ID.
S0101_data = parse_geo_id(S0101_data)

In [9]:
# Record, for every original column code, the text held in the first row of the frame.
# That row is not an observation: it carries the written explanation of each column.
original_column_dictionary = {
    code: description
    for code, description in zip(S0101_data.columns, S0101_data.iloc[0])
}

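Note: the four new columns (`FIPS`, `STATEFP`, `COUNTYFP`, `TRACTCE`) are appended after the original 458 columns, so at this point they have iloc indices 458, 459, 460 and 461. This is important to remember for later.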

In [10]:
# Visual check to make sure these are properly formatted
S0101_data[['FIPS', 'STATEFP', 'COUNTYFP', 'TRACTCE']].head(5)

Unnamed: 0,FIPS,STATEFP,COUNTYFP,TRACTCE
0,fips,statefp,countyfp,tractce
1,48001950100,48,001,950100
2,48001950401,48,001,950401
3,48001950402,48,001,950402
4,48001950500,48,001,950500


The rows of the metadata table map onto the columns of the data table, but a few small adjustments are needed
before their indices line up (i.e. so that row 1 of the metadata table corresponds to column 1 of the data table).

If we drop the GEO_ID and NAME columns from the data table and drop the first row of the metadata table, the two will be aligned.

In [11]:
# Align data columns with metadata rows: the data table loses GEO_ID and NAME, and the
# metadata table loses its first row (the NAME entry), so that metadata row i describes
# data column i.
S0101_data.drop(columns=['GEO_ID', 'NAME'], inplace=True)
S0101_metadata.drop(index=0, axis=0, inplace=True)
S0101_metadata.reset_index(drop=True, inplace=True)

# Subtract 4 from S0101_data.shape[1] to account for the 4 columns created from GEO_ID
# (FIPS, STATEFP, COUNTYFP, TRACTCE), which have no row in the metadata table.
# The message is a plain string: print(...) would evaluate to None and leave the
# AssertionError without any message.
assert S0101_data.shape[1] - 4 == S0101_metadata.shape[0], \
    'mismatch in metadata and data correspondence'

In [12]:
# Row numbers of the wanted variables as they appear in the metadata Excel sheet:
# row 3, every second row from 15 through 29, then a handful of individual rows.
int_locations = [3] + list(range(15, 30, 2)) + [43, 47, 61, 155, 307]
# Shift by 3 to turn sheet rows into metadata positions (accounts for the drops, the
# header and pandas' 0-based indexing).
int_locations = [sheet_row - 3 for sheet_row in int_locations]

In [13]:
# Checking to make sure we have specified the columns we want.
S0101_metadata.iloc[int_locations]

Unnamed: 0,GEO_ID,id
0,S0101_C01_001E,Estimate!!Total!!Total population
12,S0101_C01_007E,Estimate!!Total!!Total population!!AGE!!25 to 29 years
14,S0101_C01_008E,Estimate!!Total!!Total population!!AGE!!30 to 34 years
16,S0101_C01_009E,Estimate!!Total!!Total population!!AGE!!35 to 39 years
18,S0101_C01_010E,Estimate!!Total!!Total population!!AGE!!40 to 44 years
20,S0101_C01_011E,Estimate!!Total!!Total population!!AGE!!45 to 49 years
22,S0101_C01_012E,Estimate!!Total!!Total population!!AGE!!50 to 54 years
24,S0101_C01_013E,Estimate!!Total!!Total population!!AGE!!55 to 59 years
26,S0101_C01_014E,Estimate!!Total!!Total population!!AGE!!60 to 64 years
40,S0101_C01_021E,Estimate!!Total!!Total population!!SELECTED AGE CATEGORIES!!15 to 17 years


In [14]:
# Collect the column codes (held in the metadata's GEO_ID column) found at the chosen
# positions; this list is what the data set gets filtered with.
wanted_columns = S0101_metadata.iloc[int_locations]['GEO_ID'].tolist()

In [15]:
# The 4 columns created by breaking up GEO_ID (FIPS, STATEFP, COUNTYFP, TRACTCE — the
# last 4 columns of the frame) have no mapping in the metadata df, so add them by hand.
additional_columns = list(S0101_data.columns[-4:])
# Append only the names that are not already listed, so re-running this cell does not
# put duplicate column names into wanted_columns.
wanted_columns.extend(name for name in additional_columns if name not in wanted_columns)

In [16]:
wanted_columns

['S0101_C01_001E',
 'S0101_C01_007E',
 'S0101_C01_008E',
 'S0101_C01_009E',
 'S0101_C01_010E',
 'S0101_C01_011E',
 'S0101_C01_012E',
 'S0101_C01_013E',
 'S0101_C01_014E',
 'S0101_C01_021E',
 'S0101_C01_023E',
 'S0101_C01_030E',
 'S0101_C03_001E',
 'S0101_C05_001E',
 'FIPS',
 'STATEFP',
 'COUNTYFP',
 'TRACTCE']

In [17]:
# Keep only the wanted columns. The explicit .copy() makes the result an independent
# frame, so later in-place edits of S0101_data do not trigger SettingWithCopyWarning.
S0101_data = S0101_data[wanted_columns].copy()

In [18]:
S0101_data.head(3)

Unnamed: 0,S0101_C01_001E,S0101_C01_007E,S0101_C01_008E,S0101_C01_009E,S0101_C01_010E,S0101_C01_011E,S0101_C01_012E,S0101_C01_013E,S0101_C01_014E,S0101_C01_021E,S0101_C01_023E,S0101_C01_030E,S0101_C03_001E,S0101_C05_001E,FIPS,STATEFP,COUNTYFP,TRACTCE
0,Estimate!!Total!!Total population,Estimate!!Total!!Total population!!AGE!!25 to 29 years,Estimate!!Total!!Total population!!AGE!!30 to 34 years,Estimate!!Total!!Total population!!AGE!!35 to 39 years,Estimate!!Total!!Total population!!AGE!!40 to 44 years,Estimate!!Total!!Total population!!AGE!!45 to 49 years,Estimate!!Total!!Total population!!AGE!!50 to 54 years,Estimate!!Total!!Total population!!AGE!!55 to 59 years,Estimate!!Total!!Total population!!AGE!!60 to 64 years,Estimate!!Total!!Total population!!SELECTED AGE CATEGORIES!!15 to 17 years,Estimate!!Total!!Total population!!SELECTED AGE CATEGORIES!!18 to 24 years,Estimate!!Total!!Total population!!SELECTED AGE CATEGORIES!!65 years and over,Estimate!!Male!!Total population,Estimate!!Female!!Total population,fips,statefp,countyfp,tractce
1,4844,252,197,335,163,312,447,318,318,250,205,1057,2486,2358,48001950100,48,001,950100
2,4838,843,907,753,444,419,314,223,73,0,634,84,4658,180,48001950401,48,001,950401


In [19]:
# Column code -> description for the retained columns, taken from the first row of the
# frame (the row of text explanations).
new_column_dictionary = {
    code: description
    for code, description in zip(S0101_data.columns, S0101_data.iloc[0])
}

In [20]:
# Remove the row labelled 0 (the text descriptions of the columns) and renumber the
# remaining rows from 0. The result is re-assigned rather than modified with
# inplace=True, which avoids mutating a frame that came out of a column selection
# (the situation pandas flags with SettingWithCopyWarning).
S0101_data = S0101_data.drop(index=0).reset_index(drop=True)

In [21]:
S0101_data.shape

(5265, 18)

In [None]:
S0101_data.head(3)

Reorder the columns and then compress the bins if necessary to align with our wanted columns.

In [None]:
# Snapshot of the current column codes as a plain Python list.
curr_columns = S0101_data.columns.tolist()

In [None]:
# Show the description recorded for each current column code.
for column_code in curr_columns:
    description = new_column_dictionary[column_code]
    print(f'Column {column_code} is {description}')

One way we might be able to automate this is by using the last two entries in the list created by splitting with `!!`.
We can use the second to last entry as our keyword and then use the numeric characters in strings for ordering.

In [None]:
# Manual reordering might be necessary here.
# Order: total population, the two youngest selected age categories, the 5-year bins
# from 25 to 64, 65 and over, the male and female totals, then the geographic identifiers.
# The state identifier column is named 'STATEFP' ('STATE' is not a column of the frame
# and would raise a KeyError when used for lookups or selection).
# NOTE(review): the combined 'FIPS' column is not part of this ordering, so it is
# dropped when the frame is reordered with this list — confirm that is intended.
new_columns = ['S0101_C01_001E',
    'S0101_C01_021E',
    'S0101_C01_023E',
    'S0101_C01_007E',
    'S0101_C01_008E',
    'S0101_C01_009E',
    'S0101_C01_010E',
    'S0101_C01_011E',
    'S0101_C01_012E',
    'S0101_C01_013E',
    'S0101_C01_014E',
    'S0101_C01_030E',
    'S0101_C03_001E',
    'S0101_C05_001E',
    'STATEFP',
    'COUNTYFP',
    'TRACTCE']

In [None]:
# Print each position, column code and description to confirm the ordering is as wanted.
for position, column_code in enumerate(new_columns):
    description = new_column_dictionary[column_code]
    print(f'Column {position} is named {column_code} : {description}')

In [None]:
# Reorder columns using the updated order. The explicit .copy() makes the result an
# independent frame, so assigning to its columns later does not trigger
# SettingWithCopyWarning.
S0101_data = S0101_data[new_columns].copy()

Now we need to decide which columns to merge together into wider age bins. Some age groups
do not need such narrow buckets, so collapsing their columns together lets us cut down on the number of features.

How should the merged columns be renamed? Perhaps by modifying only the very last part of the column name.

In [None]:
# NOTE(review): the variable name reads "25_24", but S0101_C01_007E is described in the
# metadata as "AGE!!25 to 29 years", so the name is probably a typo (Age_25_29?).
# The variable is not referenced by any of the cells visible after this one.
Age_25_24 = S0101_data.S0101_C01_007E

In [None]:
S0101_data.head()

In [None]:
# Independent snapshot of the two columns that are about to be merged, kept so the
# merged result can be verified against it
# (checker_series.sum(axis=1) would give the row-wise total).
checker_series = S0101_data.loc[:, ['S0101_C01_007E', 'S0101_C01_008E']].copy()

In [None]:
# Convert the estimate columns to numeric dtypes. Estimate columns are picked by their
# 'S0101_' table prefix rather than by excluding identifier names: the state identifier
# column is called 'STATEFP' (not 'STATE'), and selecting by prefix keeps every
# geographic identifier (FIPS, STATEFP, COUNTYFP, TRACTCE) as text, so codes such as
# '001' keep their leading zeros.
estimate_columns = [name for name in S0101_data.columns if name.startswith('S0101_')]
S0101_data[estimate_columns] = S0101_data[estimate_columns].apply(pd.to_numeric)

In [None]:
S0101_data.head()

In [None]:
S0101_data.shape

In [None]:
def _acs_line_number(column_name):
    """Return the line number embedded at the end of a column code.

    Trailing capital letters (such as the 'E' in 'S0101_C01_007E') are ignored and the
    run of digits immediately before them is converted to an int, so
    'S0101_C01_007E' gives 7 and 'S0101_C01_155E' gives 155.

    Raises:
        ValueError: if no digits precede the trailing letters.
    """
    stem = column_name.rstrip('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError(f'no trailing line number found in column name {column_name!r}')
    return int(digits)


def merge_columns_by_add(df, target, columns):
    """Collapse several columns into one by adding them row-wise.

    Args:
        df: DataFrame holding the columns to merge. It is not modified.
        target: Name of the column that receives the sum and is then renamed.
        columns: Names of the columns to add together. `target` may be included (it is
            not added twice). The list is not modified.

    Returns:
        A copy of `df` in which the non-target columns of `columns` are dropped and
        `target` holds the row-wise sum. The merged column is renamed to
        ``target[:-1] + '_' + <highest line number among the merged columns> + 'E'``,
        e.g. merging 'S0101_C01_007E' with 'S0101_C01_008E' yields 'S0101_C01_007_8E'.

    Raises:
        KeyError: if `target` or any name in `columns` is not a column of `df`.
        ValueError: if a column name has no trailing line number.
    """
    df = df.copy()
    # Work on a new list so the caller's `columns` argument is left untouched.
    others = [name for name in columns if name != target]

    highest_index = _acs_line_number(target)
    target_ser = df[target].copy()
    for name in others:
        # Track the largest line number so the new name shows the merged range.
        highest_index = max(highest_index, _acs_line_number(name))
        target_ser = target_ser + df[name]

    df[target] = target_ser
    new_col_name = target[:-1] + '_' + str(highest_index) + 'E'
    df.drop(columns=others, inplace=True)
    df.rename(columns={target: new_col_name}, inplace=True)
    return df

In [None]:
# Merge the S0101_C01_007E and S0101_C01_008E columns into a single summed column.
new = merge_columns_by_add(
    S0101_data,
    target='S0101_C01_007E',
    columns=['S0101_C01_007E', 'S0101_C01_008E'],
)

In [None]:
new.S0101_C01_007_8E.equals(checker_series.apply(pd.to_numeric).sum(axis=1))