### Process LODES Workplace Area Characteristics (WAC) data
#### Tennessee and Georgia for Number of Jobs by Block Groups in Chattanooga Model Area

In [1]:
import os
import sys
import gzip

import numpy as np
import pandas as pd
import geopandas as gpd

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Input folders and file names.
# These are module-level names that the later cells read directly.
data_root_parts = ('..', '..', '..', 'data')

# CTPP inputs: the list of census tracts inside the RPA
ctpp_fldr = os.path.join(*data_root_parts, 'CTPP')
RPA_tracts_file = os.path.join(ctpp_fldr, 'Tracts_inRPA.csv')

# LODES inputs: one gzipped WAC file per state (ga, tn) and segment (SI01-SI03),
# listed state by state in the order they are read.
lodes_fldr = os.path.join(*data_root_parts, 'LODES')
lodes_wac_files = [
    "{}_wac_{}_JT00_2018.csv.gz".format(state_abbr, segment)
    for state_abbr in ("ga", "tn")
    for segment in ("SI01", "SI02", "SI03")
]

In [3]:
# Source: 2018 LODES7 data product Workplace Area Characteristics summaries available at census block group level
# Website: "https://lehd.ces.census.gov/data/#lodes"
# NOTE(review): the w_geocode values shown in this notebook's output are 15 digits long,
#   which looks like census block level rather than block group level - confirm.
#
# Data dictionaries for scripting

# LODES WAC job-count column name -> description of the NAICS sector it counts.
# The keys (CNS01..CNS20) are the columns get_wac_jobs_chc keeps from each WAC file;
# the descriptions are for reference only.
LODES_EMP_CATEGORIES_WAC= {
    "CNS01": "NAICS sector 11 (Agriculture, Forestry, Fishing and Hunting)",
    "CNS02": "NAICS sector 21 (Mining, Quarrying, and Oil and Gas Extraction)",
    "CNS03": "NAICS sector 22 (Utilities)",
    "CNS04": "NAICS sector 23 (Construction)",
    "CNS05": "NAICS sector 31-33 (Manufacturing)",
    "CNS06": "NAICS sector 42 (Wholesale Trade)",
    "CNS07": "NAICS sector 44-45 (Retail Trade)",
    "CNS08": "NAICS sector 48-49 (Transportation and Warehousing)",
    "CNS09": "NAICS sector 51 (Information)",
    "CNS10": "NAICS sector 52 (Finance and Insurance)",
    "CNS11": "NAICS sector 53 (Real Estate and Rental and Leasing)",
    "CNS12": "NAICS sector 54 (Professional, Scientific, and Technical Services)",
    "CNS13": "NAICS sector 55 (Management of Companies and Enterprises)",
    "CNS14": "NAICS sector 56 (Administrative and Support and Waste Management and Remediation Services)",
    "CNS15": "NAICS sector 61 (Educational Services)",
    "CNS16": "NAICS sector 62 (Health Care and Social Assistance)",
    "CNS17": "NAICS sector 71 (Arts, Entertainment, and Recreation)",
    "CNS18": "NAICS sector 72 (Accommodation and Food Services)",
    "CNS19": "NAICS sector 81 (Other Services [except Public Administration])",
    "CNS20": "NAICS sector 92 (Public Administration)",
}

# CHC model employment field name -> human-readable label with the NAICS sectors it covers.
# Reference only: the cells shown in this notebook do not read this dictionary;
# the sector groupings actually used in calculations are CHC_EMP_TO_LODES_EMP and `trans`.
# The asterisk on emptrn_p presumably flags that it overlaps empind_p
# (both include NAICS 42 and 48-49) - confirm.
CHC_EMP_CATEGORIES = {
    "empfoo_p": "food employment (NAICS: 72)",
    "empgov_p": "government employment (NAICS: 92)",
    "empind_p": "industrial employment (NAICS: 22,31-33, 42, 48-49)",
    "empmed_p": "medical employment (NAICS: 62)",
    "empofc_p": "office employment (NAICS: 51-56)",
    "empret_p": "retail employment (NAICS: 44-45)",
    "empsvc_p": "service employment (NAICS: 71, 81)",
    "emptrn_p": "transportation employment (NAICS: 42, 48-49)*",
    "empoth_p": "other employment (NAICS: 11, 21, 23)",
    "empedu_p": "educational employment (NAICS: 61)",   
}

# CHC model employment field -> LODES WAC columns that are added together to
# produce it. Each of CNS01..CNS20 appears under exactly one field, so these
# fields partition total employment. The key order is also the column order of
# the exported table.
CHC_EMP_TO_LODES_EMP = {
    "empfoo_p": ["CNS18"],
    "empgov_p": ["CNS20"],
    "empind_p": ["CNS03", "CNS05", "CNS06", "CNS08"],
    "empmed_p": ["CNS16"],
    "empofc_p": ["CNS09", "CNS10", "CNS11", "CNS12", "CNS13", "CNS14"],
    "empret_p": ["CNS07"],
    "empsvc_p": ["CNS17", "CNS19"],
    "empoth_p": ["CNS01", "CNS02", "CNS04"],
    "empedu_p": ["CNS15"],
}

# Transportation employment is built from sectors (CNS06, CNS08) that are also
# part of industrial employment, so it is not an exclusive category.
# Do not count transportation towards the total employment.
trans = {"emptrn_p": ["CNS06", "CNS08"]}

In [4]:
# Tracts in the study area. GEOID is read as text so the IDs can be compared
# with tract IDs sliced out of the LODES geocode strings.
tracts = pd.read_csv(RPA_tracts_file, dtype={'GEOID': str})
rta_tracts = tracts['GEOID'].tolist()
len(rta_tracts)

103

In [6]:
# Function to read each of the downloaded state files, combine them and keep only
# the rows whose tract is in the CHC RTA
def get_wac_jobs_chc(tracts, wac_files=None, wac_fldr=None, emp_cols=None):
    """
    Read the LODES WAC files and total the job counts per workplace geocode.

    Each file is filtered to the rows whose tract (the first 11 characters of
    the 15-character w_geocode) is in `tracts`. The filtered rows of all files
    are stacked and then summed per w_geocode, so a geocode that appears in
    several files ends up as a single row.

    Parameters
    ----------
    tracts : list-like of str
        Tract GEOIDs to keep.
    wac_files : list of str, optional
        Names of the gzipped WAC CSV files. Defaults to the module-level
        `lodes_wac_files`.
    wac_fldr : str, optional
        Folder that holds `wac_files`. Defaults to the module-level `lodes_fldr`.
    emp_cols : list of str, optional
        Job-count columns to keep. Defaults to the keys of the module-level
        `LODES_EMP_CATEGORIES_WAC`.

    Returns
    -------
    pandas.DataFrame
        Indexed by w_geocode, with columns `emp_cols` + ['CS01', 'CS02'].
        Empty (but with those columns) when `wac_files` is empty.
    """
    if wac_files is None:
        wac_files = lodes_wac_files
    if wac_fldr is None:
        wac_fldr = lodes_fldr
    if emp_cols is None:
        emp_cols = list(LODES_EMP_CATEGORIES_WAC.keys())

    cols_to_keep = list(emp_cols) + ['CS01', 'CS02']

    # Collect the filtered rows of every file and concatenate once at the end
    # instead of growing a DataFrame inside the loop.
    frames = []
    for wac_file in wac_files:
        file_path_tmp = os.path.join(wac_fldr, wac_file)
        with gzip.open(file_path_tmp, 'rt') as fin:
            df = pd.read_csv(fin)

        # w_geocode is parsed as an integer, which drops a leading zero of the
        # state code; pad back to 15 characters before slicing out the tract.
        geocode = df['w_geocode'].astype('str').str.zfill(15)
        valid_tracts = geocode.str[:11].isin(tracts)

        frames.append(df.loc[valid_tracts, ['w_geocode'] + cols_to_keep])

    if not frames:
        # No input files: return an empty table with the expected layout
        # rather than failing on an unbound variable.
        return pd.DataFrame(columns=cols_to_keep,
                            index=pd.Index([], name='w_geocode'))

    # Stack the rows (the valid geocodes differ from file to file), then add up
    # the counts of geocodes that occur in more than one file.
    jobs_wac = pd.concat(frames, axis=0).groupby('w_geocode').agg('sum')

    return jobs_wac
            
# Build one table from the downloaded LODES WAC files:
# one row per workplace geocode in the study area, one column per NAICS job category
colsums = get_wac_jobs_chc(rta_tracts)

# QC check:
# total employment added up from the NAICS (CNS*) columns must equal
#  total employment added up from the gender category (CS*) columns.
total_jobs_by_naics = colsums.filter(like="CNS").sum().sum()
total_jobs_by_gender = colsums.filter(like="CS").sum().sum()
assert total_jobs_by_naics == total_jobs_by_gender
print("Total 2018 jobs in the Chattanooga RTP area = {:,.0f}".format(total_jobs_by_naics))

Total 2018 jobs in the Chattanooga RTP area = 228,785


In [7]:
# Combine LODES NAICS employment categories into the CHC model employment categories

# For each CHC employment category, add a field holding the row-wise sum of
# the LODES columns mapped to it
for k, v in CHC_EMP_TO_LODES_EMP.items():
    colsums[k] = colsums[v].sum(axis=1)

# Total jobs: sum of the CHC categories in CHC_EMP_TO_LODES_EMP only.
# Transportation is left out because its sectors are already counted in empind_p.
colsums['empTotal'] = colsums[list(CHC_EMP_TO_LODES_EMP.keys())].sum(axis=1)

# Transportation jobs, driven by the `trans` mapping so the sector list is
# defined in one place only
for k, v in trans.items():
    colsums[k] = colsums[v].sum(axis=1)

In [12]:
# Add 'state', 'county', 'tract', and 'block_group' fields,
#   reorder the columns and write the table for the client

for_client = colsums.filter(like="emp").copy()

state_FIPS = {
    '13': 'Georgia',
    '47': 'Tennessee'
}
county_FIPS = {
    '13047': 'Catoosa County',
    '13083': 'Dade County',
    '13295': 'Walker County',
    '47065': 'Hamilton County',
}

# w_geocode as 15-character text (padded in case a leading zero was lost when
# the code was parsed as an integer), aligned with the rows of for_client
geoid = for_client.index.to_series().astype(str).str.zfill(15)

# Look the names up through the dictionaries (not Series.map(dict)) so that a
# state or county code missing from the tables raises KeyError instead of
# silently writing a blank name.
for_client['state'] = geoid.str[:2].map(lambda fips: state_FIPS[fips])
for_client['county'] = geoid.str[:5].map(lambda fips: county_FIPS[fips])
for_client['tract'] = geoid.str[5:11]
# NOTE(review): characters 12-15 of a 15-digit census geocode are the block
#   number (its first digit is the block group). The field name is kept as
#   'block_group' for existing consumers of the CSV - confirm what is intended.
for_client['block_group'] = geoid.str[11:]

reorder_cols = ['state', 'county', 'tract', 'block_group'] + list(CHC_EMP_TO_LODES_EMP.keys()) + ['empTotal', 'emptrn_p']

output_file = os.path.join(lodes_fldr, "2018_LODES_Emp_by_Category.csv")

for_client = for_client.reindex(columns=reorder_cols)
for_client.to_csv(output_file)
for_client

Unnamed: 0_level_0,state,county,tract,block_group,empfoo_p,empgov_p,empind_p,empmed_p,empofc_p,empret_p,empsvc_p,empoth_p,empedu_p,empTotal,emptrn_p
w_geocode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
130470301001004,Georgia,Catoosa County,030100,1004,37,0,0,0,0,0,0,0,0,37,0
130470301001010,Georgia,Catoosa County,030100,1010,0,0,0,9,0,0,0,0,0,9,0
130470301001012,Georgia,Catoosa County,030100,1012,0,0,3,0,0,0,0,5,0,8,0
130470301001014,Georgia,Catoosa County,030100,1014,0,0,0,0,0,0,0,3,0,3,0
130470301001016,Georgia,Catoosa County,030100,1016,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
470659802001126,Tennesse,Hamilton County,980200,1126,0,0,0,0,522,0,0,0,0,522,0
470659802001127,Tennesse,Hamilton County,980200,1127,0,0,0,2,8,0,0,0,0,10,0
470659802001149,Tennesse,Hamilton County,980200,1149,0,0,1,0,0,0,0,0,0,1,0
470659802001151,Tennesse,Hamilton County,980200,1151,32,0,1878,0,89,0,0,9,0,2008,1878


### Summaries of total number of jobs by CHC employment type and LODES NAICS categories for QC

In [13]:
# Area-wide job total for each CHC employment category
colsums.loc[:, list(CHC_EMP_TO_LODES_EMP)].sum()

empfoo_p    23157
empgov_p     6939
empind_p    54322
empmed_p    35393
empofc_p    47967
empret_p    23827
empsvc_p    10763
empoth_p    11242
empedu_p    15175
dtype: int64

In [14]:
# Area-wide job total for each LODES NAICS column (names containing 'CNS')
colsums.loc[:, [col for col in colsums.columns if 'CNS' in col]].sum()

CNS01      303
CNS02      115
CNS03     1212
CNS04    10824
CNS05    28343
CNS06     8067
CNS07    23827
CNS08    16700
CNS09     3864
CNS10    11150
CNS11     3074
CNS12     9218
CNS13     4258
CNS14    16403
CNS15    15175
CNS16    35393
CNS17     3158
CNS18    23157
CNS19     7605
CNS20     6939
dtype: int64

In [15]:
# QC: regrouping the LODES columns into CHC categories must neither drop nor
# double-count jobs, so both grand totals have to agree.
chc_category_fields = list(CHC_EMP_TO_LODES_EMP)
sum_chc_emp = colsums[chc_category_fields].sum().sum()
sum_lodes_emp = colsums.filter(like='CNS').sum().sum()
assert sum_chc_emp == sum_lodes_emp