In [None]:
### SETUP

import os

import numpy as np
import pandas as pd

import statsmodels.api as statsmodels
from statsmodels.tsa.x13 import x13_arima_analysis

from warnings import simplefilter
# Silence two warning categories for the rest of the notebook: pandas'
# PerformanceWarning and statsmodels' X13Warning.
for _ignored_category in (pd.errors.PerformanceWarning,
                          statsmodels.tools.sm_exceptions.X13Warning):
    simplefilter(action = 'ignore', category = _ignored_category)

# Input data directory, resolved relative to the current working directory
# (the value keeps its trailing slash).
data_dir = '{}/data/'.format(os.getcwd())

In [None]:
# read master CPS data
# os.path.join avoids the doubled separator that string concatenation produced
# (data_dir already ends with '/').
# NOTE(review): read_pickle can execute arbitrary code embedded in the file;
# only load pickles produced by a trusted pipeline.
cps_df = pd.read_pickle(os.path.join(data_dir, 'cps_final.pkl'))

# SVAR-IV age categories: collapse the six source age codes into four bins,
# with 0 as the catch-all for any other (or missing) code.
cps_df['age_cat'] = np.where(cps_df['age_cat'] == 1, 1, # '16-24'
                    np.where(cps_df['age_cat'].isin([2, 3]), 2, # '25-44'
                    np.where(cps_df['age_cat'].isin([4, 5]), 3, # '45-64'
                    np.where(cps_df['age_cat'] == 6, 4, 0)))) # '65+'; 'other'

# initialize new dataframe to collect monthly data: one row per distinct
# yyyymm value (sorted ascending), indexed by yyyymm while keeping it as a column
yyyymm = cps_df['yyyymm'].drop_duplicates().sort_values()
out_df = pd.DataFrame(yyyymm).set_index('yyyymm', drop = False)

In [None]:
### GROUP MAPPING

# Each mapping goes from the integer code stored in cps_df to the short token
# used when building out_df column names. Insertion order matters: the loops
# below iterate these dicts and thereby fix the column order of out_df.

# race_cat code -> token (white, Black, Hispanic)
race_cats = {1: 'w', 2: 'b', 3: 'h'}

# sex_cat code -> token (male, female)
sex_cats = {1: 'm', 2: 'f'}

# age_cat code -> token
age_cats = {
    1: 'a1',  # ages 16-24
    2: 'a2',  # ages 25-44
    3: 'a3',  # ages 45-64
    4: 'a4',  # ages 65+
}

# educ_cat code -> token
educ_cats = {
    1: 'e1',  # less than HS
    2: 'e2',  # HS diploma
    3: 'e3',  # some college
    4: 'e4',  # col and higher
}

# empl_cat code -> token (employed, unemployed, neither/other)
empl_cats = {1: 'e', 2: 'u', 3: 'n'}

In [None]:
# helper function 1
def helper1(df1, df2, lbl):
    """Add per-month observation counts and weight totals for one group.

    Groups ``df1`` by ``'yyyymm'`` and writes two columns into ``df2``
    (modified in place; nothing is returned):

    * ``lbl + '_cnt'`` -- number of non-missing ``'wtfinl'`` values per month
    * ``lbl + '_sum'`` -- sum of ``'wtfinl'`` per month

    Assignment aligns on ``df2``'s index, so months that do not appear in
    ``df1`` end up as NaN in both columns.
    """
    monthly_weights = df1.groupby('yyyymm')['wtfinl']

    df2[lbl + '_cnt'] = monthly_weights.count()
    df2[lbl + '_sum'] = monthly_weights.sum()

# helper function 2
def helper2(df, lbl):
    """Derive labor-force, population and rate columns for one group.

    Reads the ``e_``, ``u_`` and ``n_`` columns of group ``lbl`` (both the
    ``_cnt`` and ``_sum`` variants) and adds, in place:

    * ``lf_<lbl>_cnt`` / ``lf_<lbl>_sum``   -- e + u
    * ``pop_<lbl>_cnt`` / ``pop_<lbl>_sum`` -- e + u + n
    * ``ur_<lbl>``  -- 100 * u_sum / lf_sum
    * ``epr_<lbl>`` -- 100 * e_sum / pop_sum

    The two rates are computed from the weighted sums only. Nothing is returned.
    """
    def col(prefix, suffix):
        # Column name for this group, e.g. col('lf', '_sum') -> 'lf_<lbl>_sum'
        return prefix + '_' + lbl + suffix

    for suffix in ('_cnt', '_sum'):
        labor_force = df[col('e', suffix)] + df[col('u', suffix)]
        df[col('lf', suffix)] = labor_force
        df[col('pop', suffix)] = labor_force + df[col('n', suffix)]

    # Rates come after both totals so the columns are appended in the order
    # lf_cnt, pop_cnt, lf_sum, pop_sum, ur, epr.
    df['ur_' + lbl] = 100 * df[col('u', '_sum')] / df[col('lf', '_sum')]
    df['epr_' + lbl] = 100 * df[col('e', '_sum')] / df[col('pop', '_sum')]

In [None]:
# Four passes, each adding one more demographic dimension to the group label.
# Every pass filters cps_df to one cell of the cross-classification and hands
# it to helper1, which writes '<label>_cnt' and '<label>_sum' into out_df.
# The passes are kept separate (rather than merged) so that out_df's columns
# are appended in the same order as before.

# Employment Type Counts and Weighted Sums by Race
for empl_code, empl_tok in empl_cats.items():
    is_empl = cps_df['empl_cat'] == empl_code
    for race_code, race_tok in race_cats.items():
        mask = is_empl & (cps_df['race_cat'] == race_code)
        helper1(cps_df[mask], out_df, '_'.join([empl_tok, race_tok]))


# Employment Type Counts and Weighted Sums by Race and Sex
for empl_code, empl_tok in empl_cats.items():
    is_empl = cps_df['empl_cat'] == empl_code
    for race_code, race_tok in race_cats.items():
        is_race = is_empl & (cps_df['race_cat'] == race_code)
        for sex_code, sex_tok in sex_cats.items():
            mask = is_race & (cps_df['sex_cat'] == sex_code)
            helper1(cps_df[mask], out_df, '_'.join([empl_tok, race_tok, sex_tok]))


# Employment Type Counts and Weighted Sums by Race, Sex, and Age
for empl_code, empl_tok in empl_cats.items():
    is_empl = cps_df['empl_cat'] == empl_code
    for race_code, race_tok in race_cats.items():
        is_race = is_empl & (cps_df['race_cat'] == race_code)
        for sex_code, sex_tok in sex_cats.items():
            is_sex = is_race & (cps_df['sex_cat'] == sex_code)
            for age_code, age_tok in age_cats.items():
                mask = is_sex & (cps_df['age_cat'] == age_code)
                helper1(cps_df[mask], out_df, '_'.join([empl_tok, race_tok, sex_tok, age_tok]))


# Employment Type Counts and Weighted Sums by Race, Sex, Age, and Education
# (only age codes 2 and 3, i.e. the 25-44 and 45-64 bins, are split by education)
for empl_code, empl_tok in empl_cats.items():
    is_empl = cps_df['empl_cat'] == empl_code
    for race_code, race_tok in race_cats.items():
        is_race = is_empl & (cps_df['race_cat'] == race_code)
        for sex_code, sex_tok in sex_cats.items():
            is_sex = is_race & (cps_df['sex_cat'] == sex_code)
            for age_code in [2, 3]:
                age_tok = age_cats[age_code]
                is_age = is_sex & (cps_df['age_cat'] == age_code)
                for educ_code, educ_tok in educ_cats.items():
                    mask = is_age & (cps_df['educ_cat'] == educ_code)
                    helper1(cps_df[mask], out_df, '_'.join([empl_tok, race_tok, sex_tok, age_tok, educ_tok]))

In [None]:
# fill empty counts with zeros: group/month cells with no observations were
# left as NaN by the index-aligned assignments in helper1
out_df = out_df.fillna(0)

# helper2 adds lf_/pop_ totals plus ur_/epr_ rates for each group label.
# Groups are visited coarsest-first so the columns keep their original order.

# Labor Market Indicators by Race
for race_tok in race_cats.values():
    helper2(out_df, race_tok)


# Labor Market Indicators by Race and Sex
for race_tok in race_cats.values():
    for sex_tok in sex_cats.values():
        helper2(out_df, '_'.join([race_tok, sex_tok]))


# Labor Market Indicators by Race, Sex, and Age
for race_tok in race_cats.values():
    for sex_tok in sex_cats.values():
        for age_tok in age_cats.values():
            helper2(out_df, '_'.join([race_tok, sex_tok, age_tok]))


# Labor Market Indicators by Race, Sex, Age, and Education
# (age codes 2 and 3 only, matching the groups for which counts were built)
for race_tok in race_cats.values():
    for sex_tok in sex_cats.values():
        for age_tok in [age_cats[2], age_cats[3]]:
            for educ_tok in educ_cats.values():
                helper2(out_df, '_'.join([race_tok, sex_tok, age_tok, educ_tok]))

In [None]:
# Date Indexing for Seasonal Adjustment Procedure
# Split yyyymm into year / month strings, build a first-of-month timestamp,
# and index the frame by it (x13_arima_analysis is given date-indexed series).
out_df['y'] = out_df['yyyymm'].astype(str).str[:4]
out_df['m'] = out_df['yyyymm'].astype(str).str[-2:]

out_df['date'] = pd.to_datetime(out_df['y'] + '-' + out_df['m']  + '-01')
out_df = out_df.set_index('date', drop = False)

out_df['y'] = out_df['y'].astype(int)
out_df['m'] = out_df['m'].astype(int)

# Seasonally Adjust UR and EPR Indicators
# Match on the column-name prefix rather than a substring: a substring test
# also matches the 'sa_ur_*' / 'sa_epr_*' outputs, so re-running this cell
# would adjust already-adjusted series and create 'sa_sa_*' columns.
sa_failed = []
for c in list(out_df.columns):
    if c.startswith(('ur_', 'epr_')):
        try:
            out_df['sa_' + c] = x13_arima_analysis(out_df[c], prefer_x13 = True).seasadj
        except Exception as e:
            # Best-effort fallback: keep the unadjusted series, but record the
            # failure so it is not mistaken for an adjusted one downstream.
            out_df['sa_' + c] = out_df[c]
            sa_failed.append((c, repr(e)))

if sa_failed:
    print('WARNING: seasonal adjustment failed for ' + str(len(sa_failed))
          + ' series; unadjusted values were used instead: '
          + ', '.join(name for name, _ in sa_failed))
    print('First error: ' + sa_failed[0][1])

# Create Labor Market Gaps
# For every seasonally adjusted Black / Hispanic series, subtract the matching
# white series: 'sa_ur_b_m_a1' - 'sa_ur_w_m_a1' -> 'urg_bw_m_a1'.
# Names are built from the exact 'sa_<indicator>_<race>' prefix so that only
# first-generation 'sa_' columns are used and no gap column is overwritten by
# a differently prefixed source column.
for indicator in ['ur', 'epr']:
    sa_prefix = 'sa_' + indicator + '_'
    for race in ['b', 'h']:
        group_prefix = sa_prefix + race
        for c in list(out_df.columns):
            if c == group_prefix or c.startswith(group_prefix + '_'):
                subgroup = c[len(group_prefix):]  # '' or e.g. '_m_a1'
                new_lbl = indicator + 'g_' + race + 'w' + subgroup
                w_lbl = sa_prefix + 'w' + subgroup

                out_df[new_lbl] = out_df[c] - out_df[w_lbl]

# Save Labor Market Gaps Data
# Written to ../MATLAB/data/ relative to the working directory; the directory
# must already exist.
gaps_list = [col for col in out_df.columns if ('urg' in col) or ('eprg' in col)]
out_df[['date', 'yyyymm', 'y', 'm'] + gaps_list].to_csv(os.path.dirname(os.getcwd()) + '/MATLAB/data/labor_gaps_data.csv', index = False)

In [None]:
# Obtain Labor Force and Population Sample Sizes in Latex Table Format, Age Regs

# Builds one LaTeX table row per (group, sex, age) combination:
#   <group> & <sex> & <age> & min/mean/max for white, Black, Hispanic \\
# where min/mean/max are taken over months of the unweighted '_cnt' column and
# rounded to integers with thousands separators.

# Label cells keyed by the column-name token they describe (padding is part of
# the emitted text and is kept as-is).
group_cells = {'lf_': 'labor force & ', 'pop_': 'population  & '}
sex_cells = {'m_': 'male   & ', 'f_': 'female & '}
age_cells = {'a1_': '16-24 & ', 'a2_': '25-44 & ', 'a3_': '45-64 & ', 'a4_': '65+   & '}

latex_rows = []

for group_key, group_cell in group_cells.items():
    for sex_key, sex_cell in sex_cells.items():
        for age_key, age_cell in age_cells.items():
            stat_cells = []

            for race_key in ['w_', 'b_', 'h_']:
                counts = out_df[group_key + race_key + sex_key + age_key + 'cnt']

                for stat in [counts.min(), counts.mean(), counts.max()]:
                    stat_cells.append('{:,}'.format(round(stat)))

            # ' \\\\ ' is the LaTeX row terminator (two backslashes); spelling
            # it with four avoids the invalid '\ ' escape sequence.
            latex_rows.append(group_cell + sex_cell + age_cell + ' & '.join(stat_cells) + ' \\\\ ' + os.linesep)

latex_lbl = ''.join(latex_rows)

print(latex_lbl)

In [None]:
# Obtain Labor Force and Population Sample Sizes in Latex Table Format, Age-Educ Regs

# Builds one LaTeX table row per (group, sex, age, education) combination:
#   <group> & <sex> & <age> & <educ> & min/mean/max for white, Black, Hispanic \\
# where min/mean/max are taken over months of the unweighted '_cnt' column and
# rounded to integers with thousands separators.

# Label cells keyed by the column-name token they describe (padding is part of
# the emitted text and is kept as-is).
group_cells = {'lf_': 'labor force & ', 'pop_': 'population  & '}
sex_cells = {'m_': 'male   & ', 'f_': 'female & '}
age_cells = {'a2_': '25-44 & ', 'a3_': '45-64 & '}
educ_cells = {
    'e1_': 'less than HS   & ',
    'e2_': 'HS diploma     & ',
    'e3_': 'some college   & ',
    'e4_': 'col and higher & ',
}

latex_rows = []

for group_key, group_cell in group_cells.items():
    for sex_key, sex_cell in sex_cells.items():
        for age_key, age_cell in age_cells.items():
            for educ_key, educ_cell in educ_cells.items():
                stat_cells = []

                for race_key in ['w_', 'b_', 'h_']:
                    counts = out_df[group_key + race_key + sex_key + age_key + educ_key + 'cnt']

                    for stat in [counts.min(), counts.mean(), counts.max()]:
                        stat_cells.append('{:,}'.format(round(stat)))

                # ' \\\\ ' is the LaTeX row terminator (two backslashes);
                # spelling it with four avoids the invalid '\ ' escape sequence.
                latex_rows.append(group_cell + sex_cell + age_cell + educ_cell + ' & '.join(stat_cells) + ' \\\\ ' + os.linesep)

latex_lbl = ''.join(latex_rows)

print(latex_lbl)