In [None]:
### SETUP

import os

from time import time
from datetime import timedelta

import numpy as np
import statsmodels.api as sm

from scipy.stats import t

import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) check; later cells
# assign new columns to a filtered subset of the CPS frame.
pd.options.mode.chained_assignment = None

import warnings
# Suppress pandas PerformanceWarning messages (presumably the frame-fragmentation
# warning triggered by inserting many dummy columns one at a time — TODO confirm).
warnings.simplefilter(action = 'ignore', category = pd.errors.PerformanceWarning)

# Data folder located under the current working directory. The value already ends
# with '/', and later cells append paths that start with '/' as well.
data_dir = os.getcwd() + '/data/'

In [None]:
### DEFINE GROUP MAPS

# Each map translates an integer category code into the short tag used when
# building column names and result labels.

# Race codes: 2 = Black, 3 = Hispanic.
race_cats = dict(zip((2, 3), ('b', 'h')))

# Sex codes: 1 = male, 2 = female.
sex_cats = dict(enumerate(('m', 'f'), start = 1))

# Age brackets:
#   1: 16-24   2: 25-34   3: 35-44   4: 45-54   5: 55-64   6: 65+
age_cats = {code: 'a' + str(code) for code in range(1, 7)}

# Education levels:
#   1: less than HS   2: HS diploma   3: some college   4: col and higher
educ_cats = {code: 'e' + str(code) for code in range(1, 5)}

# Occupation groups, codes 0-6 in list order (0 = missing).
occ_cats = dict(enumerate([
    'miss', 'prof', 'tech', 'serv', 'farm', 'prod', 'oper',
]))

# Industry groups, codes 0-13 in list order (0 = missing).
ind_cats = dict(enumerate([
    'miss', 'agri', 'mine', 'cnst', 'manu', 'trns', 'whol',
    'retl', 'fire', 'busi', 'pers', 'entr', 'prof', 'publ',
]))

In [None]:
### LOAD DATA AND SET REGRESSION FEATURES

# Load the prepared CPS frame from the data directory.
cps_df = pd.read_pickle(data_dir + '/cps_final.pkl')

# Dependent variable, group indicator and macro regressor used in every regression.
outcome, group, macro_var = 'unemployed', 'nonwhite', 'sur_sa_1y_avg'

# Name of the macro-by-group interaction whose coefficient is collected later.
collect_var = '_'.join([macro_var, group])

# Dummy column names for each categorical variable, taken from the values that
# occur in the full frame. The filtered-out values get no dummy in the feature
# list: occupation codes below 2, industry code 0, state 'ca' and month 1.
occ_dummies = [
    'occ_cat_' + occ_cats[code]
    for code in sorted(cps_df['occ_cat'].unique())
    if code >= 2
]
ind_dummies = [
    'ind_cat_' + ind_cats[code]
    for code in sorted(cps_df['ind_cat'].unique())
    if code != 0
]

state_dummies = [
    'state_' + name
    for name in sorted(cps_df['state'].unique())
    if name != 'ca'
]
month_dummies = [
    'month_' + str(num)
    for num in sorted(cps_df['month'].unique())
    if num != 1
]

# Binary controls that are used as they are.
addl_dummies = ['married', 'veteran', 'urban']
all_dummies = [*occ_dummies, *ind_dummies, *addl_dummies, *state_dummies, *month_dummies]

# Regressors: the group indicator, the macro variable and all dummies, then the
# group interaction of each of those, then the constant.
base_terms = [macro_var] + all_dummies
features = [group] + base_terms + [term + '_' + group for term in base_terms] + ['cons']

# Columns kept for estimation: outcome, regressors and the regression weight.
reg_vars = [outcome, *features, 'weight']

In [None]:
### COLLECT MARGINAL-EFFECTS LPM ESTIMATES

# Fit one weighted linear probability model per race x sex x age x education
# cell and collect, for the macro-by-group interaction (collect_var), the
# coefficient, a conservative standard error, the sample size, the number of
# fitted values inside [0, 1], and the t critical value.

t0 = time()

# Results are keyed by label; `labels` records the order in which they were produced.
labels = []
spl_sizes = {}      # observations in the estimation sample
unit_cnts = {}      # fitted values that fall inside the unit interval

wls_coefs = {}      # coefficient on collect_var
wls_std_errs = {}   # largest of the conventional and HC0-HC3 standard errors
t_vals = {}         # two-sided t critical value at conf_lvl

conf_lvl = 0.99

for race in race_cats:
    for sex in sex_cats:
        for age in age_cats:
            for educ in educ_cats:
                t1 = time()

                label = outcome + '_' + collect_var + '_' + race_cats[race] + '_' + sex_cats[sex] + '_' + age_cats[age] + '_' + educ_cats[educ]
                labels.append(label)

                # Estimation sample: rows with empl_cat != 3, race_cat equal to 1 or the
                # current race code, and the current sex / age / education cell.
                # NOTE(review): race_cat 1 is presumably the reference (white) group and
                # empl_cat 3 presumably "not in labor force" — confirm against the codebook.
                # An explicit copy replaces the former pickle round-trip, so the columns
                # added below never touch cps_df and no temp file is written.
                est_df = cps_df[(cps_df['empl_cat'] != 3) & (cps_df['race_cat'].isin([1, race])) & (cps_df['sex_cat'] == sex) & (cps_df['age_cat'] == age) & (cps_df['educ_cat'] == educ)].copy()

                # Rescale wtfinl so the weights sum to the number of non-missing weights.
                est_df['weight'] = est_df['wtfinl'] * est_df['wtfinl'].count() / est_df['wtfinl'].sum()

                # Regression constant.
                est_df['cons'] = np.ones(len(est_df), dtype = np.int8)

                # 0/1 dummies for every category value present in this sample.
                for i in sorted(est_df['occ_cat'].unique()):
                    est_df['occ_cat_' + occ_cats[i]] = np.where(est_df['occ_cat'] == i, 1, 0).astype(np.int8)

                for i in sorted(est_df['ind_cat'].unique()):
                    est_df['ind_cat_' + ind_cats[i]] = np.where(est_df['ind_cat'] == i, 1, 0).astype(np.int8)

                for i in sorted(est_df['state'].unique()):
                    est_df['state_' + i] = np.where(est_df['state'] == i, 1, 0).astype(np.int8)

                for i in sorted(est_df['month'].unique()):
                    est_df['month_' + str(i)] = np.where(est_df['month'] == i, 1, 0).astype(np.int8)

                # all_dummies was derived from the full frame; a category that does not
                # occur in this sample would otherwise be a missing column and raise
                # KeyError below. Its dummy is identically zero here.
                for i in all_dummies:
                    if i not in est_df.columns:
                        est_df[i] = np.zeros(len(est_df), dtype = np.int8)

                # Defragment after the column-by-column inserts (replaces a pickle round-trip).
                est_df = est_df.copy()

                # Interactions of the macro variable and of every dummy with the group indicator.
                est_df[macro_var + '_' + group] = est_df[macro_var] * est_df[group]

                for i in all_dummies:
                    est_df[i + '_' + group] = (est_df[i] * est_df[group]).astype(np.int8)

                # Keep only the columns the regression needs (replaces a pickle round-trip).
                est_df = est_df[reg_vars].copy()

                spl_sizes[label] = int(est_df.shape[0])
                # NOTE(review): the degrees of freedom passed here equal the sample size,
                # not the residual degrees of freedom — confirm this is intended.
                t_vals[label] = float(t.ppf(1 - (1 - conf_lvl) / 2, spl_sizes[label]))

                try:
                    wls_model = sm.WLS(endog = est_df[outcome], exog = est_df[features], weights = est_df['weight']).fit()

                    wls_coefs[label] = float(wls_model.params[collect_var])
                    # Conservative standard error: the largest of the conventional and the
                    # HC0-HC3 robust estimates; non-finite HC2/HC3 values are treated as 0.
                    wls_std_errs[label] = float(max(wls_model.bse[collect_var], wls_model.HC0_se[collect_var], wls_model.HC1_se[collect_var],
                                                    np.nan_to_num(wls_model.HC2_se[collect_var], posinf = 0, neginf = 0),
                                                    np.nan_to_num(wls_model.HC3_se[collect_var], posinf = 0, neginf = 0)))

                    # Fitted probabilities, and how many of them lie in [0, 1].
                    est_df[outcome + '_fit'] = sum([est_df[i] * float(wls_model.params[i]) for i in features])
                    unit_cnts[label] = int(est_df[outcome + '_fit'][(est_df[outcome + '_fit'] >= 0) & (est_df[outcome + '_fit'] <= 1)].count())
                except Exception as err:
                    # Report the failure instead of hiding it; KeyboardInterrupt and
                    # SystemExit are no longer swallowed. The zero placeholders are kept
                    # so the table and CSV cells still find an entry for every label.
                    # NOTE(review): these zeros cannot be told apart from true zero estimates.
                    print(label + ' WLS failed (' + type(err).__name__ + ': ' + str(err) + '); recording zeros')
                    wls_coefs[label] = 0.0
                    wls_std_errs[label] = 0.0
                    unit_cnts[label] = 0

                print(label + ' time: ' + str(timedelta(seconds = round(time() - t1))))

print('total time:', timedelta(seconds = round(time() - t0)))

In [None]:
### PRINT MARGINAL-EFFECTS LPM ESTIMATES IN LATEX FORMAT

# One LaTeX table row per age x education cell. Each row starts with the age and
# education tags, followed by a coefficient and a parenthesised standard error
# (both multiplied by 100 and rounded to 4 decimals) for every race x sex
# combination, race varying slowest.
for age in age_cats:
    for educ in educ_cats:
        age_tag, educ_tag = age_cats[age], educ_cats[educ]
        row = [age_tag, educ_tag]

        for race in race_cats:
            for sex in sex_cats:
                key = '_'.join([outcome, collect_var, race_cats[race], sex_cats[sex], age_tag, educ_tag])

                coef_pct = round(100 * wls_coefs[key], 4)
                se_pct = round(100 * wls_std_errs[key], 4)

                row += [coef_pct, '(' + str(se_pct) + ')']

        print(*row, sep = ' & ')

In [None]:
### PRINT MARGINAL-EFFECTS LPM UNIT/OBS COUNTS IN LATEX FORMAT

# One LaTeX table row per age x education cell. For every race x sex combination
# the row lists the sample size, the number of fitted values inside [0, 1], and
# that number as a percentage of the sample size (2 decimals).
for age in age_cats:
    for educ in educ_cats:
        rep = [age_cats[age], educ_cats[educ]]

        for race in race_cats:
            for sex in sex_cats:
                lbl = outcome + '_' + collect_var + '_' + race_cats[race] + '_' + sex_cats[sex] + '_' + age_cats[age] + '_' + educ_cats[educ]

                n_obs = spl_sizes[lbl]
                n_unit = unit_cnts[lbl]

                # The share is undefined for a subsample with no observations;
                # print nan rather than raising ZeroDivisionError.
                share = 100 * n_unit / n_obs if n_obs else float('nan')

                rep.append('{:,}'.format(n_obs))
                rep.append('{:,}'.format(n_unit))
                rep.append(str(round(share, 2)))

        print(*rep, sep = ' & ')

In [None]:
### SAVE MARGINAL-EFFECTS LPM ESTIMATES

# Assemble one row per regression label and write the table to
# results/me_lpm_estimates.csv under the current working directory.

# Create the output folder first: to_csv does not create missing directories,
# and failing here would discard the results of the long estimation loop.
results_dir = os.getcwd() + '/results'
os.makedirs(results_dir, exist_ok = True)

# Every column is looked up by label, so the rows are aligned with `labels`
# explicitly instead of relying on the dicts sharing one insertion order.
me_df = pd.DataFrame(data = {
    'label': labels,

    'num_obs': [spl_sizes[lbl] for lbl in labels],
    'unit_cnt': [unit_cnts[lbl] for lbl in labels],

    collect_var + '_wls_coef': [wls_coefs[lbl] for lbl in labels],
    collect_var + '_wls_std_err': [wls_std_errs[lbl] for lbl in labels],

    't_val': [t_vals[lbl] for lbl in labels],
})

me_df.to_csv(results_dir + '/me_lpm_estimates.csv', index = False)