In [None]:
### SETUP

import os

from time import time
from datetime import timedelta

import numpy as np

import pandas as pd
pd.options.mode.chained_assignment = None

import warnings
warnings.simplefilter(action = 'ignore', category = pd.errors.PerformanceWarning)

data_dir = os.getcwd() + '/data/'

In [None]:
### DEFINE GROUP MAPS
#
# Each map takes an integer category code to the short tag that later cells
# splice into column names and output labels.

race_cats = dict(enumerate([
    'w', # white
    'b', # Black
    'h', # Hispanic
], start = 1))

sex_cats = dict(enumerate([
    'm', # male
    'f', # female
], start = 1))

age_cats = dict(enumerate([
    'a1', # 16-24
    'a2', # 25-34
    'a3', # 35-44
    'a4', # 45-54
    'a5', # 55-64
    'a6', # 65+
], start = 1))

educ_cats = dict(enumerate([
    'e1', # less than HS
    'e2', # HS diploma
    'e3', # some college
    'e4', # col and higher
], start = 1))

# Occupation and industry codes start at 0, which is the 'miss' tag.
occ_cats = dict(enumerate([
    'miss',
    'prof',
    'tech',
    'serv',
    'farm',
    'prod',
    'oper',
]))

ind_cats = dict(enumerate([
    'miss',
    'agri',
    'mine',
    'cnst',
    'manu',
    'trns',
    'whol',
    'retl',
    'fire',
    'busi',
    'pers',
    'entr',
    'prof',
    'publ',
]))

In [None]:
### LOAD DATA AND SET REGRESSION FEATURES

cps_df = pd.read_pickle(data_dir + '/cps_final.pkl')

outcome = 'unemployed'
macro_var = 'sur_sa_1y_avg'


def _dummy_names(column, keep, name_of):
    """List name_of(level) for the sorted levels of cps_df[column] that satisfy keep(level)."""
    return [name_of(level) for level in sorted(cps_df[column].unique()) if keep(level)]


# One level of each categorical is left out of its dummy list (presumably the
# regression's reference category; confirm against the Stata code): age code 3,
# education code 2, occupation codes below 2, industry code 0, state 'ca', month 1.
age_dummies = _dummy_names('age_cat', lambda lvl: lvl != 3, lambda lvl: 'age_cat_' + age_cats[lvl])
educ_dummies = _dummy_names('educ_cat', lambda lvl: lvl != 2, lambda lvl: 'educ_cat_' + educ_cats[lvl])

occ_dummies = _dummy_names('occ_cat', lambda lvl: lvl >= 2, lambda lvl: 'occ_cat_' + occ_cats[lvl])
ind_dummies = _dummy_names('ind_cat', lambda lvl: lvl != 0, lambda lvl: 'ind_cat_' + ind_cats[lvl])

state_dummies = _dummy_names('state', lambda lvl: lvl != 'ca', lambda lvl: 'state_' + lvl)
month_dummies = _dummy_names('month', lambda lvl: lvl != 1, lambda lvl: 'month_' + str(lvl))

# Columns taken from cps_df as they are, with no dummy expansion.
addl_dummies = ['married', 'veteran', 'urban']
main_dummies = [*age_dummies, *educ_dummies, *occ_dummies, *ind_dummies, *addl_dummies]

# Every main dummy also enters interacted with the macro variable; state and
# month dummies enter without an interaction.
interaction_terms = [macro_var + '_' + name for name in main_dummies]
features = [macro_var, *main_dummies, *interaction_terms, *state_dummies, *month_dummies]
reg_vars = [outcome, *features, 'weight']

In [None]:
### STORE OAXACA-BLINDER DATA
#
# For every race x sex pair in race_cats / sex_cats: subset cps_df, build the
# regression columns listed in reg_vars, and write them to
# <parent of cwd>/Stata/ob_data/<label>.csv. `labels` collects the file stems.


def _level_dummies(frame, column, name_of):
    """Return {dummy name: int8 0/1 array} for each level of frame[column] present in frame."""
    return {
        name_of(level): np.where(frame[column] == level, 1, 0).astype(np.int8)
        for level in sorted(frame[column].unique())
    }


t0 = time()

labels = []

# Loop-invariant pieces, computed once.
keep_rows = cps_df['empl_cat'] != 3 # drops empl_cat code 3 (presumably "not in labor force"; confirm)
out_dir = os.path.dirname(os.getcwd()) + '/Stata/ob_data/'
cat_dummies = age_dummies + educ_dummies + occ_dummies + ind_dummies + state_dummies + month_dummies

for race, race_tag in race_cats.items():
    for sex, sex_tag in sex_cats.items():
        t1 = time()
        
        label = outcome + '_' + macro_var + '_' + race_tag + '_' + sex_tag
        labels.append(label)
        
        # An explicit copy gives an independent frame, so nothing below writes into
        # cps_df. This replaces the former to_pickle / read_pickle round-trips
        # through cps_temp.pkl.
        est_df = cps_df[keep_rows & (cps_df['race_cat'] == race) & (cps_df['sex_cat'] == sex)].copy()
        
        # Rescale wtfinl so the weights sum to the number of non-missing weights.
        est_df['weight'] = est_df['wtfinl'] * est_df['wtfinl'].count() / est_df['wtfinl'].sum()
        
        # Collect every new column first and attach them in a single concat, which
        # avoids the frame fragmentation caused by inserting columns one at a time.
        new_cols = {}
        new_cols.update(_level_dummies(est_df, 'age_cat', lambda i: 'age_cat_' + age_cats[i]))
        new_cols.update(_level_dummies(est_df, 'educ_cat', lambda i: 'educ_cat_' + educ_cats[i]))
        new_cols.update(_level_dummies(est_df, 'occ_cat', lambda i: 'occ_cat_' + occ_cats[i]))
        new_cols.update(_level_dummies(est_df, 'ind_cat', lambda i: 'ind_cat_' + ind_cats[i]))
        new_cols.update(_level_dummies(est_df, 'state', lambda i: 'state_' + i))
        new_cols.update(_level_dummies(est_df, 'month', lambda i: 'month_' + str(i)))
        
        # The feature lists come from the full sample, so a level may be absent from
        # this subgroup. Give it an all-zero dummy rather than failing with KeyError
        # when the interactions or the reg_vars selection look the column up.
        for name in cat_dummies:
            if name not in new_cols and name not in est_df.columns:
                new_cols[name] = np.zeros(len(est_df), dtype = np.int8)
        
        # Macro-variable interactions. addl_dummies are existing est_df columns and
        # are deliberately not zero-filled: a missing one should still raise.
        for name in main_dummies:
            base = new_cols[name] if name in new_cols else est_df[name]
            new_cols[macro_var + '_' + name] = est_df[macro_var] * base
        
        # A freshly built column replaces any existing column of the same name.
        clash = [name for name in new_cols if name in est_df.columns]
        if clash:
            est_df = est_df.drop(columns = clash)
        est_df = pd.concat([est_df, pd.DataFrame(new_cols, index = est_df.index)], axis = 1)
        
        est_df[reg_vars].to_csv(out_dir + label + '.csv', index = False)
        
        print(label + ' time: ' + str(timedelta(seconds = round(time() - t1))))

print('total time:', timedelta(seconds = round(time() - t0)))