In [1]:
import pandas as pd
import numpy as np

from utilities import *

# Firm dataset

In [2]:
# Load every intermediate Stata extract used to build the panel
data_firm = pd.read_stata('Data/Intermediate/data_firm.dta')
data_fred = pd.read_stata('Data/Intermediate/fred_mapped.dta')
data_BDSe = pd.read_stata('Data/Intermediate/entry_out.dta')
data_BEA = pd.read_stata('Data/Intermediate/BEA_industry.dta')
data_licn = pd.read_stata('Data/Intermediate/license_out.dta')
data_cens = pd.read_stata('Data/Intermediate/cencon_out.dta')
data_spread = pd.read_stata('Data/Intermediate/spread_data.dta')
data_reg = pd.read_stata('Data/Intermediate/regindex_out.dta')
data_hhi = pd.read_stata('Data/Intermediate/mod_HHI_BEA.dta')

# Outer-merge each extract onto the firm data, one after the other and in
# this order; the second element of each pair is the merge key(s).
year_ind_keys = ['year', 'indcode']
merge_plan = [
    (data_fred,   'year'),
    (data_BDSe,   'year'),
    (data_BEA,    year_ind_keys),
    (data_licn,   'indcode'),
    (data_cens,   year_ind_keys),
    (data_spread, year_ind_keys),
    (data_reg,    year_ind_keys),
    (data_hhi,    year_ind_keys),
]

data = data_firm
for extract, merge_keys in merge_plan:
    data = data.merge(extract, how = 'outer', on = merge_keys)

# Rename the modified Herfindahl columns, derive herf_adj, keep only rows
# that have both a gvkey and an indcode, and order by industry then year.
data = (
    data.rename(columns = {'herf_mod': 'mherf', 'herf_votmod': 'mherf_vot'})
        .assign(herf_adj = lambda frame: frame['mherf'] - frame['herf_s'])
        .dropna(subset = ['gvkey', 'indcode'])
        .sort_values(['indcode', 'year'])
)

# Keep an independent copy of the firm-level panel; `data` is collapsed to
# industry/year rows further down.
data_firm = data.copy()

In [3]:
# Number of non-missing gvkey values in each industry/year cell
firms_per_cell = data.groupby(['indcode', 'year'])['gvkey'].transform('count')
data['a1_count'] = firms_per_cell

# Collapse to one row per industry/year (the first row of each cell is kept)
# and drop the firm identifier, which is meaningless at that level.
data = (
    data.drop_duplicates(subset = ['indcode', 'year'])
        .drop(columns = ['gvkey'])
        .reset_index(drop = True)
)

# Order by industry, then year
data = data.sort_values(['indcode', 'year'])

In [4]:
# Row masks reused by later cells when blanking derived columns:
#   count_ok - a1_count for the row is above 5
#   shift_ok - the previous row carries the same indcode, so a one-row shift
#              does not reach into another industry
# NOTE(review): shift_ok compares indcode only; it does not verify that the
# previous row is the previous calendar year - confirm the panel has no gaps.
count_ok = data['a1_count'].gt(5)
previous_indcode = data['indcode'].shift(1)
shift_ok = data['indcode'] == previous_indcode

# Snapshot of the columns present before any derived quantity is added
columns_curr = data.columns

## Industry quantities

In [5]:
# Contemporaneous industry ratios (no lags involved), mostly scaled by total
# assets a1_at.
data = data.eval('''
a1_q = a1_mv/a1_at 
a1_qadj = a1_mvadj/a1_atadj
a1_ca = a1_che/a1_at 
a1_blev = a1_bliab/a1_at
a1_paya  = a1_pay/a1_at
a1_payos  = a1_pay/a1_os_cp
a1_bba  = a1_bb/a1_at 
a1_bbos  = a1_bb/a1_os_cp
a1_xrdat  = a1_xrd/a1_at
a1_gwa = a1_gdwl/a1_at 
a1_intanat = a1_intan/a1_at
a1_intanexgwat = (a1_intan-a1_gdwl)/a1_at 
a1_nblev = (a1_bliab - a1_che)/a1_at 
a1_txdba = a1_txdb/a1_at

a1_osk_cp = a1_os_cp/a1_kdef1 
''')

# Blank the columns created since the columns_curr snapshot on every row
# whose predecessor belongs to a different industry.
# NOTE(review): none of the ratios above uses a lag, yet the shift mask is
# what gets applied here, and columns_curr is not refreshed afterwards, so
# the next mask that uses columns_curr covers these columns again - confirm
# both are intended.
columns_all = data.columns
columns_new = columns_all.difference(columns_curr).tolist()
data.loc[~shift_ok, columns_new] = None

In [6]:
# Industry cash-flow and investment rates, each scaled by the PREVIOUS row's
# asset or capital stock (`.shift(1)`).
lagged_exprs = '''
a1_cfat = a1_cf/a1_at.shift(1) 
a1_cfk1 = a1_cf/a1_kdef1.shift(1)
a1_cfk2 = a1_cf/a1_kdef2.shift(1)

a1_ik1 = a1_inv1/a1_kdef1.shift(1)
a1_ik3 = a1_inv3/a1_kdef3.shift(1)
a1_ik4 = a1_inv4/a1_kdef4.shift(1)
a1_ik5 = a1_inv5/a1_kdef5.shift(1)

a1_nik1 = (a1_inv1-a1_dp1)/a1_kdef1.shift(1) 
a1_nik2 = (a1_inv2-a1_dp2)/a1_kdef2.shift(1) 
a1_nik3 = (a1_inv3-a1_dp3)/a1_kdef3.shift(1) 
a1_nik4 = (a1_inv4-a1_dp4)/a1_kdef4.shift(1) 
a1_nik5 = (a1_inv5-a1_dp5)/a1_kdef5.shift(1) 
a1_nik6 = (a1_inv6-a1_dp6)/a1_kdef6.shift(1) 
'''
data.eval(lagged_exprs, inplace = True)

# `.shift(1)` runs over the whole frame, so the first row of every industry
# is divided by the last row of the previous industry. Blank those rows for
# exactly the columns defined above (the name left of each '='), the same way
# the other lagged blocks are corrected.
lagged_cols = [line.split('=')[0].strip()
               for line in lagged_exprs.splitlines() if '=' in line]
data.loc[~shift_ok, lagged_cols] = None

# Get all columns, extract just the ones created since the columns_curr
# snapshot, and blank them where a1_count is not above 5
columns_all = data.columns
columns_new = list(set(columns_all) - set(columns_curr))
data.loc[~count_ok, columns_new] = None

# Reset current columns
columns_curr = columns_all

In [7]:
# Industry financing and cash-flow components, each scaled by the previous
# row's total assets.
data = data.eval('''
a1_defat = a1_findef/a1_at.shift(1)
a1_diat = a1_ndebtiss/a1_at.shift(1)
a1_eiat = a1_neqiss/a1_at.shift(1)
a1_divat = a1_dv/a1_at.shift(1)
a1_invdefat = a1_inv_def/a1_at.shift(1)
a1_dwcat = a1_dnwc_def/a1_at.shift(1)

a1_ibcat = a1_ibc/a1_at.shift(1)
a1_xidocat = a1_xidoc/a1_at.shift(1)
a1_dpcat = a1_dpc/a1_at.shift(1)
a1_txdcat = a1_txdc/a1_at.shift(1)
a1_cfotherat = a1_cfother/a1_at.shift(1)
''')

# Every ratio above relies on a one-row shift: blank the columns created
# since the columns_curr snapshot wherever the previous row belongs to
# another industry.
columns_all = data.columns
columns_new = columns_all.difference(columns_curr).tolist()
data.loc[~shift_ok, columns_new] = None

In [8]:
# Logs, financing shares, debt-maturity shares, *_rz external-finance ratios
# and the three-row change in a1_logN.
data = data.eval('''
a1_logemp = log(a1_emp) 
a1_logq = log(a1_q)
a1_logppe = log(a1_ppe)

a1_dfpct = a1_ndebtiss/a1_findef
a1_efpct = a1_neqiss/a1_findef
a1_dfpct2 = a1_diat/a1_defat
a1_efpct2 = a1_eiat/a1_defat
a1m_dfpct2 = a1m_diat/a1m_defat
a1m_efpct2 = a1m_eiat/a1m_defat

a1_dd1d = a1_dd1/a1_ltd
a1_dd3d = a1_dd3c/a1_ltd
a1_dd5d = a1_dd5c/a1_ltd

a1_extfindep_rz = (a1_capx - a1_cf_rz) / a1_capx
a1_exteqfindep_rz = a1_neqiss / a1_capx
a1_extdebtfindep_rz = a1_ndebtiss / a1_capx
a1_pifo_sh = a1_pifo/a1_pi

a1_s3logN = a1_logN.diff(3)
''')

# The last variable is a three-row difference: keep it only where the row
# three positions back belongs to the same industry.
indcode_three_back = data['indcode'].shift(3)
d3_ok = data['indcode'] == indcode_three_back
data.loc[~d3_ok, 'a1_s3logN'] = None

In [9]:
# Apply `winsor` to each listed ratio separately within every year.
# `winsor` and `str_to_list` come from the star import of `utilities`;
# presumably l/u are the lower/upper tail fractions - confirm there.
vars_w = '''a1_paya a1_bba a1_intanat a1_xrdat a1_bbos a1_payos a1_defat a1_diat a1_eiat
 a1_dfpct a1_efpct a1_extfindep_rz a1_exteqfindep_rz a1_extdebtfindep_rz a1_pifo_sh'''

winsor_kwargs = {'l': 0.01, 'u': 0.01}
for var in str_to_list(vars_w):
    yearly_values = data.groupby('year')[var]
    data[var] = yearly_values.transform(winsor, **winsor_kwargs)

## Aggregate quantities

In [10]:
# Contemporaneous aggregate ratios (no lags involved)
data.eval('''
a_q = a_mv/a_at
a_qadj = a_mvadj/a_atadj
a_blev = a_bliab/a_at
a_paya  = a_pay/a_at
a_bba  = a_bb/a_at
a_intanat  = a_intan/a_at
a_intanexgwat  = (a_intan-a_gdwl)/a_at 
''', inplace = True)

# Correction: blank a_intanexgwat where a1_count is not above 5
data.loc[~count_ok, 'a_intanexgwat'] = None

# Reset current columns. Take the snapshot from the frame itself: the
# previously used `columns_all` was last refreshed before several later
# blocks added columns, so it made the next "new columns" difference include
# non-lagged variables (the ratios above among them), which then got blanked
# by the shift mask.
columns_curr = data.columns

# Aggregate investment, operating-surplus and financing rates, each scaled by
# the previous row's stock (capital, assets, operating surplus or market
# value).
data = data.eval('''
a_ik1 = a_inv1/a_kdef1.shift(1) 
a_ik2 = a_inv2/a_kdef2.shift(1) 
a_niat1 = (a_inv1-a_dp1)/a_at.shift(1) 
a_nik1 = (a_inv1-a_dp1)/a_kdef1.shift(1) 
a_nik2 = (a_inv2-a_dp2)/a_kdef2.shift(1) 
a_ios_cp1 = a_inv1/a_os_cp.shift(1) 
a_ios_cp2 = a_inv2/a_os_cp.shift(1)
a_osk_cp1 = a_os_cp/a_kdef1.shift(1)
a_osk_cp2 = a_os_cp/a_kdef2.shift(1)

a_iv1 = a_inv1/a_mv.shift(1)
a_iv2 = a_inv2/a_mv.shift(1)

a_defat = a_findef/a_at.shift(1)
a_diat = a_ndebtiss/a_at.shift(1)
a_eiat = a_neqiss/a_at.shift(1)
a_divat = a_dv/a_at.shift(1)
a_invdefat = a_inv_def/a_at.shift(1)
a_dwcat = a_dnwc_def/a_at.shift(1)
''')

# Blank every column created since the columns_curr snapshot wherever the
# previous row belongs to another industry (the one-row shift is invalid
# there).
columns_all = data.columns
columns_new = columns_all.difference(columns_curr).tolist()
data.loc[~shift_ok, columns_new] = None

# Aggregate financing shares and external-finance totals.
# a_cdat uses the same expression as a_divat (a_dv over lagged a_at).
data = data.eval('''
a_dfpct = a_ndebtiss/a_findef
a_efpct = a_neqiss/a_findef
a_dfpct2 = a_diat/a_defat
a_efpct2 = a_eiat/a_defat
am_dfpct2 = am_diat/am_defat
am_efpct2 = am_eiat/am_defat

a_extfin = a_diat + a_eiat
am_extfin = am_diat + am_eiat
a_cdat = a_dv/a_at.shift(1)
''')

# Only a_cdat is built from a shift here: blank it where the previous row
# belongs to another industry.
shifted_here = 'a_cdat'
data.loc[~shift_ok, shifted_here] = None

# Herfindahls: per-year summaries of herf_s and mherf
gby_y = data.groupby('year')

# (new column, source column, statistic) - applied in this order
yearly_summaries = [
    ('amean_herf',  'herf_s', 'mean'),
    ('amean_mherf', 'mherf',  'mean'),
    ('amed_herf',   'herf_s', 'median'),
    ('amed_mherf',  'mherf',  'median'),
]
for target, source, statistic in yearly_summaries:
    data[target] = gby_y[source].transform(statistic)


def wt_a(x):
    """Return the a1_sale-weighted herf_s and mherf for one year's rows.

    `wt_mean` comes from the star import of `utilities`; presumably a
    weighted mean - confirm there.
    """
    sales = x['a1_sale']
    return pd.Series({
        'awtmean_herf': wt_mean(x['herf_s'], weights = sales),
        'awtmean_mherf': wt_mean(x['mherf'], weights = sales)
    })


# One row per year with the two weighted columns, joined back on `year`
weighted_by_year = gby_y.apply(wt_a)
data = data.merge(weighted_by_year, right_index = True, left_on = 'year')

# Coverage metrics: net investment over lagged market value, payouts and
# buybacks over a_os_bea, and a1_ppe / a1_inv1 relative to 1000 times
# a1_kp_all_bea.
data = data.eval('''
a1_niv = a1_nik_all_bea/a1_mv.shift(1)
a_niv1 = a_nik_all_bea/a_mv.shift(1) 
a_payos = a_pay/a_os_bea
a_bbos = a_bb/a_os_bea

a1c_ppek = a1_ppe/(1000*a1_kp_all_bea)
a1c_inv = a1_inv1/(1000*a1_kp_all_bea)
''')

# The two lagged ratios are invalid on the first row of an industry, and
# a1_niv is additionally blanked where a1_count is not above 5.
lagged_ratios = ['a1_niv', 'a_niv1']
data.loc[~shift_ok, lagged_ratios] = None
data.loc[~count_ok, ['a1_niv']] = None

# Cap both coverage ratios at 1
for capped in ('a1c_ppek', 'a1c_inv'):
    data[capped] = data[capped].clip(upper = 1)

# Industry mean of each coverage ratio over rows with year > 200; rows that
# fail the filter receive NaN through index alignment.
# NOTE(review): `year > 200` looks like a typo (2000?) - as written it is
# true for any four-digit year. Confirm the intended cutoff.
filtered_by_industry = data.query('year > 200').groupby('indcode')
data['avga1c_ppek'] = filtered_by_industry['a1c_ppek'].transform('mean')
data['avga1c_inv'] = filtered_by_industry['a1c_inv'].transform('mean')

# The divisions above can yield +/-inf; turn those into missing values
infinities = [np.inf, -np.inf]
data = data.replace(infinities, np.nan)

# Write the industry-level dataset (the frame index is written as well)
ind_output_path = 'Data/Final/main_dataset_ind_BEA.dta'
data.to_stata(ind_output_path, write_index = True)

In [11]:
# Select columns starting with 'a' or containing 'herf', except the
# firm-specific ones listed in ind_cols. Iterating data.columns (instead of
# a set) keeps the output column order identical from run to run.
ind_cols = ['at', 'act', 'aqc', 'apalch', 'aoloch', 'aldo', 'ap', 'age', 'at_l', 'aqcind', 'aldoat', 'aqcat']
merge_keys = ['indcode', 'year']
c = pd.Series([col for col in data.columns
               if col not in ind_cols
               and col not in merge_keys
               and (col.startswith('a') or 'herf' in col)])

# Update firm columns: where a selected column also exists in the firm panel,
# the industry-level version replaces it. Dropping the firm copy BEFORE the
# merge means no '_x'/'_y' suffixes are created at all. Matching those
# suffixes as substrings afterwards is unsafe: names such as a1_xrdat or
# a1_xidocat contain '_x' and would be dropped, and any name containing '_y'
# would lose its last two characters.
overlap = [col for col in c if col in data_firm.columns]
data = data_firm.drop(columns = overlap)\
                .merge(data[list(c) + merge_keys], on = merge_keys)

# Save
data.to_stata('Data/Final/main_dataset_firm_BEA.dta', write_index = True)

In [13]:
import os

def _short_name(file_name, forced_ext = None):
    """Return the text before the first '_' of the stem, plus an extension.

    The extension is split off first, so a name that is already short
    ('fig1.eps', 'tab1.tex') maps to itself instead of growing a second
    extension on every run. `forced_ext` (with its leading dot) replaces the
    file's own extension when given.
    """
    stem, own_ext = os.path.splitext(file_name)
    extension = own_ext if forced_ext is None else forced_ext
    return stem.split('_')[0] + extension


def _shorten_names(folder, forced_ext = None):
    """Rename every regular, non-hidden file in `folder` to its short name."""
    for file in os.listdir(folder):
        old_path = os.path.join(folder, file)

        # Leave dotfiles (e.g. .DS_Store) and sub-directories alone
        if file.startswith('.') or not os.path.isfile(old_path):
            continue

        new_file = _short_name(file, forced_ext)

        # Nothing to do when the name is already in its short form
        if new_file != file:
            os.rename(old_path, os.path.join(folder, new_file))


# Rename images (always given the .eps extension)
_shorten_names('Figures', forced_ext = '.eps')

# Rename tables (each keeps its own extension)
_shorten_names('Tables')