In [66]:
import wrds

import pandas as pd
import numpy as np

from ff_category import FFCategory
from utilities import *

pd.options.mode.chained_assignment = None

In [67]:
# Initialize connection
# `db` is the WRDS handle used by every `db.raw_sql(...)` call in this notebook.
# NOTE(review): the username is hard-coded; anyone else running this notebook
# presumably has to change it — consider moving it to a config cell.
db = wrds.Connection(wrds_username='tadej')

Loading library list...
Done


# TODO

- Use SIC (check if it means the latest or what) and NAICS instead of H versions [DONE]
- Check the goodwill thing (where only max is kept)
- Figure out how stata treats missing vars for lags
- Did they miscode bank dependence??
- Median/mean with NA [DONE]

# Compustat

Here I get the required compustat data.

## Annual fundamentals

### Download

I get a lot of variables from the `funda` table in the `compd` database (see the SQL call below).

The only filtering I do is to throw out things without a year (happens sometimes), and apply the standard, cryptic data selection that prevents duplicates.

Additionally, I do the join with the `compd.company` table to get `sic` and `naics` (**thank you EJMR**).

In [68]:
# SQL select string
# Pulls firm-year rows from compd.funda (alias f), inner-joined on gvkey to
# compd.company (alias c) so that each row also carries sic/naics cast to integer.
# The WHERE clause keeps indfmt='INDL', datafmt='STD', popsrc='D', consol='C'
# and drops rows without a fiscal year.
select_str = '''
SELECT f.gvkey, fyear AS year, 
       CAST(c.sic AS integer) AS sic, CAST(c.naics AS integer) AS naics,
       f.fic, csho, prcc_f, at, lt, pstk, dltt, dlc, 
       act, ppegt, dvt, prstkc, oiadp, txt, xint, che, ppent, capx,
       gdwl, sppe, siv, ivstch, ivaco, scf, wcapc, dlcch, chech, ibc,
       xidoc, dpc, txdc, esubc, sppiv, fopo, fsrco, exre, dltis, dltr,
       sstk, ivch, aqc, fuseo, recch, invch, apalch, txach, aoloch,
       fiao, dv, xrd, "do", txdb, ib, dp, sale, emp, dd1, dd3, dd5, aldo,
       intan, pifo, pi, fopt, ap, invt, rect, cogs, xsga, oibdp, ivaeq, ivao,
       dlrsn
FROM compd.funda AS f INNER JOIN compd.company AS c ON c.gvkey = f.gvkey
WHERE indfmt='INDL' AND datafmt='STD' AND popsrc='D' AND consol='C'
        AND fyear IS NOT NULL
'''

# Get data
data_fa = db.raw_sql(select_str)

# Still there are, for some reason, 4 duplicates: remove them
# (drop_duplicates keeps the first row of each (year, gvkey) pair by default)
data_fa.drop_duplicates(subset = ['year', 'gvkey'], inplace = True)

### New variables

Here I create the following new variables:
- `me`: Market value of equity, obtained by multiplying common shares outstanding by their price
- `be`: Book value of equity, equal to total assets minus total liabilities minus preferred stock (`at - lt - pstk`)
- `bliab`: Book value of liabilities: assets minus equity
- `mv`: Market value: Market value of equity + assets - book equity
- `blev`: Leverage: book value of liabilities divided by assets
- `q`: Tobin's q (market value of the firm divided by assets)
- `q2`: Alternative Tobin's q
- `paya`: Payouts over assets
- `bba`: Buybacks over assets
- `os`: Net operating surplus
- `payos`: Payouts over operating surplus
- `bbos`: Buybacks over operating surplus
- `ca`: Cash holdings over assets

In [69]:
# Add market/book values, leverage, the two Tobin's q variants and the payout
# ratios as new columns of data_fa. Later lines of the expression reuse columns
# created by earlier ones (e.g. `be` feeds `bliab` and `mv`, `os` feeds `payos`).
# NOTE(review): nothing guards against zero denominators (`at`, `ppegt`, `os`),
# so these ratios can be +/-inf — the Table 10 output shows inf for q2.
data_fa.eval('''
    me = csho*prcc_f 
    be = at - lt - pstk 
    bliab = at - be
    blev = bliab/at
    mv = me + at - be
    q = mv/at
    q2 = (me+dltt+dlc-act)/ppegt
    paya = (dvt + prstkc)/ at
    bba = prstkc / at
    os = oiadp - txt - xint  
    payos = (dvt + prstkc)/ os
    bbos = prstkc/ os
    ca = che / at''', inplace = True)

## Peters and Taylor Q

Here I download the Peters and Taylor (improved) Q measure, that is the `total_q` table in the `totalq` database.

I download the variables `gvkey`, `year` (`fyear`), `q_tot`, `k_int` and `k_int_offbs`.

In [70]:
# SQL select string
# Takes every row of totalq.total_q (no WHERE clause); fyear is renamed to
# `year` so the result can be merged on ['gvkey', 'year'] later.
select_str = '''
SELECT fyear AS year, gvkey, q_tot, k_int, k_int_offbs
FROM totalq.total_q
'''

# Get data
data_q = db.raw_sql(select_str)

## Ratings data

I download the data from Compustat, specifically, I use the Ratings table under North America daily. I then apply a recoding of the ratings, as shown below.

**Warning**: That dataset is obsolete, it has not been updated since 2017. There is a newer credit ratings dataset (under Capital Q), however, based on my investigation, it has much less data than the old one (and some data it has is ambiguous), so it is not a suitable substitute here.

In [71]:
# December observations of the long-term issuer rating, skipping missing
# values and the three non-grade codes listed in the NOT IN clause.
select_str = '''
SELECT date_part('year', datadate) AS year, splticrm, gvkey
FROM compd.adsprate
WHERE date_part('month', datadate) = 12
AND splticrm IS NOT NULL
AND splticrm NOT IN ('N.M.', 'Suspended', 'SD')
'''

data_r = db.raw_sql(select_str)

# Letter grade -> numeric score (1 is the best grade).
# Insertion order is load-bearing: longer grades must be tested before the
# shorter grades they contain ('AAA' before 'AA' before 'A', and so on).
replace_dict = dict(
    AAA = 1, AA = 2, A = 3, BBB = 4, BB = 5, B = 6,
    CCC = 7, CC = 8, C = 9, D = 10
)

# Work on a copy of the raw rating so the substring search sees the letters
data_r['sprating'] = data_r['splticrm']

for key, value in replace_dict.items():
    # Rows already replaced hold digit strings, so no later key can match them
    has_grade = data_r['sprating'].str.contains(key)
    data_r.loc[has_grade, 'sprating'] = str(value)

data_r['sprating'] = data_r['sprating'].astype(int)

# 0/1 indicators for the two top rating buckets
data_r['AAtoAAA'] = data_r['sprating'].le(2).astype(int)
data_r['BBBtoA'] = data_r['sprating'].between(3, 4).astype(int)

# The raw letter rating is no longer needed
data_r = data_r.drop(columns = ['splticrm'])

## Merging and minor calculations

Here I merge the three datasets downloaded so far, and compute some new variables:
- `k_pt`: A measure of capital, basically adding physical and intangible capital together
- `shareintan`: Share of intangible capital in the above measure

After that, I sort the data by `gvkey` and `year` and add a cumulative count within each `gvkey`: this is the age of the firm (assuming it appears in the dataset in every year of its existence). I then take the log of (age + 1) to produce `logage`.

In [72]:
# Left-join the Peters-Taylor Q data, then the ratings, onto annual fundamentals
merge_keys = ['gvkey', 'year']
data = data_fa.merge(data_q, on = merge_keys, how = 'left')
data = data.merge(data_r, on = merge_keys, how = 'left')

# Total capital (physical + intangible) and the intangible share of it
data.eval('''
    k_pt = ppent + k_int
    shareintan = k_int / k_pt''', inplace = True)

# Order by firm and year; the running row count within a firm is its age
data = data.sort_values(merge_keys)
obs_number = data.groupby('gvkey').cumcount()
data['age'] = obs_number
data['logage'] = np.log(obs_number + 1)

## Aggregates pre-filtering

Here I compute some aggregates (by year), which are later used to produce a graph. These aggregates are:
- `a_capx_all_preEx`: sum of `capx`
- `a_capx_US_preEx`: sum of `capx`, only for US firms
- `a_pay_preEx`: sum of `dvt` and `prstkc`
- `a_prstkc_preEx`: sum of `prstkc`
- `a_at_preEx`: sum of `at`
- `a_paya_preEx = a_pay_preEx / a_at_preEx`
- `a_bba_preEx = a_prstkc_preEx / a_at_preEx`

**Note**: Check the thing about goodwill later

In [73]:
def _yearly_totals_pre_ex(x):
    """Return the pre-exclusion sums for one year's rows.

    Every sum uses min_count = 1, so a year with no non-missing value
    yields NaN instead of 0.
    """
    us_rows = x.query('fic == "USA"')
    payouts = x.eval('dvt + prstkc')

    return pd.Series({
        'a_capx_all_preEx': x['capx'].sum(min_count = 1),
        'a_capx_US_preEx': us_rows['capx'].sum(min_count = 1),
        'a_pay_preEx': payouts.sum(min_count = 1),
        'a_prstkc_preEx': x['prstkc'].sum(min_count = 1),
        'a_at_preEx': x['at'].sum(min_count = 1)
    })

# One row of aggregates per year
data_agg = data.groupby('year').apply(_yearly_totals_pre_ex)

# Ratios of the aggregates: payouts and buybacks over total assets
data_agg.eval('''
    a_paya_preEx = a_pay_preEx / a_at_preEx
    a_bba_preEx = a_prstkc_preEx / a_at_preEx''', inplace = True)

## Filtering and aggregates

Finally, I do some filtering:
1. Drop all entries where `year`, `gvkey`, `be`, `me`, `bliab` or `q` are NA, as well as those where `at` is smaller than 1 (million), and where either `be` or `me` is not strictly positive.
2. Drop all entries with years smaller or equal to 1961, and drop a peculiar case with `gvkey` 4828 and `year` 2001.
3. Drop all entries with `sic` in \[4900, 4999\] (utilities), \[6000, 6999\] (financials) or \[5300, 5399\] (real estate)
4. Keep only US companies (`fic = USA`)

At this point I also compute some aggregates (I do this right after step 3).

Finally, I assign the Fama-French categories to the SIC numbers (based on my own implementation).

In [74]:
# Drop NAs, filtering
data.dropna(subset = ['year', 'gvkey', 'be', 'me', 'bliab', 'q'], inplace = True)
# NOTE(review): `gvkey == 4828` compares against an integer; if gvkey is stored
# as a (zero-padded) string this clause never matches and the row is not
# dropped — TODO confirm the dtype of gvkey.
data.query('at >= 1 & be > 0 & me > 0 & year > 1961 & ~(gvkey == 4828 & year == 2001)', inplace = True)

# Exclude some sic codes
data.query('~sic.between(4900, 4999) & ~sic.between(6000, 6999) & ~sic.between(5300, 5399)', inplace = True)

# Some additional aggregates
# Same capx sums as the *_preEx aggregates, but computed after the filters
# above and before non-US firms are dropped.
k = data.groupby('year').apply(lambda x: pd.Series({
    'a_capx_all_wEx': x['capx'].sum(min_count = 1),
    'a_capx_US_wEx': x.query('fic == "USA"')['capx'].sum(min_count = 1),
}))

# Assignment aligns on the `year` index shared by data_agg and k
data_agg[['a_capx_all_wEx', 'a_capx_US_wEx']] = k

# Drop non-USA
data.query('fic == "USA"', inplace = True)

# Get the 10 category FF classification
# NOTE(review): FFCategory is given a URL, so it presumably downloads the
# definitions on every run — confirm, and consider caching.
ff = FFCategory('https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp/Siccodes10.zip')
data['ff10'] = ff.assign_ff(data.sic)

## Table 10

Here I reproduce the table 10 from the paper, which shows quantiles for different Q measures over two time periods: 1975 - 1980 and 2010 - 2015.

As you can see in the table below (and by comparing it with the one from the paper), we have almost exactly the same numbers, except for the extreme quantiles.

In [75]:
# Quantile levels reported in the table
quantiles = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# The three Q measures being compared
q_measures = ['q', 'q2', 'q_tot']

def _period_quantiles(period_query):
    """Quantiles of the Q measures, rounded to 1 decimal, for the rows matching `period_query`."""
    period_rows = data.query(period_query)
    return period_rows[q_measures].quantile(quantiles).round(1)

q1 = _period_quantiles('year.between(1975, 1980)')
q2 = _period_quantiles('year.between(2010, 2015)')

# Side-by-side table with the period as the outer column level
qs_table = pd.concat([q1, q2], keys = ['1975-1980', '2010-2015'], axis = 1)

# Persist, then display
qs_table.to_csv('Tables/Table10.csv')

qs_table

Unnamed: 0_level_0,1975-1980,1975-1980,1975-1980,2010-2015,2010-2015,2010-2015
Unnamed: 0_level_1,q,q2,q_tot,q,q2,q_tot
0.01,0.5,-4.8,-1.0,0.5,-6.3,-0.9
0.05,0.7,-1.5,-0.5,0.7,-0.6,-0.2
0.1,0.7,-0.8,-0.3,0.9,0.0,0.0
0.25,0.8,-0.2,-0.1,1.1,0.7,0.3
0.5,1.0,0.3,0.2,1.6,2.2,0.8
0.75,1.3,0.9,0.6,2.5,8.1,1.6
0.9,1.9,2.5,1.4,4.4,34.1,3.4
0.95,2.7,4.7,2.4,6.5,99.4,5.9
0.99,6.6,26.1,9.6,16.4,inf,23.6


## Segmentation mapping

Here I create some segmentation variables (mostly NAICS related).

### NAICS 3

I create a new variable `naics3` from the first 3 digits of `naics`. Where this is missing, I do the following assignment:
- For each sic and naics pair I count the number of entries appearing in it.
- For each sic, I take the NAICS with the highest count (from the previous step)
- When possible, I match this naics to corresponding SIC

Additionally, I also manually adjust some naics values.

In [76]:
# Naics to string
# astype(str) followed by slice(0, -2) drops the last two characters of each
# rendered value (presumably the trailing '.0' of a float — TODO confirm that
# naics is float at this point).
data['naics_str'] = data.naics.astype(str).str.slice(0, -2)
data.loc[data.naics.isna(), 'naics_str'] = np.nan

# Default NAICS3
# Only codes with at least 3 characters get a naics3; every other row is left
# NaN by index alignment of the assignment.
data['naics3'] = data.loc[data.naics_str.str.len() >= 3, 'naics_str'].str.slice(0,3)
data['naics3'] = data['naics3'].astype(float)

# Create the matching dictionary
# `agg` counts rows per (sic, naics3); nlargest(1) within each sic keeps the
# most frequent naics3 for that sic. The count column (named 0) is dropped.
agg = data.groupby(['sic', 'naics3']).size()
match_dict = agg.groupby(level = 0, group_keys = False).nlargest(1)
match_dict = match_dict.reset_index().drop(columns= [0])

# Reset index before merging
# (the merged frame below gets a fresh 0..n-1 index, so `data` needs the same
# index for the .loc assignment to line up row by row)
data = data.reset_index(drop=True)

# Match
# Only rows whose naics3 is still missing receive the sic-based fallback
naics3_matched = data[['sic']].merge(match_dict, on = 'sic', how = 'left')['naics3']
data.loc[data.naics3.isna(), 'naics3'] = naics3_matched

# Special matching
# Manual override for sic 3412 when no naics3 could be assigned
data.loc[data.naics3.isna() & (data.sic == 3412), 'naics3'] = 332

### BEA Codes

Here I map to BEA codes, segments and industry, using the provided excel and stata files. After that I merge with the computed BEA file, using the industry depreciation.

In [77]:
# Load the three mapping files and harmonize their key column names
indcode_rename = {'ind_short': 'indcode'}

bea_codes = pd.read_excel('Data/User inputs/NAICS2BEA.xlsx')
bea_codes = bea_codes.rename(columns={'naics': 'naics3'})

bea_segments = pd.read_stata('Data/Temp/levelkey.dta')
bea_segments = bea_segments.rename(columns=indcode_rename)

bea_industry = pd.read_stata('Data/Intermediate/BEA_industry.dta')
bea_industry = bea_industry.rename(columns=indcode_rename)

# From bea_industry keep only the keys and the a1_depk_* columns
depk_cols = [x for x in bea_industry.columns if 'a1_depk_' in x]
bea_industry = bea_industry[['indcode', 'year'] + depk_cols]

# Merge in sequence; no `on` is given, so each merge joins on the columns the
# two frames have in common. The inner join drops rows without a segment match.
data = data.merge(bea_codes, how = 'left')
data = data.merge(bea_segments, how = 'inner')
data = data.merge(bea_industry, how = 'left')

### NAICS 6 + 4

I create a new variable `naics6` from the first 6 digits of `naics`. Same adjustment for missing as in the case of `naics3`.
After this matching, I take the first 4 digits of `naics6` and create `naics4`.

In [78]:
# Create 6 letter substring of naics
# Codes shorter than 6 characters are treated as missing
data['naics6'] = data.naics_str.str.slice(0,6)
data.loc[data['naics6'].str.len() < 6, 'naics6'] = None
data['naics6'] = data['naics6'].astype(float)

# Create the matching dictionary
# `agg` counts rows per (sic, naics6); nlargest(1) within each sic keeps the
# most frequent naics6 for that sic. The count column (named 0) is dropped.
agg = data.groupby(['sic', 'naics6']).size()
match_dict = agg.groupby(level = 0, group_keys = False).nlargest(1)
match_dict = match_dict.reset_index().drop(columns= [0])

# Reset index before merging
# (needed so the merged frame's fresh 0..n-1 index lines up with `data`)
data = data.reset_index(drop=True)

# Match
# Only rows whose naics6 is still missing receive the sic-based fallback
naics6_matched = data[['sic']].merge(match_dict, on = 'sic', how = 'left')['naics6']
data.loc[data.naics6.isna(), 'naics6'] = naics6_matched

# Generate naics4
# Render the float as text, turn the literal 'nan' back into NaN, keep the
# first 4 characters and convert back to float.
data['naics4'] = data['naics6'].astype(str).replace('nan', np.nan).str.slice(0,4).astype(float)

## New variables

Here I create a bunch of new variables.

### Financing needs and issuance

Here I compute financing deficit and issuance following Frank and Goyal (2003).

In [79]:
# Function that mimics Stata's rowtotal
def rowtotal(data, cols, signs = None):
    """Row-wise signed sum of several columns, skipping missing values.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to add up.
    cols : list of str
        Names of the columns entering the sum.
    signs : sequence of numbers, optional
        One multiplier per entry of `cols` (typically +1 / -1). Lists, tuples
        and NumPy arrays are accepted. Defaults to +1 for every column.

    Returns
    -------
    pandas.Series
        For each row, the sum of `sign * value` over the non-missing columns.
        A row in which every column is missing yields NaN (min_count = 1).
    """
    # `is None` rather than `== None`: comparing a NumPy array to None with
    # `==` is element-wise and makes the `if` raise "truth value is ambiguous".
    if signs is None:
        signs = np.repeat(1, len(cols))
    else:
        signs = np.asarray(signs)

    # Broadcasting applies one sign per column before the row-wise sum
    result = (data[cols] * signs).sum(axis = 1, min_count = 1)

    return result

In [80]:
# Signed row totals of the cash-flow items. One version per statement format
# (scf); the number suffix names the format(s) the version applies to.
invdef13 = rowtotal(
    data,
    ['capx', 'ivch', 'aqc', 'fuseo', 'sppe', 'siv'],
    [1, 1, 1, 1, -1, -1]
)
invdef7 = rowtotal(
    data,
    ['capx', 'ivch', 'aqc', 'sppe', 'siv', 'ivstch', 'ivaco'],
    [1, 1, 1, -1, -1, -1, -1]
)

dnwc1 = rowtotal(data, ['wcapc', 'chech', 'dlcch'], [1, 1, 1])
dnwc23 = rowtotal(data, ['wcapc', 'chech', 'dlcch'], [-1, 1, -1])
dnwc7 = -rowtotal(
    data,
    ['recch', 'invch', 'apalch', 'txach', 'aoloch', 'chech', 'fiao', 'dlcch'],
    [1, 1, 1, 1, 1, -1, 1, 1]
)

incf13 = rowtotal(data, ['ibc', 'xidoc', 'dpc', 'txdc', 'esubc', 'sppiv', 'fopo', 'fsrco'])
incf7 = rowtotal(data, ['ibc', 'xidoc', 'dpc', 'txdc', 'esubc', 'sppiv', 'fopo', 'exre'])

# Pick the version matching each row's format code; any other code gives NaN
scf_1_to_3 = data['scf'].between(1, 3)
scf_is_7 = data['scf'] == 7

data['inv_def'] = np.select([scf_1_to_3, scf_is_7], [invdef13, invdef7], np.nan)

data['dnwc_def'] = np.select(
    [data['scf'] == 1, data['scf'].between(2, 3), scf_is_7],
    [dnwc1, dnwc23, dnwc7],
    np.nan
)

data['incf_def'] = np.select([scf_1_to_3, scf_is_7], [incf13, incf7], np.nan)

# Financing deficit and net debt / equity issuance
data.eval('''
    findef = dv + inv_def + dnwc_def - incf_def 
    ndebtiss = dltis - dltr
    neqiss = sstk - prstkc
''', inplace = True)

# Missing-value handling: a row missing any of the first three variables
# loses all three (dropna + index alignment); everything before 1971 is blanked.
new_vars = ['findef', 'ndebtiss', 'neqiss', 'inv_def', 'dnwc_def', 'incf_def']
issuance_vars = new_vars[:3]
data[issuance_vars] = data[issuance_vars].dropna()
data.loc[data['year'] < 1971, new_vars] = np.nan

In [81]:
# Order by firm and year so that a one-row shift is the previous observation
data = data.sort_values(['gvkey', 'year'])
data = data.reset_index(drop = True)

# A row's lag is invalid when the previous row belongs to a different firm
previous_gvkey = data['gvkey'].shift(1)
data['shift_not_ok'] = data['gvkey'].ne(previous_gvkey)

# Lagged total assets, blanked where the lag would cross firms
data['at_l'] = data['at'].shift(1).mask(data['shift_not_ok'])

# Scale the flow variables by the sum of current and lagged assets
data.eval('''
    cdat = dv/(at+at_l)
    invdefat = inv_def/(at+at_l)
    dwcat = dnwc_def/(at+at_l)
    dincfat = incf_def/(at+at_l)
    
    defat = findef/(at+at_l)
    diat = ndebtiss/(at+at_l)
    eiat = neqiss/(at+at_l)
    dfpct = diat/defat
    efpct = eiat/defat
''', inplace = True)

In [82]:
# Upper caps applied to selected ratios (values above the cap are set to it)
upper_caps = {
    'bba': 0.1,
    'paya': 0.1,
    'bbos': 2,
    'q': 10,
    'q2': 15,
    'q_tot': 10,
}
for name, cap in upper_caps.items():
    data[name] = data[name].clip(upper = cap)

# Winsorize within each year, using winsor's default settings
for name in ['q_tot', 'defat', 'diat', 'eiat', 'dfpct', 'efpct']:
    data[name] = data.groupby('year')[name].transform(winsor)

In [83]:
# Compute some aggregates
# For every variable add the mean and median within industry-year
# (a1m_ / a1med_ prefixes) and within year (am_ / amed_ prefixes).
var_agg = ['defat', 'diat', 'eiat', 'dfpct', 'efpct', 'cdat', 'invdefat', 'dwcat', 'dincfat']

# Build the group-bys once instead of four times per variable
by_ind_year = data.groupby(['indcode', 'year'])
by_year = data.groupby(['year'])

for var in var_agg:
    # String aliases call the NaN-skipping group-by reductions explicitly.
    # Passing np.mean / np.median relies on pandas silently substituting them,
    # which is deprecated; applied directly, np.median returns NaN for any
    # group that contains a missing value.
    data[f'a1m_{var}'] = by_ind_year[var].transform('mean')
    data[f'am_{var}'] = by_year[var].transform('mean')
    data[f'a1med_{var}'] = by_ind_year[var].transform('median')
    data[f'amed_{var}'] = by_year[var].transform('median')

### Core variables for investment analyses

Here I compute the following definitions:
1. Capx/PP&E
2. dIntan/Intan --> from Peters & Taylor
3. R&D/assets
4. (Capx + R&D)/assets
5. Net I/K 
6. dAT/AT

In [84]:
# Adjust xrd
# Missing R&D is treated as zero from here on
data['xrd'] = data['xrd'].fillna(0)

# Create a bunch of vars
# inv{i} is the investment flow and kdef{i} the capital stock it is scaled by,
# for definitions 1-4. inv2 is the row-to-row change in k_int.
data.eval('''
    inv1 = capx
    kdef1 = ppent
    inv2 = k_int.diff(1) 
    kdef2 = k_int
    inv3 = xrd
    kdef3 = at
    inv4 = capx + xrd 
    kdef4 = at
''', inplace = True)

# Before we proceed, let's fix the diff and some NAs
# (diff runs over the whole frame, so a firm's first row would otherwise
# difference against the previous firm's last row)
data.loc[data['shift_not_ok'], 'inv2'] = None

#Ok, we continue
# dp_used{i} is the depreciation subtracted for the net measures: the lagged
# stock times a BEA depreciation rate, or 0 for definitions 2 and 6.
# inv5 uses the already-fixed inv2, so it inherits its NaNs.
data.eval('''
    dp_used1 = kdef1.shift(1) * a1_depk_exip_bea
    dp_used2 = 0
    dp_used3 = kdef3.shift(1) * a1_depk_ip_bea
    dp_used4 = kdef4.shift(1) * a1_depk_all_bea
    dp_used5 = ppent.shift(1) * a1_depk_all_bea
    dp_used6 = 0
    
    inv5 = capx  + inv2
    kdef5 = ppent + k_int
    
    inv6 = at.diff()
    kdef6 = at
''', inplace = True)

# Fix the vars
# Blank every dp_used column (including the constant-0 ones) and inv6 on the
# first row of each firm
data.loc[data['shift_not_ok'], [f'dp_used{i}' for i in range(1,7)] + ['inv6']] = None

# Drop some stuff
# The BEA depreciation-rate columns are no longer needed after this point
data = data.loc[:,~data.columns.str.contains('a1_depk')]

In [85]:
# Actually compute those variables
# For each definition i = 1..6: gross and net investment over lagged capital
# (ik, nik), over operating surplus (ios, nios), and net investment over
# lagged market value (niv).
for i in range(1,7):
    data.eval(f'''
        ik{i} = inv{i}/kdef{i}.shift(1)
        nik{i} = (inv{i}-dp_used{i})/kdef{i}.shift(1)
        ios{i} = inv{i}/os
        nios{i} = (inv{i}-dp_used{i})/os
        niv{i} = (inv{i}-dp_used{i})/mv.shift(1)        
    ''', inplace = True)
    
    # Fix lags
    # (ios{i} is left alone: its formula contains no shifted term)
    data.loc[data['shift_not_ok'], [f'ik{i}', f'nik{i}', f'nios{i}', f'niv{i}']] = None
    
# Drop some vars
data.drop(columns = ['ik2', 'ik5', 'ik6', 'ios5', 'ios6'], inplace = True)

In [86]:
# More variables
# Logs, leverage, tax, cash-flow and growth measures. .shift() and .diff()
# inside the expression run over the whole (gvkey, year)-sorted frame, so the
# first row of each firm is repaired right after the eval.
data.eval('''
    logat = log(at)
    nblev = (bliab - che)/at
    txtoi = txt/oiadp
    txdba = txdb/at
    cf = ib + dp
    logsale = log(sale)
    dlogsale = logsale.diff()
    xrdat = xrd/at
    xrdsale = xrd/sale

    osk = os/ppent.shift()
    osat = os/at.shift()

    cfat = cf/at.shift()
    cfk = cf/ppent.shift()

    logemp = log(emp)
    logq = log(q)
    logq2 = log(q2)
    logppe = log(ppent)
    dlogemp = logemp.diff()
    dlogppe = logppe.diff()

    kemp = ppent/emp
    kemp_PT = k_pt/emp
''', inplace=True)

# Fix some shifts
# Only the variables built from a shift or diff are blanked on a firm's first row
fix_vars = ['dlogsale', 'osk', 'osat', 'cfat', 'cfk', 'dlogemp', 'dlogppe']
data.loc[data['shift_not_ok'], fix_vars] = None

  out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]
  out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]
  out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]


In [87]:
# Is the firm continuing - did it appear before 1995 and kept appearing in 2010
cnt_firm = lambda x: (x.min() <= 1995) & (x.max() >= 2010)
data['continuing'] = data.groupby('gvkey')['year'].transform(cnt_firm)

# Is the firm in manufacturing
data['manufacturing'] = data['naics3'].between(310, 340)

# Top X for at by year and indcode
# GroupBy.rank returns a Series on the original row index. The previous
# groupby(...).apply(lambda x: x.rank(...)) form prepends the group keys to the
# index on recent pandas versions and can no longer be assigned back.
# These flags used to be written to `top5`/`top3` and then overwritten by the
# `me`-based ones below; they are kept under their own names instead.
rank_at = data.groupby(['year', 'indcode'])['at'].rank(ascending = False)
data['top5_at'] = rank_at <= 5
data['top3_at'] = rank_at <= 3

# Top X for me by year and indcode
# `top5`/`top3` keep the meaning they ended up with before: ranked by `me`
rank_me = data.groupby(['year', 'indcode'])['me'].rank(ascending = False)
data['top5'] = rank_me <= 5
data['top3'] = rank_me <= 3

# Divestitures
# NOTE(review): comparisons with NaN never yield NaN, so .fillna(False) is a
# no-op here. In particular `do != 0` is True when `do` is missing, so doind is
# True for missing values — confirm whether that is intended.
data['doind'] = (data['do'] != 0).fillna(False)
data['sppeind'] = (data['sppe'] > 0).fillna(False)
data['sivind'] = (data['siv'] > 0).fillna(False)
data['aqcind'] = (data['aqc'] > 0).fillna(False)

# Sales of PP&E, investments and acquisitions scaled by lagged stocks; `ltd` is
# long-term debt plus the portion due within one year
data.eval('''
    sppek = sppe/ppent.shift(1)
    aldoat = aldo/at.shift(1)
    sivat = siv/at.shift(1)
    aqcat = aqc/at.shift(1)
    ltd = dltt + dd1
''', inplace = True)

# Fix lags
data.loc[data['shift_not_ok'], ['aldoat', 'sivat', 'aqcat', 'sppek']] = None

# Debt-due columns as a share of ltd, clipped to [0, 1]
data['dd1d'] = (data['dd1']/data['ltd']).clip(0,1)
data['dd3d'] = (data['dd3']/data['ltd']).clip(0,1)
data['dd5d'] = (data['dd5']/data['ltd']).clip(0,1)

# More row totals
# dd1 minus dd3 (resp. dd5), with a missing component treated as 0
data['dd3c'] = rowtotal(data, ['dd1', 'dd3'], [1,-1])
data['dd5c'] = rowtotal(data, ['dd1', 'dd5'], [1,-1])

In [88]:
# Goodwill and attributes
# Goodwill and intangibles over assets, plus foreign pre-tax income (`pifo`)
# measures: pifoadj treats missing pifo as 0, pifoind flags non-missing pifo.
data.eval(''' 
    gwa = gdwl/at
    intanexgw = intan - gdwl
    intanexgwat = intanexgw/at
    intanat = intan/at

    pifoadj = pifo.fillna(0)
    pifo_sh = pifo/pi
    pifoadj_sh = pifoadj/pi
    pifoind = pifo.notna()
''', inplace = True)

# Winsorize
# Every column whose name starts with one of the investment-rate prefixes.
# The group is non-capturing, (?:...), because str.contains emits a UserWarning
# for patterns with capture groups; the set of matched columns is unchanged.
cols = data.columns[data.columns.str.contains('^(?:ik|nik|niv|ios|nios)')]

for col in cols:
    data[col] = data.groupby('year')[col].transform(winsor)

# These ratios are winsorized within year with l = 0.01
cols = ['osk', 'payos', 'cfat', 'pifo_sh', 'pifoadj_sh', 'intanat', 'intanexgwat', 'xrdat']
for col in cols:
    data[col] = data.groupby('year')[col].transform(winsor, l = 0.01)

  from ipykernel import kernelapp as app


### Measures of dependence on external finance

In [89]:
# Row-to-row changes in payables, inventories and receivables
d_ap = data['ap'].diff()
d_invt = data['invt'].diff()
d_rect = data['rect'].diff()

# Cash flow, one version per statement format (scf 1-3 vs. scf 7)
cf_rz13 = data['fopt'] + d_ap - d_invt - d_rect
data['fopt2'] = rowtotal(data, ['ibc', 'dpc', 'txdc', 'esubc', 'sppiv', 'fopo'])
cf_rz7 = data['fopt2'] + d_ap - d_invt - d_rect

# The diffs are invalid on the first row of each firm
first_firm_row = data['shift_not_ok']
cf_rz13 = cf_rz13.mask(first_firm_row)
cf_rz7 = cf_rz7.mask(first_firm_row)

data['cf_rz'] = np.select(
    [data['scf'].between(1, 3), data['scf'] == 7],
    [cf_rz13, cf_rz7],
    np.nan
)

# 10 year rolling totals, with some conditions
# A row missing any of the four inputs contributes 0 to all four sums
rolling_inputs = ['ndebtiss', 'neqiss', 'capx', 'cf_rz']
data_temp = data[rolling_inputs].copy()
vars_not_ok = data_temp.isna().any(axis = 1)
data_temp.loc[vars_not_ok, :] = 0
gby = data_temp.groupby(data['gvkey'])

# Do the rolling sums - maybe min_periods should be added (set to 1?)
for name in rolling_inputs:
    data[f'cum_{name}'] = gby[name].rolling(10).sum().droplevel(0)

# Dependence ratios, all scaled by cumulative capx
data.eval('''
    extfindep_rz = (cum_capx - cum_cf_rz) / cum_capx
    exteqfindep_rz = cum_neqiss / cum_capx
    extdebtfindep_rz = cum_ndebtiss / cum_capx
''', inplace = True)

# Winsorize each ratio within year
for name in ['extfindep_rz', 'exteqfindep_rz', 'extdebtfindep_rz']:
    data[name] = data.groupby('year')[name].transform(winsor, l = 0.01)

# Bank dependence: no rating on record but positive long-term debt
data['bankdep'] = data['sprating'].isna() & (data['ltd'] > 0)

### Measures of volatility

Here I compute the volatility of the sales (for some reason, actually of the log difference of sales), and of stock returns. 

Stock returns are obtained from WRDS, specifically from `compd.secm` database. I take `gvkey, trt1m (AS ret), datadate` variables, filtering to those where `ret` is not NA, `prclm` is higher than zero and `iid` is "01". 

In [90]:
# Std deviation of dlogsale over centered 5- and 10-period rolling windows
gby = data.groupby('gvkey')['dlogsale']
for window in (5, 10):
    data[f'sig_g{window}'] = gby.rolling(window, center = True).std(ddof = 0).droplevel(0)

# Monthly stock returns from WRDS
data_ret = db.raw_sql('''
    SELECT gvkey, trt1m AS ret, datadate
    FROM compd.secm 
    WHERE trt1m IS NOT NULL AND prclm > 0 AND iid = '01' 
''')
data_ret['datadate'] = pd.to_datetime(data_ret['datadate'])

# Lookup of the (gvkey, indcode) pairs present in the main data
gvkey_indcode_dict = data[['gvkey', 'indcode']].drop_duplicates()

# Attach the industry code to every return; no `on` is given, so the merge
# joins on the columns the two frames share
data_ret = data_ret.merge(gvkey_indcode_dict, how = 'left')
data_ret = data_ret.sort_values(['gvkey', 'datadate'])

# Cross-sectional std of returns per industry and date ...
data_ret = data_ret.pivot_table(index = ['indcode', 'datadate'], values = ['ret'],
                                aggfunc = np.std)
data_ret = data_ret.reset_index()
data_ret = data_ret.sort_values(['indcode', 'datadate'])

# ... smoothed with a centered 6-observation moving average within industry
ret_by_industry = data_ret.groupby(['indcode'])['ret']
data_ret['a1_stocksig'] = ret_by_industry.rolling(6, center = True).mean().droplevel(0)

# Keep the December value of each year and merge it back into the main data
data_ret['year'] = data_ret['datadate'].dt.year
data_ret = data_ret.query('datadate.dt.month == 12')
data = data.merge(data_ret[['year', 'indcode', 'a1_stocksig']], how = 'left')

### Industry metrics for investment analyses

In [91]:
# Investment, capital and depreciation totals by industry-year (a1_) and by year (a_)
gby = data.groupby(['indcode', 'year'])
gby_a = data.groupby('year')

# (source column stem, output column stem)
stems = (('inv', 'inv'), ('kdef', 'kdef'), ('dp_used', 'dp'))

for i in range(1,7):
    for prefix, grouped in (('a1', gby), ('a', gby_a)):
        for source_stem, target_stem in stems:
            # min_count = 1: a group with no non-missing value sums to NaN, not 0
            data[f'{prefix}_{target_stem}{i}'] = grouped[f'{source_stem}{i}'].transform('sum', min_count = 1)

# incf_def net of the row total of its first four components
core_income = rowtotal(data, ['ibc', 'xidoc', 'dpc', 'txdc'])
data['cfother'] = data['incf_def'] - core_income

In [92]:
# B/S metrics and use of proceeds
# Total payouts per firm-year; `temp` is scratch space and is reused below
data['temp'] = data['dvt'] + data['prstkc']

gby = data.groupby(['indcode', 'year'])
gby_a = data.groupby('year')

vars_s = '''at emp bliab sale cogs xsga mv me k_int k_int_offbs ndebtiss
            neqiss dv findef inv_def dnwc_def ibc xidoc dpc txdc cfother
            che txt txdb oibdp cf gdwl cf_rz pifo pi capx ivch aqc sppe
            siv ivstch ivaco xrd ivaeq ivao intan invt dd1 dd3c dd5c ltd incf_def'''
vars_a = [x.strip() for x in vars_s.split(' ') if len(x) > 1]

# Industry-year (a1_) and year (a_) sums; min_count = 1 gives NaN rather than
# 0 for a group with no non-missing value
for var in vars_a:
    data[f'a1_{var}'] = gby[var].transform('sum', min_count = 1)
    data[f'a_{var}'] = gby_a[var].transform('sum', min_count = 1)

# Same sums for variables whose output name differs from the source column
data['a1_dp_cs'] = gby['dp'].transform('sum', min_count = 1)
data['a_dp_cs'] = gby_a['dp'].transform('sum', min_count = 1)
data['a1_os_cp'] = gby['os'].transform('sum', min_count = 1)
data['a_os_cp'] = gby_a['os'].transform('sum', min_count = 1)
data['a1_ppe'] = gby['ppent'].transform('sum', min_count = 1)
data['a_ppe'] = gby_a['ppent'].transform('sum', min_count = 1)
data['a1_pay'] = gby['temp'].transform('sum', min_count = 1)
data['a_pay'] = gby_a['temp'].transform('sum', min_count = 1)
data['a1_bb'] = gby['prstkc'].transform('sum', min_count = 1)
data['a_bb'] = gby_a['prstkc'].transform('sum', min_count = 1)

data['a1_logsale'] = np.log(data['a1_sale'])
data['a1_logat'] = np.log(data['a1_at'])

# k/emp + Peters & Taylor measures
data.eval('''
    a1_kemp = a1_ppe/a1_emp
    a_kemp = a_ppe/a_emp

    a1_shareintan_PT = a1_k_int/(a1_ppe+a1_k_int)
    a1_share_int_offbs = a1_k_int_offbs /a1_k_int

    a_shareintan_PT = a_k_int/(a_ppe+a_k_int)
    a_share_int_offbs = a_k_int_offbs /a_k_int
''', inplace = True)

# CAPX + RD (same as ik4) and adjusted Q
# wt_pifo is one minus the share of pifoadj in pi; it weights mv and at
data['wt_pifo'] = 1 - data['pifoadj']/data['pi']
data['temp'] = data['capx'] + data['xrd']
data['temp1'] = data['mv'] * data['wt_pifo']
data['temp2'] = data['at'] * data['wt_pifo']

# Re-create the group-bys now that the temp columns have been (re)defined
gby = data.groupby(['indcode', 'year'])
gby_a = data.groupby('year')

data['a_capxrd'] = gby_a['temp'].transform('sum', min_count = 1)

# Industry-year versions use `gby`. They previously used the year-only
# `gby_a`, which made a1_mvadj / a1_atadj exact copies of a_mvadj / a_atadj.
data['a1_mvadj'] = gby['temp1'].transform('sum', min_count = 1)
data['a1_atadj'] = gby['temp2'].transform('sum', min_count = 1)
data['a_mvadj'] = gby_a['temp1'].transform('sum', min_count = 1)
data['a_atadj'] = gby_a['temp2'].transform('sum', min_count = 1)

In [93]:
# Number of firms
# Counts per siccode-year (a1sic_), per indcode-year (a1_) and per year (a_).
# NOTE(review): `siccode` is not created in the visible part of this notebook —
# presumably it comes from one of the merged BEA files; confirm.
data['a1sic_N'] = data.groupby(['siccode', 'year'])['gvkey'].transform('count')
data['a1_N'] = data.groupby(['indcode', 'year'])['gvkey'].transform('count')
data['a_N'] = data.groupby('year')['gvkey'].transform('count')

data['a1sic_logN'] = np.log(data['a1sic_N'])
data['a1_logN'] = np.log(data['a1_N'])
data['a_logN'] = np.log(data['a_N'])

# Get bools for entry and exit
# entry = first row of a firm, exit = last row of a firm, in current row order
data['entry'] = 0
data.loc[data.groupby('gvkey').head(1).index, 'entry'] = 1

data['exit'] = 0
data.loc[data.groupby('gvkey').tail(1).index, 'exit'] = 1

# Exit rows whose dlrsn equals 1
data['exitMA'] = ((data['exit'] == 1) & (data['dlrsn'] == 1)).astype(int)

# Some agg entry/exit vars
gby_s = data.groupby(['siccode', 'year'])
gby = data.groupby(['indcode', 'year'])
gby_a = data.groupby('year')

data['a_entry'] = gby_a['entry'].transform('sum', min_count = 1)
data['a1_entry'] = gby['entry'].transform('sum', min_count = 1)
data['a1sic_entry'] = gby_s['entry'].transform('sum', min_count = 1)

data['a_exit'] = gby_a['exit'].transform('sum', min_count = 1)
data['a1_exit'] = gby['exit'].transform('sum', min_count = 1)
data['a1sic_exit'] = gby_s['exit'].transform('sum', min_count = 1)

data['a_exitMA'] = gby_a['exitMA'].transform('sum', min_count = 1)
data['a1_exitMA'] = gby['exitMA'].transform('sum', min_count = 1)
data['a1sic_exitMA'] = gby_s['exitMA'].transform('sum', min_count = 1)

# Rates = counts over the number of firms at the same aggregation level.
# The a1sic_ rates now divide the siccode-level counts by the siccode-level N;
# they previously used the indcode-level a1_entry / a1_exit as numerators.
data.eval('''
    a_entryrate = a_entry/a_N
    a_exitrate = a_exit/a_N
    a_exitMArate = a_exitMA/a_N

    a1_entryrate = a1_entry/a1_N
    a1_exitrate = a1_exit/a1_N
    a1_exitMArate = a1_exitMA/a1_N

    a1sic_entryrate = a1sic_entry/a1sic_N
    a1sic_exitrate = a1sic_exit/a1sic_N
''', inplace = True)

In [94]:
# Winsorizing
data['a1_entryrate'] = data.groupby('year')['a1_entryrate'].transform(winsor, l = 0.03)
data['a1_exitrate'] = data.groupby('year')['a1_exitrate'].transform(winsor, l = 0.03)

# Herfindal and lerner
# ss1* are each firm's share of its industry-year total; li is
# (oibdp - dp) / sale at firm, industry-year and year level.
data.eval('''
    ss1 = sale/a1_sale
    ss1_mv = mv/a1_mv
    ss1_cf = cf/a1_cf

    li = (oibdp - dp) / sale
    a1_li = (a1_oibdp - a1_dp_cs)/a1_sale
    a_li = (a_oibdp - a_dp_cs)/a_sale
''', inplace = True)

gby = data.groupby(['indcode', 'year'])

# Herfindahl index: sum of squared shares within industry-year
# (NaN when the group has no non-missing share, because min_count = 1)
data['herf_s'] = gby['ss1'].transform(lambda x: (x*x).sum(min_count = 1))
data['herf_mv'] = gby['ss1_mv'].transform(lambda x: (x*x).sum(min_count = 1))
data['herf_cf'] = gby['ss1_cf'].transform(lambda x: (x*x).sum(min_count = 1))

data['li'] = data.groupby('year')['li'].transform(winsor, l = 0.03)

# Concentration
# Sort descending on all three keys, so within each (indcode, year) the rows
# with the largest sales come first
data = data.sort_values(['indcode', 'year', 'sale'], ascending = False).reset_index(drop = True)
gby = data.groupby(['indcode', 'year'])

# Top X share in sales
# NOTE(review): the trailing .min() reduces the transformed Series to a single
# scalar, so every row gets the same value in each of these columns — confirm
# this is intended rather than a per-(indcode, year) result.
data['a1_cpcon1_sale'] = gby['sale'].transform(top_x_share, x = 1).min()
data['a1_cpcon2_sale'] = gby['sale'].transform(top_x_share, x = 2).min()
data['a1_cpcon4_sale'] = gby['sale'].transform(top_x_share, x = 4).min()
data['a1_cpcon8_sale'] = gby['sale'].transform(top_x_share, x = 8).min()
data['a1_cpcon20_sale'] = gby['sale'].transform(top_x_share, x = 20).min()
data['a1_cpcon50_sale'] = gby['sale'].transform(top_x_share, x = 50).min()

# Top X share in mv
# NOTE(review): rows are ordered by sale, not mv, at this point — presumably
# top_x_share does its own ordering; verify against utilities.
data['a1_cpcon4_mv'] = gby['mv'].transform(top_x_share, x = 4).min()
data['a1_cpcon8_mv'] = gby['mv'].transform(top_x_share, x = 8).min()
data['a1_cpcon20_mv'] = gby['mv'].transform(top_x_share, x = 20).min()
data['a1_cpcon50_mv'] = gby['mv'].transform(top_x_share, x = 50).min()

# Productivity
# Restore the (gvkey, year) order and rebuild the first-row flag, since the
# concentration sort above reordered the rows
data = data.sort_values(['gvkey', 'year']).reset_index(drop = True)
data['shift_not_ok'] = data['gvkey'] != data['gvkey'].shift(1)

# roc: oibdp over lagged ppent; a1sd_roc is its std within industry-year
data['roc'] = data['oibdp'] / data['ppent'].shift(1)
data.loc[data['shift_not_ok'], 'roc'] = None
data['a1sd_roc'] = data.groupby(['indcode', 'year'])['roc'].transform('std')

In [95]:
# Median and mean
vars_s = '''blev q logq q2 logq2 ik1 ik3 ik4 nik1 nik2 nik3 nik4 nik5 nik6 ios1 ios2 ios3 ios4 nios1 nios2 nios3 nios4 nios5 nios6 
            niv1 niv2 niv3 niv4 niv5 niv6 osk bba bbos paya payos gwa intanat intanexgwat dlogemp dlogppe logat cfat nblev txtoi dlogsale txdba 
            extfindep_rz exteqfindep_rz extdebtfindep_rz pifo_sh pifoadj_sh pifoind xrdat xrdsale sprating AAtoAAA BBBtoA bankdep age logage sig_g5 sig_g10
            sppeind sppek sivind sivat aqcind aqcat kemp li q_tot k_int k_int_offbs shareintan'''
# split() with no argument splits on any run of whitespace (spaces and
# newlines) and never yields empty strings. The previous split(' ') plus
# `len(x) > 1` filter also discarded the one-character name `q`, so its
# aggregates were never computed and the masking loop below created all-NaN
# `a1med_q` / `a1m_q` columns instead.
vars_a = vars_s.split()

gby = data.groupby(['indcode', 'year'])
gby_a = data.groupby('year')

for var in vars_a:
    # Industry-year mean and median
    data[f'a1m_{var}'] = gby[var].transform('mean')
    data[f'a1med_{var}'] = gby[var].transform('median')

    # Year-level mean and median (am_ previously held the median by mistake)
    data[f'am_{var}'] = gby_a[var].transform('mean')
    data[f'amed_{var}'] = gby_a[var].transform('median')

# Blank the industry-year statistics where the industry-year has fewer than 5 firms
for var in ['q', 'logq', 'ik1', 'nik1']:
    data.loc[data['a1_N'] < 5, f'a1med_{var}'] = None
    data.loc[data['a1_N'] < 5, f'a1m_{var}'] = None

In [101]:
# Merge with Bushee
# Left join on the columns the two frames share (no explicit key is given),
# keeping every row of `data`.
data_bushee = pd.read_stata('Data/Intermediate/bushee_firmmap.dta')
data = pd.merge(data, data_bushee, how = 'left')

# Firm counts
# populated = 1 when at least one column whose name contains 'pctsharetot' is non-missing
has_any_share = data.filter(like = 'pctsharetot').notna().any(axis = 1)
data['populated'] = has_any_share.astype(int)

# firmcount = number of populated rows in the industry-year
populated_by_ind_year = data.groupby(['indcode', 'year'])['populated']
data['firmcount'] = populated_by_ind_year.transform('sum')
 
# Compute aggregate metrics for analyses:
# gby groups by industry-year (a1 prefix), gby_a by year only (a prefix).
# Both are built once here; the merges inside the loop rebind `data`, so these groupbys
# keep reading the pre-loop frame, which already holds every pctshare* column and 'me'.
gby = data.groupby(['year', 'indcode'])
gby_a = data.groupby('year')

for type_inv in ['QIX', 'TRA', 'DED', 'NA']:
    # Unweighted median / mean by industry-year
    data[f'a1med_owntot{type_inv}'] = gby[f'pctsharetot{type_inv}'].transform('median')
    data[f'a1m_owntot{type_inv}'] = gby[f'pctsharetot{type_inv}'].transform('mean')

    data[f'a1med_ownins{type_inv}'] = gby[f'pctshareins{type_inv}'].transform('median')
    data[f'a1m_ownins{type_inv}'] = gby[f'pctshareins{type_inv}'].transform('mean')

    # Unweighted median / mean by year
    data[f'amed_owntot{type_inv}'] = gby_a[f'pctsharetot{type_inv}'].transform('median')
    data[f'am_owntot{type_inv}'] = gby_a[f'pctsharetot{type_inv}'].transform('mean')

    data[f'amed_ownins{type_inv}'] = gby_a[f'pctshareins{type_inv}'].transform('median')
    data[f'am_ownins{type_inv}'] = gby_a[f'pctshareins{type_inv}'].transform('mean')

    # Means weighted by 'me', one row per group. The lambdas are applied right away,
    # inside the same iteration, so they see the current type_inv.
    wt_a = lambda x: pd.Series({
        f'a_owntot{type_inv}': wt_mean(x[f'pctsharetot{type_inv}'], weights = x['me']),
        f'a_ownins{type_inv}': wt_mean(x[f'pctshareins{type_inv}'], weights = x['me'])
    })
    # The second key used to be a1_owntot as well: the duplicate dict key made the
    # pctshareins mean overwrite the pctsharetot one, and a1_ownins was never created.
    wt_a1 = lambda x: pd.Series({
        f'a1_owntot{type_inv}': wt_mean(x[f'pctsharetot{type_inv}'], weights = x['me']),
        f'a1_ownins{type_inv}': wt_mean(x[f'pctshareins{type_inv}'], weights = x['me'])
    })

    # Attach the per-group weighted means back to the firm-level rows
    data = data.merge(gby_a.apply(wt_a), right_index = True, left_on = 'year')
    data = data.merge(gby.apply(wt_a1), right_index = True, left_on = ['year', 'indcode'])

    # Set to null where there are less than 5 firms
    v_new = [f'a1med_owntot{type_inv}', f'a1m_owntot{type_inv}', f'a1med_ownins{type_inv}',
             f'a1m_ownins{type_inv}', f'amed_owntot{type_inv}', f'am_owntot{type_inv}',
             f'amed_ownins{type_inv}', f'am_ownins{type_inv}', f'a_owntot{type_inv}',
             f'a_ownins{type_inv}', f'a1_owntot{type_inv}', f'a1_ownins{type_inv}']

    data.loc[data['firmcount'] < 5, v_new] = None

# Shorten the firm-level ownership column names: pctshare<kind><type> -> own<kind><type>
own_renames = {
    'pctshareinsTRA': 'owninsTRA',
    'pctshareinsQIX': 'owninsQIX',
    'pctshareinsDED': 'owninsDED',
    'pctshareinsNA': 'owninsNA',
    'pctsharetotTRA': 'owntotTRA',
    'pctsharetotQIX': 'owntotQIX',
    'pctsharetotDED': 'owntotDED',
    'pctsharetotNA': 'owntotNA'
}
data.rename(columns = own_renames, inplace = True)

# Mean / median of pctinsown by industry-year (gby) and by year (gby_a)
insown_ind_year = gby['pctinsown']
insown_year = gby_a['pctinsown']

data['a1m_pctinsown'] = insown_ind_year.transform('mean')
data['a1med_pctinsown'] = insown_ind_year.transform('median')

data['am_pctinsown'] = insown_year.transform('mean')
data['amed_pctinsown'] = insown_year.transform('median')

# Row totals (via rowtotal) over the four investor-type columns
a1_owntot_cols = ['a1_owntotTRA', 'a1_owntotQIX', 'a1_owntotDED', 'a1_owntotNA']
a_owntot_cols = ['a_owntotTRA', 'a_owntotQIX', 'a_owntotDED', 'a_owntotNA']

data['a1_pctinsown'] = rowtotal(data, a1_owntot_cols)
data['a_pctinsown'] = rowtotal(data, a_owntot_cols)

In [32]:
# Final variables
gby_s = data.groupby(['siccode', 'year'])

# Means within each siccode-year: target column -> source column
sic_mean_cols = {
    'a1sicm_q': 'q',
    'a1sicm_osk': 'osk',
    'a1sicm_logage': 'logage',
    'a1sicm_logat': 'logat',
}
for target, source in sic_mean_cols.items():
    data[target] = gby_s[source].transform('mean')

# Totals within each siccode-year; min_count = 1 gives NaN (not 0) when a group
# has no valid value
sic_sum_cols = {
    'a1sic_mv': 'mv',
    'a1sic_at': 'at',
    'a1sic_sale': 'sale',
}
for target, source in sic_sum_cols.items():
    data[target] = gby_s[source].transform('sum', min_count = 1)

# Order rows by firm and year so that diff() below compares consecutive rows of a firm
data = data.sort_values(['gvkey', 'year']).reset_index(drop = True)
# True on the first row of every gvkey, where the previous row belongs to another firm
data['shift_not_ok'] = data['gvkey'] != data['gvkey'].shift(1)

# Logs of the siccode-year totals. These were previously computed twice (once inside
# eval and again with np.log); only the np.log version, which produced the final
# values, is kept.
data['a1sic_logsale'] = np.log(data['a1sic_sale'])
data['a1sic_logat'] = np.log(data['a1sic_at'])

# Firm-level logs and the row-to-row change in logmv
data.eval('''
    logmv = log(mv)
    logme = log(me)
    dlogmv = logmv.diff()
''', inplace = True)

# The first row of a firm has no valid previous row: blank the difference there
data.loc[data['shift_not_ok'],'dlogmv'] = None

gby = data.groupby(['indcode', 'year'])

# Mean absolute deviation of dlogmv around the group mean, by industry-year.
# Written out explicitly because the 'mad' reduction was deprecated in pandas 1.5 and
# removed in pandas 2.0; this is the same quantity (NaNs skipped).
data['a1mad_logmv'] = gby['dlogmv'].transform(lambda s: (s - s.mean()).abs().mean())

# Saving the data

In [33]:
# Earlier divisions left +/- infinity in some columns; turn those into missing values
infinite_values = [np.inf, -np.inf]
data.replace(infinite_values, np.nan, inplace = True)

# Write the firm-level panel to Stata format, without the DataFrame index
data.to_stata('Data/Intermediate/data_firm.dta', write_index=False)

/bbkinghome/tadej/miniconda3/envs/cf/lib/python3.7/site-packages/pandas/io/stata.py:2136: InvalidColumnName: 
Not all pandas column names were valid Stata variable names.
The following replacements have been made:

    b'do'   ->   _do

If this is not what you expect, please make sure you have Stata-compliant
column names in your DataFrame (strings only, max 32 characters, only
alphanumerics and underscores, no Stata reserved words)

