In [1]:
import multiprocessing as mp
import multiprocessing.dummy as mpd
import os
from functools import partial

import numpy as np
import pandas as pd
import wrds
from tqdm import tqdm_notebook as tqdm

from utilities import *

In [2]:
# Open the WRDS connection. The account name is taken from the WRDS_USERNAME
# environment variable when it is set, so the notebook can be run by other users
# without editing this cell; otherwise the original default account is used.
db = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'tadej'))

Loading library list...
Done


In [3]:
# Import data
# CRSP side: December rows of crspa.msf with mktcap = shrout * |prc| and the first
# 8 characters of ncusip, obtained by joining crspa.stocknames on permno.
# NOTE(review): the join uses permno alone, so a monthly row presumably pairs with
# every ncusip listed for that permno in stocknames - confirm this is intended.
select_str = '''
SELECT date_part('year', date) AS year, m.permco, 
       shrout * abs(prc) AS mktcap, substr(ncusip, 1, 8) AS cusip8
FROM crspa.msf AS m INNER JOIN crspa.stocknames AS n ON n.permno = m.permno
WHERE date_part('month', date) = 12 AND
      substr(ncusip, 1, 8) IS NOT NULL
'''

data_crsp = db.raw_sql(select_str)

# Bushee side: load only the needed holdings columns and rename the identifiers
# so that they line up with the CRSP extract (cusip -> cusip8, mgrno_mapped -> mgrno).
data_bushee = (
    pd.read_stata(
        'Data/Intermediate/bushee_detailed.dta',
        columns=['mgrno_mapped', 'cusip', 'year', 'shares', 'sole', 'shared', 'no'],
    )
    .rename(columns={'mgrno_mapped': 'mgrno', 'cusip': 'cusip8'})
)

In [4]:
# Collapse by sum: one row per (year, cusip8, mgrno).
# The value columns are selected with a list; tuple-style selection on a GroupBy
# (gby['a', 'b']) was deprecated in pandas 0.25 and raises in pandas 2.x.
data_bushee = (
    data_bushee
    .groupby(['year', 'cusip8', 'mgrno'])[['shares', 'sole', 'shared', 'no']]
    .sum()
    .reset_index()
)

# 'temp' is a scratch column: total reported shares per (year, cusip8)
data_bushee['temp'] = data_bushee.groupby(['year', 'cusip8'])['shares'].transform('sum')

# Drop those with shares less than 0.005 of the reported total.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the unfiltered frame (SettingWithCopyWarning).
data_bushee = data_bushee.loc[data_bushee['shares']/data_bushee['temp'] > 0.005, :].copy()

# Recompute share for those with more than 0.005, so ownshare sums to one
# over the managers kept for each (year, cusip8)
data_bushee['temp'] = data_bushee.groupby(['year', 'cusip8'])['shares'].transform('sum')
data_bushee['ownshare'] = data_bushee['shares']/data_bushee['temp']

# Voting share: (sole + shared) relative to its total over the kept managers.
# A (year, cusip8) whose total is zero yields a non-finite votshare.
data_bushee['temp'] = data_bushee['sole'] + data_bushee['shared']
data_bushee['temp'] = data_bushee.groupby(['year', 'cusip8'])['temp'].transform('sum')
data_bushee['votshare'] = (data_bushee['sole'] + data_bushee['shared'])/data_bushee['temp']

data_bushee = data_bushee.drop(columns = ['temp'])

# Merge (inner) on the two columns the frames share
data = data_bushee.merge(data_crsp, on = ['year', 'cusip8'])

In [5]:
# Sort by permco, mgrno, year and mktcap and keep only the largest mktcap within
# (permco, year, mgrno). sort_values places NaN last by default, which would make
# keep='last' retain a row with missing mktcap instead of the largest one, so
# missing values are sorted first explicitly.
data = data.sort_values(['permco', 'mgrno', 'year', 'mktcap'], na_position = 'first')\
           .reset_index(drop = True)
data = data.drop_duplicates(subset = ['permco', 'year', 'mgrno'], keep = 'last')

# Read in linking table, rename
link_table = pd.read_stata('Data/Raw inputs/linkingTable.dta')
link_table = link_table.rename(columns = {'lpermco': 'permco'})

# Replace NA in linkenddt with today's date (as good as any): a missing end date
# is treated as a link that is still in effect
link_table.loc[link_table['linkenddt'].isna(), 'linkenddt'] = pd.to_datetime('today')

In [6]:
# Merge link table (inner, on the columns the two frames share), then keep only
# the rows whose year falls inside the link's validity window
data = data.merge(link_table, how = 'inner')
data = data.query('linkdt.dt.year <= year & linkenddt.dt.year >= year')

# Sort by gvkey, year, mgrno and mktcap and keep only the largest mktcap within
# (gvkey, year, mgrno). NaN mktcap is sorted first explicitly: with the default
# (NaN last), keep='last' would retain a row with missing mktcap instead of the
# largest one.
data = data.sort_values(['gvkey', 'year', 'mgrno', 'mktcap'], na_position = 'first')\
           .reset_index(drop = True)
data = data.drop_duplicates(subset = ['gvkey', 'year', 'mgrno'], keep = 'last')

In [7]:
# Firm-level data: load the needed columns and discard rows that lack a firm
# identifier, an industry code or sales
data_firm = (
    pd.read_stata(
        'Data/Intermediate/data_firm.dta',
        columns=['gvkey', 'year', 'indcode', 'ss1', 'sale'],
    )
    .dropna(subset=['gvkey', 'indcode', 'sale'])
)

# Inner merge with the holdings data on their shared columns; sale was only
# needed for the missing-value filter above
data = data_firm.merge(data).drop(columns=['sale'])

In [8]:
# Modified HHI for a single group of firms
def compute_mHHI(data_name, share_var = 'ownshare'):
    '''
    Compute the modified HHI for one group.

    data_name is a pair (frame, key): frame holds the columns gvkey, mgrno, ss1
    and the column named by share_var; key labels the group in the result.

    Writing b_j for firm j's row of manager shares and s_j for its ss1, the value
    returned is the sum over all firm pairs (j, k) of
        s_j * s_k * (b_j . b_k) / (b_j . b_j).

    Returns a one-element pandas Series indexed by key.
    '''

    group_frame, group_key = data_name[0], data_name[1]

    # Firms in rows, managers in columns; a manager absent from a firm counts as 0
    holdings = group_frame.pivot_table(
        index = 'gvkey', columns = 'mgrno', values = share_var, fill_value = 0
    )

    # Firm-by-firm inner products b_j . b_k, with row j scaled by b_j . b_j
    overlap = holdings.dot(holdings.T)
    own_norm = holdings.pow(2).sum(axis = 1)
    weights = overlap.div(own_norm, axis = 0)

    # Firm-by-firm products s_j * s_k
    sales_share = group_frame[['gvkey', 'ss1']].drop_duplicates().set_index('gvkey')
    share_products = sales_share.dot(sales_share.T)

    # Element-wise product (aligned on gvkey labels), summed over every pair
    total = (weights * share_products).sum().sum()
    return pd.Series({group_key: total})

In [9]:
# One task per (indcode, year): each task is a (frame, key) pair, the shape
# compute_mHHI expects as its first argument
gby = data.loc[:, ['gvkey', 'ss1', 'mgrno', 'ownshare', 'votshare', 'indcode', 'year']]\
          .groupby(['indcode', 'year'])
groups = [(frame, key) for key, frame in gby]

# Modified HHI from ownership shares: fan the tasks out over one worker per CPU
# and stack the one-element results as they arrive (arrival order is irrelevant,
# every result carries its own key)
with mp.Pool(processes = mp.cpu_count()) as pool:
    own_results = pool.imap_unordered(compute_mHHI, groups)
    mHHI = pd.concat(tqdm(own_results, total = len(groups)))

HBox(children=(IntProgress(value=0, max=1744), HTML(value='')))




In [10]:
# Modified HHI from voting shares: same computation with share_var fixed to
# 'votshare', run over the same groups with one worker per CPU
compute_v_mHHI = partial(compute_mHHI, share_var = 'votshare')
with mp.Pool(processes = mp.cpu_count()) as pool:
    vote_results = pool.imap_unordered(compute_v_mHHI, groups)
    v_mHHI = pd.concat(tqdm(vote_results, total = len(groups)))

HBox(children=(IntProgress(value=0, max=1744), HTML(value='')))




In [11]:
# Name the two index levels and turn each result series into a one-column frame
mHHI = mHHI.rename_axis(['indcode', 'year']).to_frame('herf_mod')
v_mHHI = v_mHHI.rename_axis(['indcode', 'year']).to_frame('herf_votmod')

# Outer merge on the (indcode, year) index so a group present in only one of the
# two results is kept
hhi_data = pd.merge(mHHI, v_mHHI, how = 'outer', left_index = True, right_index = True)
hhi_data = hhi_data.reset_index()

# Keep 1980-2015 and save as stata
hhi_sample = hhi_data.query('year.between(1980, 2015)')
hhi_sample.to_stata('Data/Intermediate/mod_HHI_BEA.dta', write_index = False)