In [1]:
import pandas as pd
import numpy as np
import hashlib
from ydata_profiling import ProfileReport 

# Importing CSV data pulled from WRDS

In [2]:
# Load the raw CEO compensation extract. index_col=False tells pandas not to
# use the first column as the row index.
ceo_comp_path = '../inputs/CEOComp.csv'
ceo_df = pd.read_csv(ceo_comp_path, index_col=False)

In [3]:
# Load the raw director (board) compensation extract.
director_comp_path = '../inputs/DirectorComp.csv'
bod_df = pd.read_csv(director_comp_path)

In [4]:
# Load the firm-level data (later cells take ticker, fiscal year and market
# value from it).
firm_data_path = '../inputs/Firm_data.csv'
firms_df = pd.read_csv(firm_data_path)

# Working CEO data

In [5]:
# Columns of the CEO extract that are kept for the rest of the notebook.
ceo_columns = [
    'GVKEY', 'TICKER', 'CEOANN', 'SALARY', 'BONUS', 'STOCK_AWARDS', 'OPTION_AWARDS',
    'OTHCOMP', 'TOTAL_CURR', 'AGE', 'TDC1', 'YEAR', 'BECAMECEO', 'JOINED_CO',
    'LEFTOFC', 'GENDER', 'CUSIP', 'NAICSDESC', 'SIC',
]

# Discard columns that contain no data at all. If one of `ceo_columns` is
# entirely empty it is removed here and the selection below raises KeyError.
ceo_df = ceo_df.dropna(axis=1, how='all')

# Keep only rows where CEOANN is populated (presumably the "was CEO this year"
# flag - confirm against the data dictionary), restricted to `ceo_columns`.
ceo_df = ceo_df.loc[ceo_df['CEOANN'].notna(), ceo_columns]

# Working BOD data

In [6]:
# Keep only the director-compensation fields used in the rest of the notebook.
bod_df = bod_df[['GVKEY', 'TICKER', 'TOTAL_SEC',
                 'OTHCOMP', 'NONEQ_INCENT', 'CASH_FEES',
                 'STOCK_AWARDS', 'OPTION_AWARDS', 'SPCODE',
                 'CUSIP', 'CONAME', 'YEAR']]

# Add up compensation packages for that ticker and that year

# Per-director pay column -> name of the matching firm-year total column.
total_names = {
    'TOTAL_SEC': 'total_director_comp',
    'OTHCOMP': 'total_OTHCOMP',
    'NONEQ_INCENT': 'total_NONEQ_INCENT',
    'CASH_FEES': 'total_CASH_FEES',
    'STOCK_AWARDS': 'total_STOCK_AWARDS',
    'OPTION_AWARDS': 'total_OPTION_AWARDS',
}

# One row per (TICKER, YEAR) holding the sum over all directors of that firm-year.
grouped_df = (
    bod_df.groupby(['TICKER', 'YEAR'])[list(total_names)]
    .sum()
    .reset_index()
)

# Attach the totals to every director row of the SAME ticker and year.
# Joining on TICKER alone would pair each row with the totals of every year of
# that ticker, so after de-duplication each firm-year would carry the totals of
# the ticker's first year. Renaming the totals before the merge avoids the
# _x/_y suffixes (and the stray YEAR_y column) altogether.
bod_df = bod_df.merge(
    grouped_df.rename(columns=total_names),
    on=['TICKER', 'YEAR'],
    how='left',
)

# Collapse to one row per firm-year. The total_* columns are identical across
# the rows of a firm-year; the per-director columns that remain are those of
# the first director listed for it.
bod_df = bod_df.drop_duplicates(subset=['TICKER', 'YEAR'], keep='first')

# Working Firms data

Tiny: companies with a market cap of less than $2 billion

Medium: companies with a market cap between 2 billion and 10 billion

Big: companies with a market cap between 10 billion and 100 billion

Huge: companies with a market cap greater than 100 billion

In [7]:
# Market-value tiers as (lower bound, upper bound) pairs.
# NOTE(review): the markdown above this cell lists four tiers
# (Tiny/Medium/Big/Huge) with different cut-offs; the three tiers below are
# what the code actually uses. Presumably the bounds are in the same units as
# `mkvalt` (millions of USD?) - confirm.
categories = {
    'Small': (0, 10_000),
    'Medium': (10_000, 200_000),
    'Large': (200_000, float('inf')),
}

# Pull ticker, fiscal year and market value from the firm data, renamed to the
# upper-case column names used by the compensation frames (TICKER / YEAR are
# the keys they are merged on).
take = firms_df[['tic', 'fyear', 'mkvalt']].rename(
    columns={'tic': 'TICKER', 'fyear': 'YEAR', 'mkvalt': 'MKVALT'}
)

In [8]:
# Attach the market value to each compensation frame. The inner join keeps only
# firm-years that are present in both the compensation data and `take`.
firm_year_keys = ['TICKER', 'YEAR']
ceo_df = ceo_df.merge(take, on=firm_year_keys, how='inner')
bod_df = bod_df.merge(take, on=firm_year_keys, how='inner')

# Bin edges are the lower bound of every tier plus +inf as the final edge;
# labels are the tier names in dict order. pd.cut uses its default
# right-closed intervals here, so a value equal to an edge falls in the lower
# tier and a value equal to the first edge gets no category.
size_bins = [bounds[0] for bounds in categories.values()] + [float('inf')]
size_labels = categories.keys()

ceo_df['size_category'] = pd.cut(ceo_df['MKVALT'], bins=size_bins, labels=size_labels)
bod_df['size_category'] = pd.cut(bod_df['MKVALT'], bins=size_bins, labels=size_labels)


In [9]:
def get_signature1(row_idx):
    """Return the MD5 hex digest of the printed form of row `row_idx` of `ceo_df`.

    `row_idx` is used positionally (`iloc`); `ceo_df` is looked up when the
    function is called, not when it is defined.
    """
    row_text = str(ceo_df.iloc[row_idx])
    return hashlib.md5(row_text.encode('utf-8')).hexdigest()


def get_signature2(row_idx):
    """Return the MD5 hex digest of the printed form of row `row_idx` of `bod_df`.

    `row_idx` is used positionally (`iloc`); `bod_df` is looked up when the
    function is called, not when it is defined.
    """
    row_text = str(bod_df.iloc[row_idx])
    return hashlib.md5(row_text.encode('utf-8')).hexdigest()


# Apply the signature helpers to each row to create a new column of signature
# indices.
# NOTE(review): the index *labels* are passed to the helpers and used as
# positions, which is only correct while each frame has a default 0..n-1 index.
# Re-running this cell also changes the digests, because the existing
# signature_index column becomes part of the hashed row text.
ceo_row_labels = ceo_df.index.to_series()
ceo_df['signature_index'] = ceo_row_labels.apply(get_signature1)

bod_row_labels = bod_df.index.to_series()
bod_df['signature_index'] = bod_row_labels.apply(get_signature2)

# Data frames being used in future steps

In [10]:
# Write both prepared frames to the outputs folder, without the row index, for
# use in later steps.
for _frame, _csv_path in (
    (ceo_df, '../outputs/CEO_DF.csv'),
    (bod_df, '../outputs/BOD_DF.csv'),
):
    _frame.to_csv(_csv_path, index=False)