In [9]:
from ydata_profiling import ProfileReport
import pandas as pd
import numpy as np
import pandas_datareader as pdr  # to install: !pip install pandas_datareader
from datetime import datetime

In [10]:
url = 'https://github.com/LeDataSciFi/data/raw/main/Firm%20Year%20Datasets%20(Compustat)/firms2020.csv'
firms_df = pd.read_csv(url).drop_duplicates('tic')

bod_df = pd.read_csv('./Input_data/DirectorComp.csv')
ceo_df = pd.read_csv('./Input_data/CEOComp.csv')

In [11]:
ceo_df.dropna(axis=1, how='all', inplace=True)
# Calculate the total compensation for the CEO in that year
ceo_df = ceo_df[ceo_df['CEOANN'].notnull()]
ceo_df['TDC1'].describe() #only difference between TDC1 and TDC2 is the options preferences. TDC1 is bk value of options (Black- Scholes equation)

count      4811.000000
mean      11467.801272
std       10203.831261
min           0.001000
25%        6158.732500
50%        9658.115000
75%       14350.184000
max      280621.551000
Name: TDC1, dtype: float64

In [12]:
# ceo_df = ceo_df.drop(['CFOANN', 'PENSION_CHG', 'TOTAL_SEC', 'CHG_CTRL_PYMT', 'TOTAL_SEC_PCT', 'REPRICE', 'EXECRANK'], axis=1)
######trimmed this down to make it easier for regressions. We can look back if I missed any. 
ceo_df = ceo_df[['GVKEY', 'TICKER', 'CEOANN', 'SALARY', 'BONUS', 'STOCK_AWARDS', 'OPTION_AWARDS',
                 'OTHCOMP', 'TOTAL_CURR', 'AGE', 'TDC1', 'YEAR', 'BECAMECEO', 'JOINED_CO',
                 'LEFTOFC', 'GENDER', 'CUSIP', 'NAICSDESC', 'TICKER', 'SIC']] 
ceo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4811 entries, 0 to 26762
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   GVKEY          4811 non-null   int64  
 1   TICKER         4811 non-null   object 
 2   CEOANN         4811 non-null   object 
 3   SALARY         4811 non-null   float64
 4   BONUS          4811 non-null   float64
 5   STOCK_AWARDS   4811 non-null   float64
 6   OPTION_AWARDS  4811 non-null   float64
 7   OTHCOMP        4811 non-null   float64
 8   TOTAL_CURR     4811 non-null   float64
 9   AGE            4811 non-null   float64
 10  TDC1           4811 non-null   float64
 11  YEAR           4811 non-null   int64  
 12  BECAMECEO      4778 non-null   object 
 13  JOINED_CO      1307 non-null   object 
 14  LEFTOFC        3063 non-null   object 
 15  GENDER         4811 non-null   object 
 16  CUSIP          4811 non-null   object 
 17  NAICSDESC      4811 non-null   object 
 18  TICKER 

In [13]:
bod_df = bod_df[['GVKEY', 'TICKER', 'TOTAL_SEC', 'OTHCOMP', 'NONEQ_INCENT', 'CASH_FEES', 'STOCK_AWARDS', 'OPTION_AWARDS',
                 'SPCODE', 'CUSIP', 'CONAME', 'YEAR']]

In [14]:
grouped_df = bod_df.groupby(['TICKER', 'YEAR'])[['TOTAL_SEC', 'OTHCOMP', 
                                                 'NONEQ_INCENT', 'CASH_FEES', 
                                                 'STOCK_AWARDS', 'OPTION_AWARDS']].sum().reset_index()

bod_df = bod_df.merge(grouped_df, on='TICKER', how='left')

bod_df = bod_df.rename(columns={'YEAR_x': 'YEAR'})
bod_df.drop(['YEAR_y'], axis=1)

bod_df = bod_df.rename(columns={'TOTAL_SEC_x': 'TOTAL_SEC'})
bod_df = bod_df.rename(columns={'TOTAL_SEC_y': 'total_director_comp'})

bod_df = bod_df.rename(columns={'OTHCOMP_x': 'OTHCOMP'})
bod_df = bod_df.rename(columns={'OTHCOMP_y': 'total_OTHCOMP'})

bod_df = bod_df.rename(columns={'NONEQ_INCENT_x': 'NONEQ_INCENT'})
bod_df = bod_df.rename(columns={'NONEQ_INCENT_y': 'total_NONEQ_INCENT'})

bod_df = bod_df.rename(columns={'CASH_FEES_x': 'CASH_FEES'})
bod_df = bod_df.rename(columns={'CASH_FEES_y': 'total_CASH_FEES'})

bod_df = bod_df.rename(columns={'STOCK_AWARDS_x': 'STOCK_AWARDS'})
bod_df = bod_df.rename(columns={'STOCK_AWARDS_y': 'total_STOCK_AWARDS'})

bod_df = bod_df.rename(columns={'OPTION_AWARDS_x': 'OPTION_AWARDS'})
bod_df = bod_df.rename(columns={'OPTION_AWARDS_y': 'total_OPTION_AWARDS'})

In [19]:
# bod_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 486124 entries, 0 to 486123
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   GVKEY                486124 non-null  int64  
 1   TICKER               486124 non-null  object 
 2   TOTAL_SEC            486124 non-null  float64
 3   OTHCOMP              486124 non-null  float64
 4   NONEQ_INCENT         486124 non-null  float64
 5   CASH_FEES            486124 non-null  float64
 6   STOCK_AWARDS         486124 non-null  float64
 7   OPTION_AWARDS        486124 non-null  float64
 8   SPCODE               486124 non-null  object 
 9   CUSIP                486124 non-null  object 
 10  CONAME               486124 non-null  object 
 11  YEAR                 486124 non-null  int64  
 12  YEAR_y               486124 non-null  int64  
 13  total_director_comp  486124 non-null  float64
 14  total_OTHCOMP        486124 non-null  float64
 15  total_NONEQ_INCEN

In [20]:
unique_tickers = bod_df.drop_duplicates(subset=['TICKER', 'YEAR'], keep='first')

unique_tickers.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4792 entries, 0 to 486082
Data columns (total 19 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   GVKEY                4792 non-null   int64  
 1   TICKER               4792 non-null   object 
 2   TOTAL_SEC            4792 non-null   float64
 3   OTHCOMP              4792 non-null   float64
 4   NONEQ_INCENT         4792 non-null   float64
 5   CASH_FEES            4792 non-null   float64
 6   STOCK_AWARDS         4792 non-null   float64
 7   OPTION_AWARDS        4792 non-null   float64
 8   SPCODE               4792 non-null   object 
 9   CUSIP                4792 non-null   object 
 10  CONAME               4792 non-null   object 
 11  YEAR                 4792 non-null   int64  
 12  YEAR_y               4792 non-null   int64  
 13  total_director_comp  4792 non-null   float64
 14  total_OTHCOMP        4792 non-null   float64
 15  total_NONEQ_INCENT   4792 non-null  