In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from df_after_transform import df_after_transform
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectPercentile, f_regression
set_config(display="diagram")

In [2]:
ceo_df = pd.read_csv('../outputs/CEO_DF.csv')
bod_df = pd.read_csv('../outputs/BOD_DF.csv')

small_ceo = ceo_df[ceo_df['size_category'] == 'Small']
small_bod = bod_df[bod_df['size_category'] == 'Small']

In [3]:
small_bod.describe()

Unnamed: 0,GVKEY,TOTAL_SEC,OTHCOMP,NONEQ_INCENT,CASH_FEES,STOCK_AWARDS,OPTION_AWARDS,YEAR,YEAR_y,total_director_comp,total_OTHCOMP,total_NONEQ_INCENT,total_CASH_FEES,total_STOCK_AWARDS,total_OPTION_AWARDS,MKVALT
count,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0,1391.0
mean,59500.416247,234.340068,17.283288,0.462114,81.45622,106.519724,28.344439,2013.358016,2010.169662,1740.908252,108.294786,5.586007,596.187106,699.99073,316.654201,5996.205958
std,66289.233703,284.361777,249.244787,8.371065,58.228101,109.46888,86.096147,2.649244,0.830168,2020.31845,502.075775,88.575741,366.754493,1016.424766,1028.238249,2431.135393
min,1045.0,0.0,0.0,0.0,0.0,0.0,0.0,2010.0,2010.0,0.0,0.0,0.0,0.0,0.0,0.0,62.8917
25%,9203.0,164.7685,0.0,0.0,57.0,49.7485,0.0,2011.0,2010.0,1073.4,0.0,0.0,351.333,186.3295,0.0,4126.05445
50%,24731.0,215.009,0.0,0.0,78.75,100.01,0.0,2013.0,2010.0,1458.19,0.0,0.0,553.941,600.08,0.0,6163.617
75%,126296.0,267.7825,0.0,0.0,100.017,139.982,0.0,2015.0,2010.0,1989.438,39.591,0.0,808.204,958.17,346.824,8039.6325
max,316056.0,7733.603,7688.89,275.0,941.186,2575.027,1477.974,2019.0,2018.0,24450.872,5667.183,3025.0,3948.455,11953.109,11948.963,9984.5348


In [4]:
ceo_df = small_ceo
ceo_df.drop(['GVKEY'], axis=1, inplace=True)
ceo_df = ceo_df.rename(columns={'YEAR':'year', 'TDC1':'tdc1', 'GENDER':'gender'})

In [5]:
c_best_k = 96
c_best_alpha = 0.001 # old 0.01

b_best_k = 79 #64
b_best_alpha = 202 #118

In [6]:
# Small CEO regression
# Create boolean masks for the train and holdout periods
train_mask = ceo_df.loc[(ceo_df['year'] >= 2010) & (ceo_df['year'] <= 2016)]
holdout_mask = ceo_df.loc[(ceo_df['year'] >= 2017) & (ceo_df['year'] <= 2019)]

y = np.log(train_mask.tdc1)
train_mask['year'] = train_mask['year'] - 1
X = train_mask.drop('tdc1', axis=1)

VarY = holdout_mask[['signature_index', 'tdc1']]
VarY.to_csv('Saved_dfs/small_ceo_df.csv', index=False)

holdout_set = holdout_mask
holdout_X = holdout_set.drop(['signature_index', 'tdc1'], axis=1)
holdout_X['year'] = holdout_X['year'] - 1

rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

missing_cols = set(X_train.columns) - set(holdout_X.columns)
for col in missing_cols:
    holdout_X[col] = 0
holdout_X = holdout_X[X_train.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mask['year'] = train_mask['year'] - 1


In [7]:
numer_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

cat_pipe = make_pipeline(OneHotEncoder())

preproc_pipe = make_column_transformer(
    (numer_pipe, make_column_selector(dtype_include=np.number)),
    (cat_pipe, ['gender']),
    remainder="drop",
)

In [8]:
opt_pipe = make_pipeline(preproc_pipe, SelectPercentile(score_func=f_regression, percentile=c_best_k), Ridge(alpha=c_best_alpha))
opt_pipe.fit(X_train, y_train)

y_holdout_pred = opt_pipe.predict(holdout_X)

df_out = pd.DataFrame({'signature_index': VarY['signature_index'], 'prediction': np.exp(y_holdout_pred) })
df_out.to_csv('Saved_dfs/pred_small_ceo.csv', index=False)

In [9]:
bod_df = small_bod
bod_df.drop(['GVKEY', 'TOTAL_SEC'], axis=1, inplace=True)
bod_df = bod_df.rename(columns={'YEAR':'year'})

In [10]:
# bod_df.columns

In [11]:
#small BOD regression
# Create boolean masks for the train and holdout periods
train_mask = bod_df.loc[(bod_df['year'] >= 2010) & (bod_df['year'] <= 2016)]
holdout_mask = bod_df.loc[(bod_df['year'] >= 2017) & (bod_df['year'] <= 2019)]

y = np.log(train_mask.total_director_comp)
train_mask['year'] = train_mask['year'] - 1
X = train_mask.drop('total_director_comp', axis=1)

VarY = holdout_mask[['signature_index', 'total_director_comp']]
VarY.to_csv('Saved_dfs/small_bod_df.csv', index=False)

holdout_set = holdout_mask
holdout_X = holdout_set.drop(['signature_index', 'total_director_comp'], axis=1)
holdout_X['year'] = holdout_X['year'] - 1

rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

missing_cols = set(X_train.columns) - set(holdout_X.columns)
for col in missing_cols:
    holdout_X[col] = 0
holdout_X = holdout_X[X_train.columns]

y_train = y_train.replace(-np.inf, 0)


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mask['year'] = train_mask['year'] - 1


In [12]:
numer_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

cat_pipe = make_pipeline(OneHotEncoder())

preproc_pipe = make_column_transformer(
    (numer_pipe, make_column_selector(dtype_include=np.number)),
    remainder="drop",
)


In [13]:
opt_pipe = make_pipeline(preproc_pipe, SelectPercentile(score_func=f_regression, percentile=b_best_k), Ridge(alpha=b_best_alpha))
opt_pipe.fit(X_train, y_train)

y_holdout_pred = opt_pipe.predict(holdout_X)

# create a dataframe with the parcel ID and the predicted values
df_out = pd.DataFrame({'signature_index': VarY['signature_index'], 'prediction': np.exp(y_holdout_pred) })
# merged_df = pd.merge(VarY, df_out, on='signature_index')
df_out.to_csv('Saved_dfs/pred_small_bod.csv', index=False)