In [1]:
import pandas as pd
import numpy as np
from sklearn import set_config
from sklearn.model_selection import train_test_split, KFold
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from df_after_transform import df_after_transform
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SelectPercentile, f_regression
set_config(display="diagram")

In [2]:
ceo_df = pd.read_csv('../outputs/CEO_DF.csv')
bod_df = pd.read_csv('../outputs/BOD_DF.csv')

small_ceo = ceo_df[ceo_df['size_category'] == 'Small']
small_bod = bod_df[bod_df['size_category'] == 'Small']

In [3]:
ceo_df = small_ceo
ceo_df.drop(['GVKEY'], axis=1, inplace=True)
ceo_df = ceo_df.rename(columns={'YEAR':'year', 'TDC1':'tdc1', 'GENDER':'gender'})

In [4]:
c_best_k = 96
c_best_alpha = 0.01

b_best_k = 79
b_best_alpha = 202

In [5]:
# Small CEO regression
# Create boolean masks for the train and holdout periods
train_mask = ceo_df.loc[(ceo_df['year'] >= 2010) & (ceo_df['year'] <= 2016)]
holdout_mask = ceo_df.loc[(ceo_df['year'] >= 2017) & (ceo_df['year'] <= 2019)]

y = np.log(train_mask.tdc1)
train_mask['year'] = train_mask['year'] - 1
X = train_mask.drop('tdc1', axis=1)

VarY = holdout_mask[['signature_index', 'tdc1']]
VarY.to_csv('Saved_dfs/small_ceo_df.csv', index=False)

holdout_set = holdout_mask
holdout_X = holdout_set.drop(['signature_index', 'tdc1'], axis=1)
holdout_X['year'] = holdout_X['year'] - 1

rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

missing_cols = set(X_train.columns) - set(holdout_X.columns)
for col in missing_cols:
    holdout_X[col] = 0
holdout_X = holdout_X[X_train.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mask['year'] = train_mask['year'] - 1


In [6]:
numer_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

cat_pipe = make_pipeline(OneHotEncoder())

preproc_pipe = make_column_transformer(
    (numer_pipe, make_column_selector(dtype_include=np.number)),
    (cat_pipe, ['gender']),
    remainder="drop",
)

In [7]:
opt_pipe = make_pipeline(preproc_pipe, SelectPercentile(score_func=f_regression, percentile=c_best_k), Ridge(alpha=c_best_alpha))
opt_pipe.fit(X_train, y_train)

y_holdout_pred = opt_pipe.predict(holdout_X)

df_out = pd.DataFrame({'signature_index': VarY['signature_index'], 'prediction': np.exp(y_holdout_pred) })
df_out.to_csv('Saved_dfs/pred_small_ceo.csv', index=False)

In [8]:
bod_df = small_bod
bod_df.drop(['GVKEY', 'TOTAL_SEC'], axis=1, inplace=True)
bod_df = bod_df.rename(columns={'YEAR':'year'})

In [9]:
# bod_df.columns

In [10]:
#small BOD regression
# Create boolean masks for the train and holdout periods
train_mask = bod_df.loc[(bod_df['year'] >= 2010) & (bod_df['year'] <= 2016)]
holdout_mask = bod_df.loc[(bod_df['year'] >= 2017) & (bod_df['year'] <= 2019)]

y = np.log(train_mask.total_director_comp)
train_mask['year'] = train_mask['year'] - 1
X = train_mask.drop('total_director_comp', axis=1)

VarY = holdout_mask[['signature_index', 'total_director_comp']]
VarY.to_csv('Saved_dfs/small_bod_df.csv', index=False)

holdout_set = holdout_mask
holdout_X = holdout_set.drop(['signature_index', 'total_director_comp'], axis=1)
holdout_X['year'] = holdout_X['year'] - 1

rng = np.random.RandomState(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

missing_cols = set(X_train.columns) - set(holdout_X.columns)
for col in missing_cols:
    holdout_X[col] = 0
holdout_X = holdout_X[X_train.columns]

y_train = y_train.replace(-np.inf, 0)


  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_mask['year'] = train_mask['year'] - 1


In [11]:
numer_pipe = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

cat_pipe = make_pipeline(OneHotEncoder())

preproc_pipe = make_column_transformer(
    (numer_pipe, make_column_selector(dtype_include=np.number)),
    remainder="drop",
)


In [12]:
opt_pipe = make_pipeline(preproc_pipe, SelectPercentile(score_func=f_regression, percentile=b_best_k), Ridge(alpha=b_best_alpha))
opt_pipe.fit(X_train, y_train)

y_holdout_pred = opt_pipe.predict(holdout_X)

# create a dataframe with the parcel ID and the predicted values
df_out = pd.DataFrame({'signature_index': VarY['signature_index'], 'prediction': np.exp(y_holdout_pred) })
# merged_df = pd.merge(VarY, df_out, on='signature_index')
df_out.to_csv('Saved_dfs/pred_small_bod.csv', index=False)