In [None]:
import datetime as dt
import os as os
import warnings as warnings

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotnine as gg
import scipy as sp
import seaborn as sns

import sklearn as sk
import statsmodels.api as sm

from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble.partial_dependence import plot_partial_dependence

from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator, BayesianEstimator

# Notebook-wide display defaults: render up to 1000 rows of a frame and use
# plotnine's black-and-white theme for every figure.
pd.set_option('display.max_rows', 1000)
gg.theme_set(gg.theme_bw())

In [None]:
# Campaign-level data; the announcement date is parsed on load because later
# cells compare it against date cut-offs.
df = pd.read_csv(
    '../data/engineered_factset_campaign_data.csv',
    parse_dates=['campaign_announcement_date'],
)

# Lookup tables that are left-joined onto the campaign data further down.
(
    df_objective_mapping,
    df_value_mapping,
    df_governance_mapping,
    df_result_mapping,
) = (
    pd.read_csv(mapping_path)
    for mapping_path in (
        '../mapping/campaign_mapping.csv',
        '../mapping/value_demand_mapping.csv',
        '../mapping/governance_demand_mapping.csv',
        '../mapping/proxy_result_mapping.csv',
    )
)

def _flag_equal(column, expected):
    """Build an ``assign`` callable yielding 1 where ``column`` equals ``expected``, else 0."""
    return lambda frame: 1 * (frame[column] == expected).fillna(0)


# Derive 0/1 indicator columns and clipped numeric features. The steps stay
# chained one by one because later steps read columns created by earlier ones
# (the combined support flag, and campaign_return from the clipped return).
df = (
    df
    .assign(has_value_demand=lambda frame: 1 * frame['value_demand'].notnull())
    .assign(has_governance_demand=lambda frame: 1 * frame['governance_demand'].notnull())
    .assign(glass_lewis_support_indicator=_flag_equal('glass_lewis_support', 'Management'))
    .assign(iss_support_indicator=_flag_equal('iss_support', 'Management'))
    # 1 when either of the two support indicators above is 1
    .assign(independent_support_indicator=lambda frame: 1 * (
        frame['glass_lewis_support_indicator'] | frame['iss_support_indicator']
    ))
    .assign(poison_pill_indicator=_flag_equal('poison_pill_in_force_prior_to_announcement', "Yes"))
    .assign(poison_pill_adopted_indicator=_flag_equal('poison_pill_adopted_in_response_to_campaign', "Yes"))
    # limit the residual return to the range [-0.30, 0.30]
    .assign(cumulative_6m_residual_return=lambda frame: frame['cumulative_6m_residual_return'].clip(-0.30, 0.30))
    # missing ownership becomes 0, then values are limited to [0, 0.10]
    .assign(ownership_pecent_on_announcement=lambda frame: (
        frame['ownership_pecent_on_announcement'].fillna(0).clip(0, 0.10)
    ))
    # regression target: the clipped 6m residual return
    .assign(campaign_return=lambda frame: frame['cumulative_6m_residual_return'])
)

def _seat_result_label(gained):
    """Map the share of sought seats that was gained to a winner label.

    Any positive share is labelled 'Dissident', a share of zero or less
    'Management', and a missing share gets no label (None).
    """
    conditions = [
        gained >= 1,
        gained > 0,
        gained <= 0,
        gained.isnull(),
    ]
    labels = ['Dissident', 'Dissident', 'Management', None]
    return np.select(conditions, labels)


# Board-seat ratios. Each ratio is NaN when its denominator is not positive.
df = (
    df
    .assign(board_seats_percentage_sought=lambda frame: np.where(
        frame['total_number_of_board_seats'] > 0,
        (frame['number_of_board_seats_sought'] / frame['total_number_of_board_seats']).fillna(0),
        np.nan
    ))
    .assign(board_seat_percentage_gained=lambda frame: np.where(
        frame['number_of_board_seats_sought'] > 0,
        frame['number_of_board_seats_gained'].fillna(0) / frame['number_of_board_seats_sought'],
        np.nan
    ))
    .assign(board_seat_result_group=lambda frame: _seat_result_label(frame['board_seat_percentage_gained']))
)

# Left-join each lookup table on its key column so every campaign row is kept
# even when no mapping entry exists for it.
df = (
    df
    .merge(df_objective_mapping, how='left', on='campaign_objective_primary')
    .merge(df_value_mapping, how='left', on='value_demand')
    .merge(df_governance_mapping, how='left', on='governance_demand')
    .merge(df_result_mapping, how='left', on='proxy_campaign_winner_or_result')
)

In [None]:
# One-hot encode the grouped categorical columns into `<prefix>=<level>` columns.
#
# The source `*_group` columns are deliberately kept on `df`:
# `pd.get_dummies(df, columns=[...])` would replace them with the dummies, but
# the graphical-model section further down still reads
# `campaign_objective_group`, `value_demand_group`, `governance_demand_group`,
# `proxy_result_group` and `board_seat_result_group` from `df`.
dummy_specs = [
    # (source column, prefix of the generated dummy columns)
    ('campaign_objective_group', 'campaign_objective'),
    ('value_demand_group', 'value_demand'),
    ('governance_demand_group', 'governance_demand'),
    ('proxy_result_group', 'proxy_result'),
    ('board_seat_result_group', 'board_result'),
]

dummy_frames = [
    pd.get_dummies(
        df[source_column],
        prefix=prefix,
        prefix_sep='=',
        drop_first=True,   # first level is the reference category
        dummy_na=False     # missing values get all-zero dummies
    )
    for source_column, prefix in dummy_specs
]

# Drop dummy columns left over from a previous run of this cell so that
# re-running it does not create duplicate columns.
stale_dummy_columns = [
    column for column in df.columns
    if any(str(column).startswith(prefix + '=') for _, prefix in dummy_specs)
]

# Dummies are appended after the existing columns, in the order of dummy_specs.
df = pd.concat([df.drop(columns=stale_dummy_columns)] + dummy_frames, axis='columns')

In [None]:
# Number of campaigns with a non-missing share of sought board seats gained.
df.board_seat_percentage_gained.notnull().sum()

In [None]:
# Show a single row to inspect the columns available after encoding.
df.head(1)

In [None]:
# Frequency of each proxy result code, counting missing values as their own bucket.
df.proxy_result_code.value_counts(dropna=False)

In [None]:
# Summary statistics of the pre_12m_dividends column.
df.pre_12m_dividends.describe()

In [None]:
# Histogram of the pre_12m_dividends column.
df.hist(column='pre_12m_dividends', figsize=(12, 8))

# Linear Regression

In [None]:
def _columns_containing(token):
    """Return the names of all columns of ``df`` that contain ``token`` as a substring."""
    return [name for name in df.columns.tolist() if token in name]


# Explicitly named features, followed by every column matching one of the
# name patterns (the `<prefix>=` patterns pick up the one-hot dummy columns).
campaign_features = [
    'ownership_pecent_on_announcement',
    'past_return_successes',
    'independent_support_indicator'
]

pattern_features = (
    _columns_containing('used_') +
    _columns_containing('campaign_objective=') +
    _columns_containing('value_demand=') +
    _columns_containing('governance_demand=') +
    _columns_containing('proxy_result=') +
    _columns_containing('board_result=')
)

target_features = [
    # target
    'total_number_of_board_seats',
    'board_seats_percentage_sought',
    'poison_pill_indicator',
    'poison_pill_adopted_indicator',
    'pre_12m_earnings_yield',
    'beta'
]

x_columns = campaign_features + pattern_features + target_features

# Kept as a one-element list so it can be concatenated with x_columns.
y_column = ['campaign_return']

# Time-based split: campaigns announced before the cut-off form the training
# set, campaigns announced on or after it form the test set.
#
# A single cut-off is used for both sides. The previous pair of boundaries
# (`<= '2016-12-31'` / `>= '2017-01-01'`) silently dropped any announcement
# carrying a time of day on 2016-12-31, because the string is interpreted as
# midnight of that day. Rows with a missing date still end up in neither set.
TRAIN_TEST_SPLIT_DATE = '2017-01-01'

# Rows without a target value cannot be used for fitting or scoring.
df_lm = df.dropna(subset=y_column)

df_train = df_lm[df_lm.campaign_announcement_date < TRAIN_TEST_SPLIT_DATE]
df_test = df_lm[df_lm.campaign_announcement_date >= TRAIN_TEST_SPLIT_DATE]

# Features plus target for all rows with a target (train and test together).
df_full = df_lm[x_columns + y_column]
X_train, y_train = df_train[x_columns], df_train[y_column]
X_test, y_test = df_test[x_columns], df_test[y_column]

n_samples, n_features = X_train.shape

In [None]:
# Report how many observations survive the target filter and the split.
print("Full data set has {} observations".format(len(df)))
print("Regression data set has {} observations".format(len(df_full)))
print(f"Train Feature shape: {X_train.shape}")
print(f"Train Target shape: {y_train.shape}")
print(f"Test Feature shape: {X_test.shape}")
print(f"Test Target shape: {y_test.shape}")

In [None]:
# Column dtypes of the regression frame; later cells split features into
# float and non-float columns based on these.
df_full.dtypes

In [None]:
# Missing values per column of the regression frame.
df_full.isnull().sum()

In [None]:
# Infinite values per column. Both +inf and -inf are counted; comparing
# against np.inf alone would miss negative infinities.
df_full.isin([np.inf, -np.inf]).sum()

In [None]:
# First rows of the training features.
X_train.head()

In [None]:
# First rows of the training target (a one-column frame, since y_column is a list).
y_train.head()

# Linear Regression

### statsmodels

In [None]:
# Fit the OLS only on rows whose features are all present, then show the
# summary. add_constant supplies the intercept column.
complete_feature_rows = X_train.notnull().all(axis='columns')
X_train_clean = X_train[complete_feature_rows]
y_train_clean = y_train[complete_feature_rows]

ols_result = sm.OLS(y_train_clean, sm.add_constant(X_train_clean)).fit()
ols_result.summary()

### naive scikit

In [None]:
# Split the feature columns by dtype: float columns are treated as numeric,
# everything else as categorical. The target column is excluded from both.
float_dtype_mask = df_full.dtypes == float

categorical_features = [
    name for name, is_float in zip(float_dtype_mask.index, float_dtype_mask)
    if not is_float and name not in y_column
]
numeric_features = [
    name for name, is_float in zip(float_dtype_mask.index, float_dtype_mask)
    if is_float and name not in y_column
]

# Numeric columns: median imputation followed by standardisation.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Non-float columns: median imputation only.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

model = Pipeline([
    ("preprocessor", preprocessor),
    ("model", LinearRegression()),
])

# Fit on the training period and attach the predictions to the test rows.
fitted_model = model.fit(X_train, y_train)
y_predicted = fitted_model.predict(X_test)
df_predicted = df_test.assign(predicted_campaign_return=y_predicted)

In [None]:
# Pipeline.score delegates to the final estimator's score method; for
# LinearRegression that is R^2.
r2_train = fitted_model.score(X_train, y_train)
r2_test = fitted_model.score(X_test, y_test)

print("Training set score is {}".format(r2_train))
print("Test set score is {}".format(r2_test))

In [None]:
# Scatter of predicted against realized campaign return on the test rows.
predicted_vs_realized_plot = (
    gg.ggplot(df_predicted)
    + gg.geom_point(gg.aes(x='predicted_campaign_return', y='campaign_return'))
    + gg.theme(axis_text=gg.element_text(size=10, rotation=90), figure_size=(12, 6))
    + gg.labs(title="Predicted vs. Realized")
)
predicted_vs_realized_plot

In [None]:
# The ColumnTransformer emits its output block by block in the order of its
# `transformers` list: all numeric features first, then all categorical ones.
# The fitted coefficients follow that order, NOT the order of X_train.columns,
# so the labels must be built the same way. If the lengths ever disagree
# the DataFrame constructor below raises instead of mislabelling silently.
coefficient_labels = numeric_features + categorical_features

df_coefficients = (
    pd.DataFrame({
        'variable': coefficient_labels,
        # coef_ is 2-D because the target was passed as a one-column frame
        'coefficient': model.named_steps['model'].coef_.flatten().tolist(),
    })
    .sort_values('coefficient')
    # ordered categorical so the bars are drawn in coefficient order
    .assign(variable=lambda df: pd.Categorical(df.variable, categories=df.variable.tolist()))
)

(
    gg.ggplot(df_coefficients) +
    gg.geom_bar(gg.aes(x='variable', y='coefficient'), stat='identity', position='dodge') +
    gg.theme(axis_text=gg.element_text(size=8, rotation=0), figure_size=(6, 12)) +
    gg.coord_flip() +
    gg.labs(
        title = "Coefficients"
    )
)

### proper scikit

In [None]:
# Same dtype-based feature split as in the naive pipeline: float columns are
# numeric, everything else is categorical; the target is excluded.
float_dtype_mask = df_full.dtypes == float

categorical_features = [
    name for name, is_float in zip(float_dtype_mask.index, float_dtype_mask)
    if not is_float and name not in y_column
]
numeric_features = [
    name for name, is_float in zip(float_dtype_mask.index, float_dtype_mask)
    if is_float and name not in y_column
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

preprocessor = ColumnTransformer(transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features),
])

# Pipeline steps per model; every entry references the same preprocessor object.
dict_model_pipelines = {
    model_label: [
        ("preprocessor", preprocessor),
        ("model", estimator),
    ]
    for model_label, estimator in [
        ("Model 1 - Linear Regression", LinearRegression()),
        ("Model 2 - Ridge Regression", Ridge()),
        ("Model 3 - Lasso Regression", Lasso()),
    ]
}

# Regularisation strengths searched for Ridge and Lasso (1e-5 ... 1e5).
alpha_grid = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]

dict_model_parameter_grids = {
    "Model 1 - Linear Regression": {},
    "Model 2 - Ridge Regression": {'model__alpha': list(alpha_grid)},
    "Model 3 - Lasso Regression": {'model__alpha': list(alpha_grid)},
}

# Keyword arguments forwarded to GridSearchCV.
dict_grid_search_params = dict(scoring='r2', cv=5, refit=True, iid=True)

# Keyword arguments forwarded to cross_val_score.
dict_cv_params = dict(scoring='r2', cv=5)

In [None]:
# Result containers, all keyed by model name.
dict_models_baseline = {}    # pipeline fitted with default hyper-parameters
dict_grids = {}              # GridSearchCV.cv_results_
dict_models_optimized = {}   # pipeline refitted with the best grid parameters
dict_scores = {}             # train / validation / test scores, baseline vs tuned
dict_train_predicted = {}    # training rows with the tuned model's predictions
dict_test_predicted = {}     # test rows with the tuned model's predictions

for model_name, model_steps in dict_model_pipelines.items():

    print('running model {}...'.format(model_name))
    print('=' * 100)
    print('')

    # model parameters

    pipeline_baseline_parameters = list(model_steps)
    grid_search_parameters = dict_model_parameter_grids[model_name].copy()
    pipeline_optimized_parameters = list(model_steps)  # will be updated

    # baseline fit on full training data
    #
    # Every pipeline is built from a clone of the configured steps. Copying the
    # step list alone is shallow: the baseline and optimized pipelines would
    # share the same estimator objects, so tuning and refitting the optimized
    # pipeline would silently overwrite the stored baseline model.

    model_pipeline_basline = sk.base.clone(Pipeline(pipeline_baseline_parameters))
    model_pipeline_basline = model_pipeline_basline.fit(X_train, y_train)

    params_baseline = model_pipeline_basline.get_params(deep=True)

    score_baseline_train = model_pipeline_basline.score(X_train, y_train)
    score_baseline_validation = np.mean(cross_val_score(model_pipeline_basline, X_train, y_train, **dict_cv_params))
    score_baseline_test = model_pipeline_basline.score(X_test, y_test)

    print('simple fit on training sample')
    print('-' * 80)
    print('')
    print('full-sample baseline parameters: {}'.format(params_baseline['model']))
    print('full-sample baseline train score: {}'.format(score_baseline_train))
    print('full-sample baseline validation score: {}'.format(score_baseline_validation))
    print('full-sample baseline test score: {}'.format(score_baseline_test))
    print('')

    # grid fit on cv-samples of training data

    model_grid_pipeline = sk.base.clone(Pipeline(pipeline_baseline_parameters))
    model_grid = GridSearchCV(model_grid_pipeline, param_grid=grid_search_parameters, **dict_grid_search_params)
    model_grid = model_grid.fit(X_train, y_train)

    params_grid_best = model_grid.best_params_.copy()

    score_grid_train = model_grid.score(X_train, y_train)
    # cross-validating the grid search itself re-runs the search inside each outer fold
    score_grid_validation = np.mean(cross_val_score(model_grid, X_train, y_train, **dict_cv_params))
    score_grid_test = model_grid.score(X_test, y_test)

    print('cv fit on training sample')
    print('-' * 80)
    print('')
    print('grid-search-cv best parameters: {}'.format(params_grid_best))
    print('grid-search-cv best train score: {}'.format(score_grid_train))
    print('grid-search-cv best validation score: {}'.format(score_grid_validation))
    print('grid-search-cv best test score: {}'.format(score_grid_test))
    print('')

    # optimized fit on full training data

    model_pipeline_optimized = sk.base.clone(Pipeline(pipeline_optimized_parameters))
    model_pipeline_optimized.set_params(**params_grid_best)
    model_pipeline_optimized = model_pipeline_optimized.fit(X_train, y_train)

    params_optimized = model_pipeline_optimized.get_params(deep=True)

    score_optimized_train = model_pipeline_optimized.score(X_train, y_train)
    score_optimized_validation = np.mean(cross_val_score(model_pipeline_optimized, X_train, y_train, **dict_cv_params))
    score_optimized_test = model_pipeline_optimized.score(X_test, y_test)

    print('optimized fit on training sample')
    print('-' * 80)
    print('')
    print('full-sample optimized parameters: {}'.format(params_optimized['model']))
    print('full-sample optimized train score: {}'.format(score_optimized_train))
    print('full-sample optimized validation score: {}'.format(score_optimized_validation))
    print('full-sample optimized test score: {}'.format(score_optimized_test))
    print('')

    # final prediction

    y_train_predicted = model_pipeline_optimized.predict(X_train)
    y_test_predicted = model_pipeline_optimized.predict(X_test)

    df_train_predicted = df_train.assign(predicted_campaign_return=y_train_predicted).assign(model_name=model_name)
    df_test_predicted = df_test.assign(predicted_campaign_return=y_test_predicted).assign(model_name=model_name)

    # store
    dict_models_baseline[model_name] = model_pipeline_basline
    dict_grids[model_name] = model_grid.cv_results_
    dict_models_optimized[model_name] = model_pipeline_optimized
    dict_scores[model_name] = pd.DataFrame(
        columns=['IsTuned', 'Sample', 'Score'],
        data={
            'IsTuned': [
                False, False, False,
                True, True, True
            ],
            'Sample': [
                'Train', 'Validation', 'Test',
                'Train', 'Validation', 'Test'
            ],
            'Score': [
                score_baseline_train, score_baseline_validation, score_baseline_test,
                score_optimized_train, score_optimized_validation, score_optimized_test
            ]
        },
    ).set_index(['IsTuned', 'Sample'])
    dict_train_predicted[model_name] = df_train_predicted
    dict_test_predicted[model_name] = df_test_predicted

In [None]:
# One row per (Model, Sample), with baseline and tuned scores side by side.
df_scores = (
    pd.concat(dict_scores, names=['Model', 'IsTuned', 'Sample'])
    .unstack('IsTuned')
)

# Stack the per-model prediction frames; each already carries a model_name column.
df_train_predictions = pd.concat(list(dict_train_predicted.values()))
df_test_predictions = pd.concat(list(dict_test_predicted.values()))

In [None]:
# Predicted against realized campaign return on the training rows, coloured by model.
train_predicted_vs_realized_plot = (
    gg.ggplot(df_train_predictions)
    + gg.geom_point(gg.aes(x='predicted_campaign_return', y='campaign_return', color='model_name'))
    + gg.theme(axis_text=gg.element_text(size=10, rotation=90), figure_size=(12, 6))
    + gg.labs(title="Predicted vs. Realized")
)
train_predicted_vs_realized_plot

In [None]:
# The ColumnTransformer emits its output block by block in the order of its
# `transformers` list: all numeric features first, then all categorical ones.
# The fitted coefficients follow that order, NOT the order of X_train.columns,
# so the labels must be built the same way. If the lengths ever disagree
# the DataFrame constructor below raises instead of mislabelling silently.
coefficient_labels = numeric_features + categorical_features

df_coefficients = (
    pd.concat({
        model_name: pd.DataFrame({
            'variable': coefficient_labels,
            # flatten() covers both 1-D and 2-D coef_ arrays
            'coefficient': fitted_pipeline.named_steps['model'].coef_.flatten().tolist()
        })
        for model_name, fitted_pipeline in dict_models_optimized.items()
    }, names=['model_name', 'variable_id'])
    .reset_index()
    .sort_values(['model_name', 'coefficient'])
    # ordered categorical: variables appear in order of first occurrence after sorting
    .assign(variable=lambda df: pd.Categorical(df.variable, categories=df.variable.unique().tolist()))
)

(
    gg.ggplot(df_coefficients) +
    gg.geom_bar(gg.aes(x='variable', y='coefficient', fill='model_name'), stat='identity', position='dodge') +
    gg.theme(axis_text=gg.element_text(size=8, rotation=0), figure_size=(6, 12)) +
    gg.coord_flip() +
    gg.labs(
        title = "Coefficients"
    )
)

# Graphical Model

In [None]:
# Column dtypes of the full campaign frame.
df.dtypes

In [None]:
def _fill_missing_label(column):
    """Build an ``assign`` callable replacing missing values of ``column`` with '(Missing)'."""
    return lambda frame: frame[column].fillna('(Missing)')


# Discrete variables for the graphical model.
# This cell reads the `*_group` columns directly, so they must still be
# present on `df` when it runs.
df_pgm_data = (
    df
    .assign(campaign_objective_group=_fill_missing_label('campaign_objective_group'))
    .assign(value_demand_group=_fill_missing_label('value_demand_group'))
    .assign(governance_demand_group=_fill_missing_label('governance_demand_group'))
    .assign(ownership_exceeds_5_indicator=lambda frame: 1 * (
        frame['ownership_pecent_on_announcement'] > 0.05
    ).fillna(0))
    # 1 when either result column says "Management"; 0 otherwise, including
    # rows where both results are missing
    .assign(campaign_outcome_is_management=lambda frame: 1 * (
        (frame['proxy_result_group'] == "Management") | (frame['board_seat_result_group'] == "Management")
    ))
    .assign(campaign_return=lambda frame: frame['cumulative_6m_residual_return'])
    # a missing campaign_return compares False and therefore maps to 0 here,
    # so this indicator itself is never missing
    .assign(campaign_return_is_positive=lambda frame: 1 * (frame['campaign_return'] > 0))
)

In [None]:
# Observed binary inputs of the graphical model.
x_columns = [
    'ownership_exceeds_5_indicator',
    'poison_pill_indicator',
    'poison_pill_adopted_indicator',
    'independent_support_indicator',
]

# Intermediate outcome (y1) and final outcome (y2), kept as one-element lists
# so they can be concatenated with x_columns.
y1_column = ['campaign_outcome_is_management']
y2_column = ['campaign_return_is_positive']

all_columns = [*x_columns, *y1_column, *y2_column]

# Time-based split for the graphical model, using one cut-off for both sides
# so that no announcement timestamp can fall between the two sets.
PGM_SPLIT_DATE = '2017-01-01'

# Drop campaigns without a realised return. Dropping on the binary indicator
# alone has no effect: `campaign_return_is_positive` is computed as
# `1 * (campaign_return > 0)`, which is 0 (never NaN) when the return is
# missing, so those campaigns would be kept as "not positive".
df_pgm_labelled = df_pgm_data.dropna(subset=['campaign_return'] + y2_column)

df_train = (
    df_pgm_labelled[df_pgm_labelled.campaign_announcement_date < PGM_SPLIT_DATE]
    .loc[:, all_columns]
)
# The test frame omits the final outcome column (y2).
df_test = (
    df_pgm_labelled[df_pgm_labelled.campaign_announcement_date >= PGM_SPLIT_DATE]
    .loc[:, x_columns + y1_column]
)

In [None]:
# Missing values per column of the graphical-model training frame.
df_train.isnull().sum()

In [None]:
# First rows of the graphical-model training frame.
df_train.head()

In [None]:
# Campaign count per (campaign_outcome, campaign_return_bin) combination.
# NOTE(review): neither grouping column is created in the visible part of this
# notebook - presumably they come from the input CSV; confirm they exist.
df_pgm_data.groupby(['campaign_outcome', 'campaign_return_bin'])['campaign_id'].count()

In [None]:
# Mean campaign return per (campaign_outcome, campaign_return_bin) combination.
# NOTE(review): neither grouping column is created in the visible part of this
# notebook - presumably they come from the input CSV; confirm they exist.
df_pgm_data.groupby(['campaign_outcome', 'campaign_return_bin'])['campaign_return'].mean()

In [None]:
# Network structure: every observed input points at the intermediate outcome
# (y1), which in turn points at the final outcome (y2).
input_edges = [(x_column, y1_column[0]) for x_column in x_columns]
outcome_edges = [(y1_column[0], y2_column[0])]

model = BayesianModel(input_edges + outcome_edges)

# Estimate the conditional probability tables from the training frame.
model.fit(df_train, estimator=BayesianEstimator, complete_samples_only=True)

In [None]:
# Print every fitted conditional probability distribution, preceded by the
# name of the variable it belongs to.
for cpd in model.get_cpds():
    print(f"CPD of {cpd.variable}:")
    print(cpd)

In [None]:
# Nodes of the fitted network.
model.nodes()

In [None]:
# Directed edges of the fitted network.
model.edges()

In [None]:
# model.get_independencies()

In [None]:
# Draw a small, reproducible sample of test campaigns to predict on.
# The fixed random_state makes Restart & Run All show the same rows every
# time; the min() keeps the cell from raising when fewer than 5 rows exist.
df_test_sample = df_test.sample(n=min(5, len(df_test)), random_state=0)
df_test_sample

In [None]:
# Predict with the fitted network for the sampled rows and attach the result.
# NOTE(review): the sample holds only the inputs and the intermediate outcome,
# so the prediction presumably covers the remaining variable
# (campaign_return_is_positive) - confirm against the pgmpy version in use.
y_predicted = model.predict(df_test_sample)
df_test_predicted = df_test_sample.assign(campaign_return_predicted=y_predicted)
df_test_predicted.head()

In [None]:
# Predicted probabilities for the same sampled rows, as returned by
# predict_probability.
df_test_predicted_probabilities = model.predict_probability(df_test_sample)
df_test_predicted_probabilities

In [None]:
# Draw the structure of the fitted network with node labels.
# Uses the notebook-wide `plt` (matplotlib.pyplot) and `nx` imported in the
# first cell; re-importing pylab under the name `plt` here would rebind the
# global alias in the middle of the notebook.
fig, ax = plt.subplots(figsize=(12, 12))
nx.draw(model, with_labels=True, ax=ax)
plt.show()