In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [145]:
## Load the campaign data from the CSV file into `data`.
DATA_PATH = 'clean_factset_campaign_data.csv'
data = pd.read_csv(DATA_PATH)

In [146]:
## Preview the first five rows of the raw data.
data.head(n=5)

Unnamed: 0,campaign_id,campaign_announcement_date,campaign_title,campaign_objective_primary,value_demand,governance_demand,activist_campaign_tactic,activist_campaign_results,total_number_of_board_seats,number_of_board_seats_sought,...,6_months_post_announcement_date,1_year_post_announcement_date,18_months_post_announcement_date,18_months_pre_date_total_return,1_year_pre_date_total_return,6_months_pre_date_total_return,90_days_pre_date_total_return,6_months_post_date_total_return,1_year_post_date_total_return,18_months_post_date_total_return
0,0000054704C,2007-09-24,"Catalytica Energy Systems, Inc. / AWM Investme...",Vote/Activism Against a Merger,Block Acquisition/Agitate for Lower Price (Sha...,,Publicly Disclosed Letter to Board/Management,Campaign to vote against proposed merger with ...,0,0,...,2008-03-24,2008-09-24,2009-03-24,,,,,,,
1,0000396364C,2008-01-22,"Circuit City Stores, Inc. / Wattles Capital Ma...",Board Control,Review Strategic Alternatives,Remove Director(s),"Propose Binding Proposal, Publicly Disclosed L...",Proxy fight to remove and replace the board se...,12,5,...,2008-07-22,2009-01-22,2009-07-22,-0.007917,-0.007622,-0.006603,-0.00419,-0.004637,-0.009898,-0.009892
2,0000411278C,2012-05-29,Reading International Inc. / Capstone Equities...,Maximize Shareholder Value,"Breakup Company, Divest Assets/Divisions",Other Governance Enhancements,Publicly Disclosed Letter to Board/Management,"Capstone urged a breakup, saying that a sum-of...",0,0,...,2012-11-29,2013-05-29,2013-11-29,0.001575,0.002124,0.004033,0.002952,-0.000323,0.000459,0.002398
3,0000556550C,2008-03-24,"Coinstar, Inc. / Shamrock Partners Activist Va...",Board Representation,,Other Governance Enhancements,"Nominate Slate of Directors, Letter to Stockho...",Proxy fight for 3 of 7 seats settled in exchan...,7,3,...,2008-09-24,2009-03-24,2009-09-24,0.000496,-0.000649,-0.001156,-7e-05,0.001723,0.000214,0.001033
4,0000719478C,2011-08-05,"Arch Chemicals, Inc. / GAMCO Investors",13D Filer - No Publicly Disclosed Activism,,,,13D Filer - No Publicly Disclosed Activism,0,0,...,2012-02-05,2012-08-05,2013-02-05,0.006649,0.003445,0.003305,0.003133,,,


# 1. Data Processing

## 1.1 Scale Return

In [174]:
## Correction: multiply the last seven columns of `data` (the *_total_return
## columns) by 100 -- presumably converting fractions to percent.
## NOTE: this rescales `data` in place, so it must run exactly once after the
## data is loaded; running the cell again multiplies the returns by another 100.
return_columns = list(data.columns[-7:])
## One vectorized multiply instead of a Python-level lambda per element.
data[return_columns] = data[return_columns] * 100

## 1.2 X Variables

In [151]:
## X Variables Category
## Features describing the target company: price/earnings figures at
## announcement, trailing total returns, and the sector.
Target_Variables = [
    'price_at_announcement',
    'ltm_eps_at_announcement',
    'earnings_yield_at_announcement',
    '18_months_pre_date_total_return',
    '1_year_pre_date_total_return',
    '6_months_pre_date_total_return',
    '90_days_pre_date_total_return',
    'sector',
]
## Features describing the activist.
Activist_Variables = [
    'activist_id',
    'ownership_pecent_on_announcements',  # the column is spelled this way in the data
]
## Full feature list: target-company features first, then activist features.
X_Variables = [*Target_Variables, *Activist_Variables]

**Since we are predicting the campaign objective, we can only use variables that are finalized by the time activists approach the target companies. So we remove variables like 'in_force_prior_to_announcement_poison_pill', 'adopted_in_response_to_campaign_poison_pill', 'activist_campaign_tactic', and other campaign-related variables.**

In [152]:
## X Variables Type
## Categorical features are listed explicitly; every remaining X variable
## is treated as continuous (order follows X_Variables).
Categorical_Variables = ['sector', 'activist_id']
Continuous_Variables = [
    name for name in X_Variables
    if name not in Categorical_Variables
]

## 1.3 Y Variable

In [153]:
## Share of all rows in `data` accounted for by each primary campaign objective.
data['campaign_objective_primary'].value_counts().div(len(data))

Board Representation                                        0.222652
Maximize Shareholder Value                                  0.217114
13D Filer - No Publicly Disclosed Activism                  0.131230
Vote For a Stockholder Proposal                             0.105423
Board Control                                               0.082541
Vote/Activism Against a Merger                              0.058719
Hostile/Unsolicited Acquisition                             0.058092
Vote Against a Management Proposal                          0.037823
Enhance Corporate Governance                                0.028210
Public Short Position/Bear Raid                             0.018284
Support Dissident Group in Proxy Fight                      0.013896
Vote For a Management Proposal/Support Management           0.010762
Remove Director(s), No Dissident Nominee to Fill Vacancy    0.010135
Remove Officer(s)                                           0.005120
Name: campaign_objective_primary, 

In [204]:
## NOTE(review): `data_processed` is created in the "Y Variable" cell further
## down, so on a fresh kernel this cell only works after that cell has run.
data_processed['campaign_objective_primary'].value_counts()

Board Seats                                   2921
Maximize Shareholder Value                    2078
13D Filer - No Publicly Disclosed Activism    1256
Vote For a Stockholder Proposal               1009
Name: campaign_objective_primary, dtype: int64

In [211]:
## NOTE(review): `Campaign_Objective` is assigned in the "Y Variable" cell
## further down; on a fresh kernel that cell has to run before this one.
Campaign_Objective

['Board Seats',
 'Maximize Shareholder Value',
 '13D Filer - No Publicly Disclosed Activism',
 'Vote For a Stockholder Proposal',
 'Vote/Activism Against a Merger',
 'Hostile/Unsolicited Acquisition']

In [212]:
## Y Variable
## Keep only the 7 most frequent campaign objectives; per the shares printed
## above the smallest kept class is ~5.8% of campaigns and the largest
## dropped class is ~3.8%.
N_TOP_OBJECTIVES = 7
## Board Representation and Board Control are merged into one class.
BOARD_SEATS_MERGE = {'Board Representation': 'Board Seats',
                     'Board Control': 'Board Seats'}
TARGET_COLUMN = 'campaign_objective_primary'

## Index of the retained raw objective labels, most frequent first.
Campaign_Objectives = data[TARGET_COLUMN].value_counts().index[:N_TOP_OBJECTIVES]

## Rows with a retained objective, restricted to the features plus the target.
## .copy() makes data_processed an independent frame, so the assignments
## below (and in later cells) never write into a view of `data`.
keep_rows = data[TARGET_COLUMN].isin(Campaign_Objectives)
data_processed = data.loc[keep_rows, X_Variables + [TARGET_COLUMN]].copy()

## Merge Board Representation & Board Control
data_processed[TARGET_COLUMN] = data_processed[TARGET_COLUMN].replace(BOARD_SEATS_MERGE)

## Final class labels in frequency order of the raw labels. They are derived
## from the mapping (order-preserving de-duplication) instead of positional
## slices, so the list stays correct if the frequency ranking changes.
Campaign_Objective = list(dict.fromkeys(
    BOARD_SEATS_MERGE.get(label, label) for label in Campaign_Objectives))

* **Campaign Objectives:**
    - Board Representation                            
    - Maximize Shareholder Value                    
    - 13D Filer - No Publicly Disclosed Activism    
    - Vote For a Stockholder Proposal              
    - Board Control                              
    - Vote/Activism Against a Merger              
    - Hostile/Unsolicited Acquisition          

In [213]:
## Preview the first five rows of the modelling frame.
data_processed.head(n=5)

Unnamed: 0,price_at_announcement,ltm_eps_at_announcement,earnings_yield_at_announcement,18_months_pre_date_total_return,1_year_pre_date_total_return,6_months_pre_date_total_return,90_days_pre_date_total_return,sector,activist_id,ownership_pecent_on_announcements,campaign_objective_primary
0,,,,,,,,Producer Manufacturing,002HVP-E,0.219,Vote/Activism Against a Merger
1,4.55,-2.02,-0.443956,-79.17391,-76.22413,-66.030815,-41.898735,Retail Trade,006SZN-E,0.065,Board Seats
2,5.88,0.46,0.078231,15.748035,21.237123,40.33413,29.51542,Finance,00DFB7-E,,Maximize Shareholder Value
3,28.55,-0.8,-0.028021,4.963231,-6.485426,-11.555141,-0.695658,Consumer Services,007SD0-E,0.1339,Board Seats
4,46.5,2.29,0.049247,66.49214,34.4514,33.049644,31.327332,Process Industries,000KVL-E,0.0522,13D Filer - No Publicly Disclosed Activism


## 1.4 Missing Value

In [214]:
## Missing Value
## Number of missing entries in each column of the modelling frame.
data_processed.isna().sum()

price_at_announcement                 476
ltm_eps_at_announcement               966
earnings_yield_at_announcement       1106
18_months_pre_date_total_return       827
1_year_pre_date_total_return          687
6_months_pre_date_total_return        609
90_days_pre_date_total_return         586
sector                                  7
activist_id                            52
ownership_pecent_on_announcements    1765
campaign_objective_primary              0
dtype: int64

## 1.5 Feature Engineering

In [215]:
## Only keep the 200 activists with the most rows, and change the rest to
## 'others'. Rows with a missing activist_id also become 'others'.
N_TOP_ACTIVISTS = 200
OTHER_ACTIVIST = 'others'

## Leave the placeholder out of the ranking: otherwise re-running this cell
## lets 'others' occupy one of the 200 slots and demotes a real activist on
## every extra run.
activist_counts = (data_processed['activist_id']
                   .value_counts()
                   .drop(OTHER_ACTIVIST, errors='ignore'))
top200_activists = activist_counts.index[:N_TOP_ACTIVISTS]

## Vectorized replacement: keep ids in the top list, everything else
## (including NaN) becomes the placeholder.
data_processed['activist_id'] = data_processed['activist_id'].where(
    data_processed['activist_id'].isin(top200_activists), OTHER_ACTIVIST)

In [216]:
## Row counts per activist after grouping the long tail into 'others'.
data_processed['activist_id'].value_counts()

others      3914
000KVL-E     557
000D7G-E     119
001TNS-E     112
0B8HNJ-E     112
            ... 
0FYLYX-E       6
0044B3-E       6
003KZ8-E       6
001TLD-E       6
0GTQZB-E       6
Name: activist_id, Length: 201, dtype: int64

# 2. Modeling

## 2.1 Split Training Set & Test Set

In [217]:
## split to training set and test set
## 80% of the rows go to training, 20% are held out; random_state fixes the shuffle.
target_column = 'campaign_objective_primary'
y = data_processed[target_column]
X = data_processed.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

## 2.2 Data Process Pipeline

In [218]:
## process pipeline
## Continuous features: fill missing values with the median, then standardise.
continuous_steps = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler())
## Categorical features: fill missing values with the most frequent category,
## then one-hot encode; handle_unknown='ignore' means categories unseen during
## fit do not raise at transform time.
categorical_steps = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'))
## Apply each sub-pipeline to its own group of columns.
process_scale = make_column_transformer(
    (continuous_steps, Continuous_Variables),
    (categorical_steps, Categorical_Variables))

## 2.3 Logistic Regression Classifier

In [233]:
## Logistic Regression
## Multinomial logistic regression on top of the shared preprocessing step.
lr_classifier = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    max_iter=10000,
    verbose=0)
pipeline_lr = make_pipeline(process_scale, lr_classifier)
## Grid Search
## Tune the inverse regularisation strength C over 10^-3 ... 10^3 with
## 5-fold cross-validation, selecting on accuracy.
param_grid_lr = {'logisticregression__C': np.logspace(-3, 3, 7)}
grid_lr = GridSearchCV(
    estimator=pipeline_lr,
    param_grid=param_grid_lr,
    scoring='accuracy',
    cv=5)

In [234]:
## Run the logistic-regression grid search on the training split
## (labels passed as a flat 1-D array).
grid_lr.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [235]:
## Test Score
## Accuracy of the best logistic-regression pipeline on the held-out split.
grid_lr.score(X_test, y_test.values.ravel())

0.5378652355396542

## 2.4 Random Forest Classifier

In [236]:
## Random Forest
## Random forest on top of the shared preprocessing step; random_state fixes
## the forest's randomness so repeated fits are reproducible.
pipeline_rf = make_pipeline(process_scale,
                            RandomForestClassifier(random_state=1))
## Grid Search
## Tune the number of trees and the number of features tried per split with
## 3-fold cross-validation, selecting on accuracy.
## 'auto' is intentionally not in the grid: for a classifier it was an alias
## of 'sqrt', so it only repeated the 'sqrt' candidates (a third of the
## fits), and newer scikit-learn releases no longer accept it.
param_grid_rf = {'randomforestclassifier__n_estimators': [100, 300, 500],
                 'randomforestclassifier__max_features': ['sqrt', 'log2']}
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=3, scoring='accuracy')

In [237]:
## Run the random-forest grid search on the training split
## (labels passed as a flat 1-D array).
grid_rf.fit(X_train, y_train.values.ravel())

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [238]:
## Accuracy of the best random-forest pipeline on the held-out split.
grid_rf.score(X_test, y_test.values.ravel())

0.5533691115086464

In [241]:
## Class labels, shown here because the next cell passes this list as
## `labels=` to confusion_matrix.
Campaign_Objective

['Board Seats',
 'Maximize Shareholder Value',
 '13D Filer - No Publicly Disclosed Activism',
 'Vote For a Stockholder Proposal',
 'Vote/Activism Against a Merger',
 'Hostile/Unsolicited Acquisition']

In [242]:
## Confusion matrix of the tuned random forest on the held-out split;
## rows are true classes and columns are predicted classes, both ordered
## as in Campaign_Objective.
rf_test_predictions = grid_rf.predict(X_test)
confusion_matrix(y_test.values.ravel(), rf_test_predictions,
                 labels=Campaign_Objective)

array([[447,  99,  22,  11,   5,  16],
       [204, 149,  25,  15,   6,   4],
       [ 53,  22, 183,   1,   2,   0],
       [ 49,  17,   5, 124,   0,   5],
       [ 58,  21,   8,   4,  12,   4],
       [ 73,  13,   0,   4,   3,  13]])

**55% accuracy for 6-class classification.**

# 3. Next Step
* **X Variables:**            
    - Map activist_id to its AUM & a measure of past success  
    - Use daily stock values 
    - Take market performance into consideration                      
    - Extra Data Source
* **Y Variable:**   
    - Find a better way to summarize campaign objectives, since the differences between some objectives are ambiguous 
    - Imbalanced data
* **Feature Engineering:**
    - Transformation (polynomial/log/Box-Cox)                         
    - Dimension Reduction 
* **Modeling:**
    - PGM and other state-of-the-art models