In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTENC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

## Part 1 : Reading Data

In [3]:
data = pd.read_csv('new_data.csv')

In [4]:
data.columns

Index(['campaign_id', 'campaign_announcement_date', 'campaign_title',
       'campaign_objective_primary', 'value_demand', 'governance_demand',
       'activist_campaign_tactic', 'total_number_of_board_seats',
       'number_of_board_seats_sought', 'short_or_majority_or_full_slate',
       ...
       'post_12m_residual_return', 'post_18m_residual_return',
       'cumulative_6m_residual_return', 'cumulative_12m_residual_return',
       'cumulative_18m_residual_return', 'lagged_campaign_announcement_date',
       'past_return_successes', 'board_seats_percentage_sought',
       'board_seat_percentage_gained', 'board_seat_result_group'],
      dtype='object', length=119)

In [5]:
len(data.columns)

119

## Part 2 : Data Cleaning

In [6]:
for i in data.columns[-7:]:
    data[i] = data[i].apply(lambda x : 100 * x)

In [7]:
X_Variables = ['price_at_announcement', 'ltm_eps_at_announcement', 
                    'earnings_yield_at_announcement', 'pre_18m_stock_price', 
                    'pre_18m_price_to_earnings', 'pre_12m_price_to_earnings', 'pre_6m_price_to_earnings',
                    'pre_3m_price_to_earnings', 'pre_18m_dividends', 'pre_12m_dividends', 'pre_6m_dividends',
                    'pre_3m_dividends', 'pre_18m_total_return', 'pre_12m_total_return',
                    'pre_6m_total_return', 'pre_3m_total_return', 'sector',
                    'pre_6m_residual_return', 'pre_12m_residual_return', 'pre_18m_residual_return','past_return_successes']
Categorical_Variables = ['sector']
Continuous_Variables = [i for i in X_Variables if i not in Categorical_Variables]

In [8]:
# X_Variables =['price_at_announcement', 'ltm_eps_at_announcement', 
#                     'earnings_yield_at_announcement', 
#                     'pre_18m_stock_price', 'pre_12m_stock_price', 'pre_6m_stock_price','pre_3m_stock_price',
#                     'pre_18m_price_to_earnings', 'pre_12m_price_to_earnings', 'pre_6m_price_to_earnings',
#                     'pre_3m_price_to_earnings', 'pre_18m_dividends', 'pre_12m_dividends', 'pre_6m_dividends',
#                     'pre_3m_dividends', 'pre_18m_total_return', 'pre_12m_total_return',
#                     'pre_6m_total_return', 'pre_3m_total_return', 'sector',
#                     'pre_6m_residual_return', 'pre_12m_residual_return', 'pre_18m_residual_return',
#                    'pre_18m_price_return', 'pre_12m_price_return', 'pre_6m_price_return', 'pre_3m_price_return',
#                    'poison_pill_in_force_prior_to_announcement']

In [9]:
Categorical_Variables
Continuous_Variables

['price_at_announcement',
 'ltm_eps_at_announcement',
 'earnings_yield_at_announcement',
 'pre_18m_stock_price',
 'pre_18m_price_to_earnings',
 'pre_12m_price_to_earnings',
 'pre_6m_price_to_earnings',
 'pre_3m_price_to_earnings',
 'pre_18m_dividends',
 'pre_12m_dividends',
 'pre_6m_dividends',
 'pre_3m_dividends',
 'pre_18m_total_return',
 'pre_12m_total_return',
 'pre_6m_total_return',
 'pre_3m_total_return',
 'pre_6m_residual_return',
 'pre_12m_residual_return',
 'pre_18m_residual_return',
 'past_return_successes']

In [10]:
y_Varaibles = ["governance_demand"]
type(y_Varaibles)

list

In [11]:
governance_demand = ['Board Seats (activist group)','Other Governance Enhancements','Compensation Related Enhancements',
                    "Remove Director(s)","Social/Environmental/Political Issues","Remove Takeover Defenses","Add Independent Directors",
                     "Remove Officer(s)"
                    ]
governance_demand

['Board Seats (activist group)',
 'Other Governance Enhancements',
 'Compensation Related Enhancements',
 'Remove Director(s)',
 'Social/Environmental/Political Issues',
 'Remove Takeover Defenses',
 'Add Independent Directors',
 'Remove Officer(s)']

In [12]:
data.governance_demand = data.governance_demand.apply(lambda x : x if x in governance_demand else 'others')

In [13]:
data.governance_demand.value_counts() / len(data)

others                                   0.514471
Board Seats (activist group)             0.187128
Other Governance Enhancements            0.085049
Compensation Related Enhancements        0.069167
Remove Director(s)                       0.039599
Social/Environmental/Political Issues    0.037196
Remove Takeover Defenses                 0.031031
Add Independent Directors                0.028315
Remove Officer(s)                        0.008045
Name: governance_demand, dtype: float64

In [14]:
gover_demand_dict = {
    "Remove Officer(s)" : "Officer_demand",
    "Add Independent Directors" : "Officer_demand",
    "Remove Director(s)" : "Officer_demand",
    "Other Governance Enhancements" : "operations",
    "Compensation Related Enhancements" : "operations",
    'Social/Environmental/Political Issues': "operations",
    "Remove Takeover Defenses" : "operations",
    'Board Seats (activist group)' : "Board",
    "others" : "others"
}

In [15]:
data["governance_demand_group"] = data.governance_demand.apply(lambda x : gover_demand_dict[x])

In [16]:
data.governance_demand_group.value_counts()/len(data)

others            0.514471
operations        0.222443
Board             0.187128
Officer_demand    0.075959
Name: governance_demand_group, dtype: float64

In [17]:
data2 = data[data['governance_demand_group'] != 'others']

In [18]:
process_not_scale = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='median')), 
     Continuous_Variables),
    (make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore')), 
     Categorical_Variables))

In [19]:
process_scale = make_column_transformer(
    (make_pipeline(SimpleImputer(strategy='median'),StandardScaler()), 
     Continuous_Variables),
    (make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder(handle_unknown='ignore')), 
     Categorical_Variables))

In [20]:
data1 = data[data['governance_demand'] != 'others']

In [21]:
X = data.drop(columns=['governance_demand'])
y = data['governance_demand']

X1 = data1.drop(columns=['governance_demand'])
y1 = data1['governance_demand']

X2 = data2.drop(columns=['governance_demand_group'])
y2 = data2['governance_demand_group']

In [22]:
ys = pd.DataFrame(y,columns=['governance_demand'])
ys1 = pd.DataFrame(y1,columns=['governance_demand'])
ys2 = pd.DataFrame(y2,columns=['governance_demand_group'])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, ys, random_state=0, test_size=0.2)
#y_train.value_counts()

X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, ys1, random_state=0, test_size=0.2)
#y_train.value_counts()

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, ys2, random_state=0, test_size=0.2)

## Part 3 : Model Part

### Logistic Regression

In [22]:
def produce_confusion_matrix(x_test,y_test,model):
    pred = model.predict(x_test)
    matrix = confusion_matrix(y_test, pred, labels=y_test['governance_demand'].unique())
    precision = [matrix[i,i] / sum(matrix[:,i]) for i in range(matrix.shape[1])]
    recall = [matrix[i,i] / sum(matrix[i,:]) for i in range(matrix.shape[1])]
    d1 = pd.DataFrame(precision, columns=['Precision'], index = y_test['governance_demand'].unique())
    d2 = pd.DataFrame(recall, columns=['Recall'], index = y_test['governance_demand'].unique())
    precision_recall = pd.concat([d1,d2], axis=1)
    return matrix, precision_recall

In [29]:
# Logistic Regression
pipeline_lr = make_pipeline(process_scale, LogisticRegression(
    max_iter=10000, verbose=0, multi_class='multinomial', solver='newton-cg'))
## Grid Search
param_grid_lr = {'logisticregression__C': np.logspace(-3, 3, 7)}
grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='accuracy')
#X_train
#y_train
grid_lr.fit(X_train, y_train.values.reshape(-1,))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [31]:
grid_lr.score(X_test, y_test)
print("The best parameter for Logistic is {}".format(grid_lr.best_params_))
print("Training accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_train, y_train),5)))
print("Testing accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_test, y_test),5)))

The best parameter for Logistic is {'logisticregression__C': 0.1}
Training accuracy for Logistic Regression is 0.51998
Testing accuracy for Logistic Regression is 0.53055


In [32]:
# index = y_test['governance_demand'].unique()

In [33]:
conf, pre_recall = produce_confusion_matrix(X_test, y_test, grid_lr) 
conf

  after removing the cwd from sys.path.


array([[  0, 334,   0,   0,   2,   0,   0,   0,   2],
       [  1, 995,   0,   0,   3,   0,   0,   0,   1],
       [  0,  15,   0,   0,   0,   0,   0,   0,   0],
       [  0, 137,   0,   0,   4,   0,   0,   0,   1],
       [  0,  55,   0,   0,  19,   0,   0,   0,   0],
       [  0,  51,   0,   0,   0,   0,   0,   0,   0],
       [  0, 155,   0,   0,   3,   0,   0,   0,   0],
       [  0,  61,   0,   0,   0,   0,   0,   0,   0],
       [  0,  74,   0,   0,   0,   0,   0,   0,   2]])

In [34]:
pre_recall

Unnamed: 0,Precision,Recall
Board Seats (activist group),0.0,0.0
others,0.530101,0.995
Remove Officer(s),,0.0
Compensation Related Enhancements,,0.0
Social/Environmental/Political Issues,0.612903,0.256757
Add Independent Directors,,0.0
Other Governance Enhancements,,0.0
Remove Takeover Defenses,,0.0
Remove Director(s),0.333333,0.026316


### Eight types classfier (lg)

In [35]:
grid_lr.fit(X_train1, y_train1.values.reshape(-1,))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [36]:
#grid.score(X_test, y_test)
print("The best parameter for Logistic is {}".format(grid_lr.best_params_))
print("Training accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_train1, y_train1),5)))
print("Testing accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_test1, y_test1),5)))

The best parameter for Logistic is {'logisticregression__C': 1.0}
Training accuracy for Logistic Regression is 0.42965
Testing accuracy for Logistic Regression is 0.43118


In [37]:
conf, pre_recall = produce_confusion_matrix(X_test, y_test, grid_lr) 
conf

  after removing the cwd from sys.path.


array([[315,   0,   0,   0,   6,   0,   8,   0,   9],
       [882,   0,   0,  11,  31,   0,  40,   6,  30],
       [ 13,   0,   0,   0,   0,   0,   1,   0,   1],
       [114,   0,   0,   3,   8,   0,  12,   0,   5],
       [ 30,   0,   0,   2,  35,   0,   7,   0,   0],
       [ 43,   0,   0,   0,   4,   0,   1,   0,   3],
       [128,   0,   0,   0,  13,   0,  13,   0,   4],
       [ 54,   0,   0,   0,   4,   0,   2,   0,   1],
       [ 50,   0,   0,   1,   2,   0,   0,   0,  23]])

In [38]:
pre_recall

Unnamed: 0,Precision,Recall
Board Seats (activist group),0.19337,0.931953
others,,0.0
Remove Officer(s),,0.0
Compensation Related Enhancements,0.176471,0.021127
Social/Environmental/Political Issues,0.339806,0.472973
Add Independent Directors,,0.0
Other Governance Enhancements,0.154762,0.082278
Remove Takeover Defenses,0.0,0.0
Remove Director(s),0.302632,0.302632


### Three types classfier (lg)

In [32]:
pipeline_lr = make_pipeline(process_scale, LogisticRegression(
    max_iter=10000, verbose=0, multi_class='multinomial', solver='newton-cg'))
## Grid Search
param_grid_lr = {'logisticregression__C': np.logspace(-3, 3, 7)}
grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='accuracy')
#X_train
#y_train
grid_lr.fit(X_train2, y_train2.values.reshape(-1,))

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [250]:
print("The best parameter for Logistic is {}".format(grid_lr.best_params_))
print("Training accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_train2, y_train2),5)))
print("Testing accuracy for Logistic Regression is {}".format(round(grid_lr.score(X_test2, y_test2),5)))

The best parameter for Logistic is {'logisticregression__C': 0.1}
Training accuracy for Logistic Regression is 0.50256
Testing accuracy for Logistic Regression is 0.49892


In [251]:
def produce_confusion_matrix(x_test,y_test,model):
    pred = model.predict(x_test)
    matrix = confusion_matrix(y_test, pred, labels=y_test['governance_demand_group'].unique())
    precision = [matrix[i,i] / sum(matrix[:,i]) for i in range(matrix.shape[1])]
    recall = [matrix[i,i] / sum(matrix[i,:]) for i in range(matrix.shape[1])]
    d1 = pd.DataFrame(precision, columns=['Precision'], index = y_test['governance_demand_group'].unique())
    d2 = pd.DataFrame(recall, columns=['Recall'], index = y_test['governance_demand_group'].unique())
    precision_recall = pd.concat([d1,d2], axis=1)
    return matrix, precision_recall

In [253]:
conf, pre_recall = produce_confusion_matrix(X_test2, y_test2, grid_lr) 
conf

array([[120, 232,   4],
       [ 99, 330,  10],
       [ 36,  85,  14]])

In [254]:
pre_recall

Unnamed: 0,Precision,Recall
Board,0.470588,0.337079
operations,0.510046,0.751708
Officer_demand,0.5,0.103704


### TreeModel (random forest and XGboost)

In [24]:
## Random Forest
pipeline_rf = make_pipeline(process_not_scale, 
                            RandomForestClassifier(random_state = 1))
## Grid Search
param_grid_rf = {'randomforestclassifier__n_estimators': [100, 300],
                'randomforestclassifier__max_features': ['auto', 'sqrt']}
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=3, scoring='accuracy')

In [25]:
grid_rf.fit(X_train, y_train.values.reshape(-1,))

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [26]:
print("The best parameter for Logistic is {}".format(grid_rf.best_params_))
print("Training accuracy for Random Forest is {}".format(round(grid_rf.score(X_train, y_train),5)))
print("Testing accuracy for Random Forest is {}".format(round(grid_rf.score(X_test, y_test),5)))

The best parameter for Logistic is {'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__n_estimators': 300}
Training accuracy for Random Forest is 0.98602
Testing accuracy for Random Forest is 0.54413


### Eight Type classfiers(rf)

In [42]:
grid_rf.fit(X_train1, y_train1.values.reshape(-1,))

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [43]:
print("The best parameter for Logistic is {}".format(grid_rf.best_params_))
print("Training accuracy for Random Forest is {}".format(round(grid_rf.score(X_train1, y_train1),5)))
print("Testing accuracy for Random Forest is {}".format(round(grid_rf.score(X_test1, y_test1),5)))

The best parameter for Logistic is {'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__n_estimators': 300}
Training accuracy for Random Forest is 0.9887
Testing accuracy for Random Forest is 0.46882


In [44]:
## Random Forest
pipeline_rf = make_pipeline(process_not_scale, 
                            RandomForestClassifier(random_state = 1))
## Grid Search
param_grid_rf = {'randomforestclassifier__n_estimators': [100, 300],
                'randomforestclassifier__max_features': ['auto', 'sqrt']}
grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=3, scoring='accuracy')

### Three Type classfiers (rf)

In [27]:
grid_rf.fit(X_train2, y_train2.values.reshape(-1,))

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [28]:
print("The best parameter for Logistic is {}".format(grid_rf.best_params_))
print("Training accuracy for Random Forest is {}".format(round(grid_rf.score(X_train2, y_train2),5)))
print("Testing accuracy for Random Forest is {}".format(round(grid_rf.score(X_test2, y_test2),5)))

The best parameter for Logistic is {'randomforestclassifier__max_features': 'auto', 'randomforestclassifier__n_estimators': 300}
Training accuracy for Random Forest is 0.99139
Testing accuracy for Random Forest is 0.56882


In [29]:
def produce_confusion_matrix(x_test,y_test,model):
    pred = model.predict(x_test)
    matrix = confusion_matrix(y_test, pred, labels=y_test['governance_demand_group'].unique())
    precision = [matrix[i,i] / sum(matrix[:,i]) for i in range(matrix.shape[1])]
    recall = [matrix[i,i] / sum(matrix[i,:]) for i in range(matrix.shape[1])]
    d1 = pd.DataFrame(precision, columns=['Precision'], index = y_test['governance_demand_group'].unique())
    d2 = pd.DataFrame(recall, columns=['Recall'], index = y_test['governance_demand_group'].unique())
    precision_recall = pd.concat([d1,d2], axis=1)
    return matrix, precision_recall

In [30]:
conf, pre_recall = produce_confusion_matrix(X_test2, y_test2, grid_rf) 
conf

array([[194, 150,  12],
       [127, 291,  21],
       [ 51,  40,  44]])

In [31]:
pre_recall

Unnamed: 0,Precision,Recall
Board,0.521505,0.544944
operations,0.60499,0.66287
Officer_demand,0.571429,0.325926


### Feature importance (rf)

In [32]:
rf = RandomForestClassifier(n_estimators = 300,random_state = 1)
grid_rf.fit(X_train2, y_train2.values.reshape(-1,))

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('pipeline-1',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
  

In [33]:
importance = grid_rf.best_estimator_.get_params()["randomforestclassifier"].feature_importances_

In [34]:
importance

array([0.07615968, 0.06480036, 0.04950005, 0.07450826, 0.04562511,
       0.04580476, 0.04615895, 0.04915544, 0.017587  , 0.01869069,
       0.02006869, 0.02336824, 0.0476659 , 0.04879073, 0.04864251,
       0.05361751, 0.04919471, 0.04842317, 0.04858011, 0.04115837,
       0.00463997, 0.00178623, 0.00324482, 0.00344611, 0.00527674,
       0.00254909, 0.00575668, 0.00587515, 0.00662037, 0.00210751,
       0.00543273, 0.00336918, 0.00512996, 0.00565666, 0.00393771,
       0.00390474, 0.0040882 , 0.00544317, 0.00210835, 0.00212637])

In [35]:
Continuous_Variables = np.array(Continuous_Variables)
#Continuous_Variables

In [36]:
importance.sort()
importance[::-1]

array([0.07615968, 0.07450826, 0.06480036, 0.05361751, 0.04950005,
       0.04919471, 0.04915544, 0.04879073, 0.04864251, 0.04858011,
       0.04842317, 0.0476659 , 0.04615895, 0.04580476, 0.04562511,
       0.04115837, 0.02336824, 0.02006869, 0.01869069, 0.017587  ,
       0.00662037, 0.00587515, 0.00575668, 0.00565666, 0.00544317,
       0.00543273, 0.00527674, 0.00512996, 0.00463997, 0.0040882 ,
       0.00393771, 0.00390474, 0.00344611, 0.00336918, 0.00324482,
       0.00254909, 0.00212637, 0.00210835, 0.00210751, 0.00178623])

In [37]:
#Continuous_Variables[importance.argsort()[::-1][:10]]

In [38]:
cat_names = grid_rf.best_estimator_.named_steps['columntransformer'].transformers_[1][1]\
   .named_steps['onehotencoder'].get_feature_names(Categorical_Variables)
features_name = np.array(list(Continuous_Variables) + list(cat_names))

In [39]:
indices = np.argsort(np.array(importance))[::-1][:30]
features_name[indices]

array(['sector_Utilities', 'sector_Transportation',
       'sector_Technology Services', 'sector_Retail Trade',
       'sector_Producer Manufacturing', 'sector_Process Industries',
       'sector_Non-Energy Minerals', 'sector_Miscellaneous',
       'sector_Industrial Services', 'sector_Health Technology',
       'sector_Health Services', 'sector_Finance',
       'sector_Energy Minerals', 'sector_Electronic Technology',
       'sector_Distribution Services', 'sector_Consumer Services',
       'sector_Consumer Non-Durables', 'sector_Consumer Durables',
       'sector_Communications', 'sector_Commercial Services',
       'past_return_successes', 'pre_18m_residual_return',
       'pre_12m_residual_return', 'pre_6m_residual_return',
       'pre_3m_total_return', 'pre_6m_total_return',
       'pre_12m_total_return', 'pre_18m_total_return', 'pre_3m_dividends',
       'pre_6m_dividends'], dtype='<U30')

In [40]:
# name = ['pre_3m_stock_price', 'ltm_eps_at_announcement',
#        'pre_6m_stock_price', 'pre_12m_stock_price', 'pre_18m_stock_price',
#        'price_at_announcement', 'pre_3m_total_return',
#        'earnings_yield_at_announcement', 'pre_6m_total_return',
#        'pre_18m_total_return']

In [41]:
# value = [0.07615968, 0.07450826, 0.06480036, 0.05361751, 0.04950005]

In [42]:
# d = {'name': name, 'value': value}

In [43]:
# df = pd.DataFrame(data=d)

In [44]:
#df

In [45]:
# import seaborn as sns
# sns.set()
# sns.set(rc={'figure.figsize':(12,10)})
# #ax = plt.gca()
# ax = df.plot.bar(x='name', y='value')
# plt.title('Top 10 important features for Governance Demand')

### SVM

In [None]:
from sklearn import svm
svc = svm.SVC(kernel='rbf')
svc.fit(X_train2,y_train2)
conf, pre_recall = produce_confusion_matrix(X_test2, y_test2, svc) 
conf

In [308]:


pre_recall

Unnamed: 0,Precision,Recall
Board,0.537313,0.303371
operations,0.539755,0.8041
Officer_demand,0.666667,0.37037


### Nearest 

In [310]:
from sklearn.neighbors import NearestCentroid

In [313]:
nc = NearestCentroid()
param_grid = {'metric':['euclidean','manhattan']}
grid = GridSearchCV(nc, param_grid, cv=5)
grid.fit(X_train2, y_train2)
print("The best parameter for NearestCentroid is {}".format(grid.best_params_))
print("Training accuracy for NearestCentroid is {}".format(round(grid.score(X_train2, y_train2),5)))
print("Testing accuracy for NearestCentroid is {}".format(round(grid.score(X_test2, y_test2),5)))

The best parameter for NearestCentroid is {'metric': 'manhattan'}
Training accuracy for NearestCentroid is 0.43906
Testing accuracy for NearestCentroid is 0.46882


In [315]:
conf, pre_recall = produce_confusion_matrix(X_test2, y_test2, grid) 
conf

array([[ 91, 130, 135],
       [ 61, 258, 120],
       [ 21,  27,  87]])

In [316]:
pre_recall

Unnamed: 0,Precision,Recall
Board,0.526012,0.255618
operations,0.621687,0.587699
Officer_demand,0.254386,0.644444


### Ensemble Model

In [323]:
import warnings
warnings.filterwarnings("ignore")
X = data2.drop(columns=['governance_demand_group'])
y = data2['governance_demand_group']
X = process_not_scale.fit(X,y).transform(X)
y = pd.DataFrame(y,columns=['governance_demand_group'])
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=0, test_size=0.2)
eclf2 = VotingClassifier(estimators=[('lg', LogisticRegression(C = 0.1)), 
                                     ('rf', RandomForestClassifier(n_estimators = 300)),
                                    ('nc', NearestCentroid(metric =  'manhattan')),
                                    ('svc', svm.SVC(kernel='rbf'))], 
                         voting='hard')
eclf2 = eclf2.fit(X_train2, y_train2)
y_pred2 = eclf2.predict(X_test2)
print("Accuracy is %.2f%%" % ((np.sum(y_pred2 == np.array(y_test2).flatten()) / len(y_pred2))*100))

Accuracy is 55.38%


In [324]:
def produce_confusion_matrix(x_test,y_test,model):
    pred = model.predict(x_test)
    matrix = confusion_matrix(y_test, pred, labels=y_test['governance_demand_group'].unique())
    precision = [matrix[i,i] / sum(matrix[:,i]) for i in range(matrix.shape[1])]
    recall = [matrix[i,i] / sum(matrix[i,:]) for i in range(matrix.shape[1])]
    d1 = pd.DataFrame(precision, columns=['Precision'], index = y_test['governance_demand_group'].unique())
    d2 = pd.DataFrame(recall, columns=['Recall'], index = y_test['governance_demand_group'].unique())
    precision_recall = pd.concat([d1,d2], axis=1)
    return matrix, precision_recall

In [325]:
conf, pre_recall = produce_confusion_matrix(X_test2, y_test2, eclf2) 
conf

array([[153, 176,  27],
       [102, 304,  33],
       [ 38,  39,  58]])

In [326]:
pre_recall

Unnamed: 0,Precision,Recall
Board,0.522184,0.429775
operations,0.585742,0.692483
Officer_demand,0.491525,0.42963
