In [1]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.tree import export_graphviz
import math
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict
from datetime import datetime, timedelta
from operator import itemgetter

In [3]:
X = pd.read_csv("X.csv")
Y = pd.read_csv("Y.csv", header=None)
Y = Y.values.flatten()
n = X.shape[0]
feature_list = ['total_assets', 'market_value_of_assets', 'tobinq', 'ppe', 'profitability', 'leverage', 'industry', 'pct_of_cash', 'pct_of_stocks']
time_dep_feature_list = ['total_assets', 'market_value_of_assets', 'tobinq', 'ppe', 'profitability', 'leverage'] # This keeps time-dependent features
special_feature_list = ['industry'] # This one keeps only categorical or time-independent features from Compustat
deal_feature_list = ['pct_of_cash', 'pct_of_stocks']

################################################
#### Default Run of Random Forest Algorithm ####
################################################
regr = RandomForestRegressor(random_state=0)
regr.fit(X, Y)   


# In[17]:

# This is our score vector for each feature
score_vec = regr.feature_importances_

# Get the name of n features with largest scores
def get_max_score_features(score_vec, column_names, n):
    """
    output: a dictionary with key being feature names and value being corresponding feature scores
    """
    max_ind = np.argsort(-score_vec)[0:n]
    out = OrderedDict()
    for ind in max_ind:
        out[column_names[ind]] = score_vec[ind]
    return out

score_dict = get_max_score_features(score_vec, X.columns, len(score_vec))
score_dict


# In[18]:

#####################################
#### Output Analysis Begins Here ####
#####################################

### Graph the Distribution of Scores ###
plt.hist(score_vec, bins = 'auto')
plt.xlabel('Score')
plt.title('Distribution of Feature Scores')
plt.show()
# Graph is highly skewed to the right

### Calculate Mean Score of Each Year Period Prior to Deal ###
# Recall that 3 refers to the year period 3 years prior to the deal,
# 2 refers to the year period 2 years prior to the deal, and 1 to just
# the year preceding the deal.

def period_mean_score(score_dict, num_yrs):
    out = OrderedDict()
    for i in range(num_yrs):
        search_string = '_' + str(i+1) + '_'
        out[i + 1] = np.mean([value for key, value in score_dict.items() if search_string in key.lower()])
    return out

### Calculate Mean Score of Each Feature ###
def feature_mean_score(score_dict, feature_list):
    feature_mean_score = {}
    for feature in feature_list:
        one_score = np.mean([value for key, value in score_dict.items() if feature in key.lower()])
        feature_mean_score[feature] = one_score
    out = OrderedDict(sorted(feature_mean_score.items(), key=itemgetter(1), reverse = True))
    return out

# Observe that market value of assets has a notably high score
feature_mean_score(score_dict, feature_list)

OrderedDict([('market_value_of_assets', 0.014749970894138835),
             ('profitability', 0.0084570677654764864),
             ('industry', 0.0056706321485244472),
             ('tobinq', 0.0051939753614112976),
             ('leverage', 0.004872540821724558),
             ('total_assets', 0.0016239030556651367),
             ('ppe', 0.0016109977754307501),
             ('pct_of_stocks', 0.00014295432251064478),
             ('pct_of_cash', 9.9452122296868903e-05)])

In [89]:
list(X)

['acq_total_assets_3_Q1',
 'acq_total_assets_3_Q2',
 'acq_total_assets_3_Q3',
 'acq_total_assets_3_Q4',
 'acq_total_assets_2_Q1',
 'acq_total_assets_2_Q2',
 'acq_total_assets_2_Q3',
 'acq_total_assets_2_Q4',
 'acq_total_assets_1_Q1',
 'acq_total_assets_1_Q2',
 'acq_total_assets_1_Q3',
 'acq_total_assets_1_Q4',
 'acq_market_value_of_assets_3_Q1',
 'acq_market_value_of_assets_3_Q2',
 'acq_market_value_of_assets_3_Q3',
 'acq_market_value_of_assets_3_Q4',
 'acq_market_value_of_assets_2_Q1',
 'acq_market_value_of_assets_2_Q2',
 'acq_market_value_of_assets_2_Q3',
 'acq_market_value_of_assets_2_Q4',
 'acq_market_value_of_assets_1_Q1',
 'acq_market_value_of_assets_1_Q2',
 'acq_market_value_of_assets_1_Q3',
 'acq_market_value_of_assets_1_Q4',
 'acq_tobinq_3_Q1',
 'acq_tobinq_3_Q2',
 'acq_tobinq_3_Q3',
 'acq_tobinq_3_Q4',
 'acq_tobinq_2_Q1',
 'acq_tobinq_2_Q2',
 'acq_tobinq_2_Q3',
 'acq_tobinq_2_Q4',
 'acq_tobinq_1_Q1',
 'acq_tobinq_1_Q2',
 'acq_tobinq_1_Q3',
 'acq_tobinq_1_Q4',
 'acq_ppe_3_Q1',

In [63]:
regr = linear_model.LinearRegression()
regr.fit(X, Y)

# Make predictions
Y_pred = regr.predict(X)

# Compute some statistics
res = Y - Y_pred
#standardized residuals
sig_hat = sqrt(sum(res**2)/(n-p-1))
h = X.dot(inv(X.transpose().dot(X))).dot(X.transpose())
h_diag = np.diag(h)
std_res = res/(sig_hat*np.sqrt(np.ones(len(h_diag))-h_diag))

# #predicted residuals
# pred_res = res/(1-h_diag)
# #RSS[i]
# RSS = sum(res^2)
# RSS_i = RSS - pred_res*res
# #standardized predicted residuals
# std_pred_res = (pred_res*sqrt(1 - h_diag))/sqrt(RSS_i/(n-p-2))
# #Cook's distance
# cook = std_res^2/(p+1)*h_diag/(1-h_diag)

In [85]:
X.drop(columns=['acq_industry_0', 'tar_industry_0'])

TypeError: drop() got an unexpected keyword argument 'columns'

In [86]:
type(X)

pandas.core.frame.DataFrame