In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor



In [2]:
df = pd.read_pickle('datasets/features-label-text-represented-subset.pkl')
df_copy = df.copy()

In [3]:
df_copy.shape[0]

43290

In [9]:
df_copy.head()

Unnamed: 0,CIK,file_id,year,roe,opinc,nopinc,token_count,roe_next_year,tone,text_vector
0,1750,3,2005,0.095362,0.131652,-0.015174,2120,0.127945,-0.000472,"[0.90494365, 0.074863456, -0.22782834, 0.33961..."
1,1750,4,2006,0.127945,0.149182,-0.022282,3219,0.13922,0.0,"[0.9111878, 0.08885506, -0.31519878, 0.4127013..."
2,1750,7,2009,0.063607,0.105913,-0.030043,3364,0.088296,0.0,"[0.7394947, 0.017999234, -0.41983142, 0.052898..."
3,1750,8,2010,0.088296,0.136418,-0.040048,3045,0.079613,-0.000328,"[0.9311413, 0.0732959, -0.42346224, 0.16167574..."
4,1750,9,2011,0.079613,0.138888,-0.026679,2503,0.061607,-0.000799,"[0.8763539, 0.0571177, -0.4936092, 0.20794152,..."


Retrieve train and test file_ids

In [4]:
train_ids = pd.read_csv('train-test-ids/train-ids.csv')
test_ids = pd.read_csv('train-test-ids/test-ids.csv')

train_data = df_copy[df_copy['file_id'].isin(train_ids['file_id'].astype(str))]
test_data = df_copy[df_copy['file_id'].isin(test_ids['file_id'].astype(str))]

In [5]:
train_data.shape

(32113, 10)

In [6]:
test_data.shape

(11177, 10)

In [8]:
def get_test_mse(model, test_features, test_label):
    test_pred = model.predict(test_features)
    return mean_squared_error(test_label, test_pred)

# Part 4: ROE + tone + word embedding

Model1.3 ROE_t, tone_t, text_vector_t -> ROE_t+1

In [11]:
# features concatenation
train_features_m1_3 = np.concatenate((np.array(train_data['text_vector'].tolist()), train_data['roe'].values.reshape(-1, 1), train_data['tone'].values.reshape(-1, 1)), axis=1)
train_label_m1_3 = train_data['roe_next_year'].values

test_features_m1_3 = np.concatenate((np.array(test_data['text_vector'].tolist()), test_data['roe'].values.reshape(-1, 1), test_data['tone'].values.reshape(-1, 1)), axis=1)
test_label_m1_3 = test_data['roe_next_year'].values

In [14]:
# normalisation
train_features_m1_3_norm = (train_features_m1_3 - np.mean(train_features_m1_3, axis=0)) / np.std(train_features_m1_3, axis=0)
test_features_m1_3_norm = (test_features_m1_3 - np.mean(test_features_m1_3, axis=0)) / np.std(test_features_m1_3, axis=0)

Model1.3 Linear regression

In [15]:
model1_3_lr = LinearRegression()
model1_3_lr.fit(train_features_m1_3_norm, train_label_m1_3)

In [16]:
model1_3_lr_mse = get_test_mse(
    model1_3_lr,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_lr_mse

0.03912976256587879

Model1.3 Ridge regression

In [31]:
model1_3_ridge = RidgeCV(alphas=[1e-1, 1, 3, 5, 10, 20, 30, 40, 50, 100])
model1_3_ridge.fit(train_features_m1_3_norm, train_label_m1_3)

In [32]:
model1_3_ridge.alpha_

100.0

In [33]:
model1_3_ridge_mse = get_test_mse(
    model1_3_ridge,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_ridge_mse

0.0391445704232656

Model1.3 Decision tree

In [34]:
# with trial hyperparameters
model1_3_dt = DecisionTreeRegressor(max_depth=3)
model1_3_dt.fit(train_features_m1_3_norm, train_label_m1_3)

In [35]:
model1_3_dt_mse = get_test_mse(
    model1_3_dt,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_dt_mse

0.039247112551174065

In [36]:
dt_params = {
    'max_depth': [2, 3, 5, 10, 20],
    'min_samples_leaf': [10, 20, 50, 100, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'splitter': ['best', 'random']
}

model1_3_dt_tune = GridSearchCV(estimator=model1_3_dt,
                                param_grid=dt_params,
                                n_jobs=-1)
model1_3_dt_tune.fit(train_features_m1_3_norm, train_label_m1_3)





In [37]:
model1_3_dt_tune.best_params_

{'max_depth': 5,
 'max_features': 'auto',
 'min_samples_leaf': 50,
 'splitter': 'best'}

In [38]:
model1_3_dt_best = model1_3_dt_tune.best_estimator_
model1_3_dt_best_mse = get_test_mse(
    model1_3_dt_best,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_dt_best_mse

0.03832947306114129

Model1.3 Gradient boosting

In [39]:
model1_3_gb = GradientBoostingRegressor()

In [40]:
model1_3_gb.fit(train_features_m1_3_norm, train_label_m1_3)

In [42]:
model1_3_gb_mse = get_test_mse(
    model1_3_gb,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_gb_mse

0.037255816723241145

In [43]:
gb_params = {
    'n_estimators': [50, 100, 150],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [3, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20, 50, 100],
    'min_samples_leaf': [1, 5, 10, 20, 50, 100],
    'learning_rate': [0.1, 0.01, 0.001]
}

# use RandomizedSearchCV for save computation
model1_3_gb_tune = RandomizedSearchCV(estimator=model1_3_gb,
                                      param_distributions=gb_params,
                                      random_state=42,
                                      n_iter=100,
                                      n_jobs=-1)
model1_3_gb_tune.fit(train_features_m1_3_norm, train_label_m1_3)



In [44]:
model1_3_gb_tune.best_params_

{'n_estimators': 150,
 'min_samples_split': 10,
 'min_samples_leaf': 100,
 'max_features': 'sqrt',
 'max_depth': 5,
 'learning_rate': 0.1}

In [45]:
model1_3_gb_best = model1_3_gb_tune.best_estimator_
model1_3_gb_best_mse = get_test_mse(
    model1_3_gb_best,
    test_features_m1_3_norm,
    test_label_m1_3
)
model1_3_gb_best_mse

0.037170629842238