In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
sys.version

'3.5.2 (default, Nov 23 2017, 16:37:01) \n[GCC 5.4.0 20160609]'

In [2]:
import pandas as pd
import os
import copy
import numpy as np
import xgboost

from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing

from sklearn import ensemble
from sklearn import tree
from sklearn import linear_model
import pickle

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100



target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']


# Load Models

In [46]:
lineregr = pickle.load(open('models/lineregr.pkl', 'rb'))
decis_tree_regr = pickle.load(open('models/decis_tree_regr.pkl', 'rb'))
rnd_forest_regr = pickle.load(open('models/rnd_forest_regr.pkl', 'rb'))
adaboost_dt_regr = pickle.load(open('models/adaboost_dt_regr.pkl', 'rb'))
adaboost_rf_regr = pickle.load(open('models/adaboost_rf_regr.pkl', 'rb'))
gbr = pickle.load(open('models/gbr.pkl', 'rb'))
xgb = pickle.load(open('models/xgb.pkl', 'rb'))

In [47]:
le = pickle.load(open('models/le.pkl', 'rb'))

# Load Test

In [48]:
x_test = pd.read_csv('data/x_test_unique_news.csv')
x_test = x_test.set_index('Date')
x_test.head(1)

Horizon = 'Close(t+1)'
y_test = x_test[[Horizon]]
x_test = x_test.drop(['Close(t+1)'], axis=1).copy()
x_test.shape, x_test.shape

close_t = np.reshape(x_test['Close(t)'].values, (-1, 1))
changes = y_test.values - close_t
y_direction = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

y_true = np.concatenate((y_test, y_direction), axis=1)
df_true = pd.DataFrame.from_records(y_true).round(2)
df_true.columns = ['actual', 'ditection']
df_true.head(1)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,490,491,492,493,494,495,496,497,498,499
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-27,2,19.2,19.6,19.8,19.4,19.6,20.1,20.1,19.6,19.6,...,0.03908,0.0,0.0,0.043692,0.0,0.0,0.0,0.028074,0.122086,0.0


((503, 517), (503, 517))

Unnamed: 0,actual,ditection
0,19.2,0.0


#  Prediction

In [49]:
y_pred = np.concatenate((lineregr.predict(x_test).reshape(-1,1),
                         decis_tree_regr.predict(x_test).reshape(-1,1),
                         rnd_forest_regr.predict(x_test).reshape(-1,1),
                         adaboost_dt_regr.predict(x_test).reshape(-1,1),
                         adaboost_rf_regr.predict(x_test).reshape(-1,1),
                         gbr.predict(x_test).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix(x_test)).reshape(-1,1)), axis=1)

df_pred = pd.DataFrame.from_records(y_pred).round(2)
df_pred.columns = ['Linear', 'DT', 'RF', 'Ada_DT', 'Ada_RF', 'GB', 'XGB']
df_pred.head()

Unnamed: 0,Linear,DT,RF,Ada_DT,Ada_RF,GB,XGB
0,21.01,19.7,19.42,19.6,20.07,25.26,19.65
1,21.48,20.0,19.55,19.6,19.6,25.14,19.59
2,18.91,19.8,19.07,19.3,19.25,24.94,19.1
3,22.19,19.7,19.62,19.8,19.74,25.29,19.71
4,21.23,20.5,20.53,20.5,20.7,26.18,20.66


In [41]:
stack = ensemble.RandomForestRegressor(n_jobs=-1, max_depth=None, n_estimators=10)
stack.fit(df_pred.values, y_test.values)
feature_importances = stack.feature_importances_

y_new_pred = stack.predict(df_pred.values).reshape(-1,1)

RMSE = np.sqrt(mean_squared_error(y_test, y_new_pred))
MAE = mean_absolute_error(y_test, y_new_pred)
MAPE = mean_absolute_percentage_error(y_test, y_new_pred)

print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))

for i, clf in enumerate(df_pred):
    print(clf, '\t', round(feature_importances[i],4))


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

RMSE: 1.13 	MAE: 0.58 	MAPE: 0.67
Linear 	 0.0202
DT 	 0.1615
RF 	 0.0826
Ada_DT 	 0.2297
Ada_RF 	 0.2634
GB 	 0.0391
XGB 	 0.2036


# Direction

In [42]:
y_changes = np.concatenate((np.array(y_pred[:,0]).reshape(-1,1)-close_t,
                            np.array(y_pred[:,1]).reshape(-1,1)-close_t,
                            np.array(y_pred[:,2]).reshape(-1,1)-close_t,
                            np.array(y_pred[:,3]).reshape(-1,1)-close_t,
                            np.array(y_pred[:,4]).reshape(-1,1)-close_t,
                            np.array(y_pred[:,5]).reshape(-1,1)-close_t,
                           np.array(y_pred[:,6]).reshape(-1,1)-close_t,
                           ), axis=1)
y_pred_dir = []
for row in y_changes:
    tmp_row = []

    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    y_pred_dir.append(tmp_row)
    
df_pred_dir = pd.DataFrame.from_records(y_pred_dir)
df_pred_dir.columns = ['Linear', 'DT', 'RF', 'Ada_DT', 'Ada_RF', 'GB', 'XGB']
df_pred_dir.head()

Unnamed: 0,Linear,DT,RF,Ada_DT,Ada_RF,GB,XGB
0,1,1,0,1,1,1,1
1,1,1,1,1,1,1,1
2,0,1,0,1,1,1,0
3,1,1,1,1,1,1,1
4,1,0,0,0,0,1,0


# Evaluate Individual Model

In [50]:
for clf in df_pred:
    y_tmp = copy.deepcopy(df_pred[clf].values.reshape(-1,1))
    RMSE = np.sqrt(mean_squared_error(y_test, y_tmp))
    MAE = mean_absolute_error(y_test, y_tmp)
    MAPE = mean_absolute_percentage_error(y_test, y_tmp)

    DA = accuracy_score(y_direction, df_pred_dir[clf])
    print(clf, "\tRMSE: %.2f \tMAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))

Linear 	RMSE: 2.94 	MAE: 2.20 	MAPE: 6.97 	DA: 0.49
DT 	RMSE: 4.83 	MAE: 2.46 	MAPE: 2.32 	DA: 0.56
RF 	RMSE: 3.39 	MAE: 1.71 	MAPE: 1.86 	DA: 0.55
Ada_DT 	RMSE: 2.98 	MAE: 1.55 	MAPE: 1.55 	DA: 0.57
Ada_RF 	RMSE: 2.87 	MAE: 1.59 	MAPE: 1.86 	DA: 0.55
GB 	RMSE: 8.35 	MAE: 6.48 	MAPE: 23.29 	DA: 0.52
XGB 	RMSE: 2.78 	MAE: 1.49 	MAPE: 1.50 	DA: 0.56


# Ensenble Voting and Weighted Average

In [44]:
df_pred['weight'] = (df_pred['Linear']*feature_importances[0] + 
                     df_pred['DT']*feature_importances[1] + 
                     df_pred['RF']*feature_importances[2] + 
                     df_pred['Ada_DT']*feature_importances[3] + 
                     df_pred['Ada_RF']*feature_importances[4] + 
                     df_pred['GB']*feature_importances[5] + 
                     df_pred['XGB']*feature_importances[6])

df_pred_dir['vote'] = (df_pred_dir['Linear'] + df_pred_dir['DT'] + df_pred_dir['RF'] + df_pred_dir['Ada_DT'] + df_pred_dir['Ada_RF'] + df_pred_dir['GB'] + df_pred_dir['XGB'])/7
df_pred_dir.vote = df_pred_dir.vote.round(0)

y_pred = copy.deepcopy(df_pred['weight'].values.reshape(-1,1))
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
MAE = mean_absolute_error(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)

DA = accuracy_score(y_direction, df_pred_dir['vote'].values.reshape(-1,1))
print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))

RMSE: 2.78 	MAE: 1.57 	MAPE: 2.05 	DA: 0.58


In [45]:
df_pred['weight'] = (df_pred['Linear'] + df_pred['DT'] + df_pred['RF'] + df_pred['Ada_DT'] + df_pred['Ada_RF'] + df_pred['GB'] + df_pred['XGB'])/7

df_pred_dir['vote'] = (df_pred_dir['Linear'] + df_pred_dir['DT'] + df_pred_dir['RF'] + df_pred_dir['Ada_DT'] + df_pred_dir['Ada_RF'] + df_pred_dir['GB'] + df_pred_dir['XGB'])/7
df_pred_dir.vote = df_pred_dir.vote.round(0)

y_pred = copy.deepcopy(df_pred['weight'].values.reshape(-1,1))
RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
MAE = mean_absolute_error(y_test, y_pred)
MAPE = mean_absolute_percentage_error(y_test, y_pred)

DA = accuracy_score(y_direction, df_pred_dir['vote'].values.reshape(-1,1))
print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))

RMSE: 2.97 	MAE: 1.86 	MAPE: 4.16 	DA: 0.58
