# Regression with news (all)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
sys.version

import pandas as pd
import os
import copy
import numpy as np
import xgboost
import pickle


from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing

from sklearn.linear_model import Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly import tools
import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(3)
len(df_price)

df_kaohoon = pd.read_csv('data/kaohoon_all.csv')
df_kaohoon['Date'] = pd.to_datetime(df_kaohoon['Date'], format='%Y-%m-%d')
df_kaohoon = df_kaohoon.set_index('Date')
df_kaohoon = df_kaohoon[:'2018-2-8']
df_kaohoon.index = df_kaohoon.index.date
df_kaohoon.tail(3)
len(df_kaohoon)

df_moneych = pd.read_csv('data/moneychanel_all.csv')
df_moneych['Date'] = pd.to_datetime(df_moneych['Date'], format='%Y-%m-%d')
df_moneych = df_moneych.set_index('Date')
df_moneych = df_moneych[:'2018-2-8']
df_moneych.index = df_moneych.index.date
df_moneych.tail(3)
len(df_moneych)

df_news = pd.concat([df_moneych, df_kaohoon])
df_news = df_news.sort_index()

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-16,THAI,15.6,15.9,15.6,15.6,2907700
2018-02-16,TRUE,6.4,6.4,6.3,6.3,35851300
2018-02-16,TU,20.0,20.5,20.0,20.1,7299300


26331

Unnamed: 0,Ticker,Text
2018-02-08,TU,หลักทรัพย์ ล่าสุด เปลี่ยนแปลง เปลี่ยนแปลง ปริม...
2018-02-08,BANPU,ผู้สื่อข่าว รายงาน ว่า ดัชนี ปิด ตลาด วันนี้ ท...
2018-02-08,PTT,ผู้สื่อข่าว รายงาน ว่า ดัชนี ปิด ตลาด วันนี้ ท...


118355

Unnamed: 0,Ticker,Text
2018-02-07,HMPRO,เปิด ตลาด ภาค เช้า พุธ ก พรีบาวด์ ตาม สหรัฐฯ แ...
2018-02-08,PTT,ตลาดหุ้น ไทย เปิด บวก แรง ซื้อ หุ้น นำ ตลาด ผส...
2018-02-08,IRPC,เปิด ตลาด ภาค เช้า พฤหัสฯ กพ วอ ลุ่ม ขาย มุ่ง ...


5084

# Lag & Horizon Construction

In [5]:
N_lags = 3
N_horizon = 1

df_train = []
df_test = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_news.loc[df_news['Ticker'] == stock]
    prev_date = None
    prev_text = None
    
#     pbar = tqdm_notebook(total=len(df_stock))
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags <= N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']
        
#         pbar.update(1)
#     pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
                           'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.95)
    test_size = len(news_stocks) - train_size
    train, test = news_stocks.iloc[:train_size], news_stocks.iloc[train_size:]
    print(stock, ':\t',len(train), len(test))    
    df_train.append(train)
    df_test.append(test)
    

df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)

BANPU :	 860 46
IRPC :	 868 46
PTT :	 882 47
BBL :	 877 47
KBANK :	 888 47
SCB :	 890 47
AOT :	 876 47
THAI :	 840 45
CPF :	 868 46
MINT :	 873 46
TU :	 669 36
SCC :	 868 46
CPN :	 862 46
CK :	 852 45
CPALL :	 861 46
HMPRO :	 856 46
BDMS :	 827 44
BH :	 848 45
ADVANC :	 882 47
JAS :	 793 42
TRUE :	 866 46



In [6]:
df_train.to_csv('data/df_train_news_all.csv')
df_test.to_csv('data/df_test_news_all.csv')

df_train = pd.read_csv('data/df_train_news_all.csv')
df_test = pd.read_csv('data/df_test_news_all.csv')

df_train = df_train.set_index('Date')
df_test = df_test.set_index('Date')

len(df_train), len(df_test) 

df_train.head(2)
df_test.head(2)

(17906, 953)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2015-01-16,BANPU,ใน ช่วง ท้าย ปี และ ต้นปี เรา จะ ได้ยิน คา ว่า...,23.9,23.9,24.0,23.8,23.9,24.4,24.4,23.9,23.9,24.6,24.7,24.3,24.3,24.9,24.9,24.5,24.6
2015-01-19,BANPU,หลักทรัพย์ ล่าสุด เปลี่ยนแปลง เปลี่ยนแปลง ปริม...,24.0,24.0,24.0,23.7,23.9,23.9,24.0,23.8,23.9,24.4,24.4,23.9,23.9,24.6,24.7,24.3,24.3


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-12-18,BANPU,หลักทรัพย์ จำนวน รายการ ปริมาณ หุ้น มูลค่า บาท...,18.7,17.6,17.7,17.5,17.7,17.3,17.7,17.3,17.5,17.3,17.3,17.1,17.2,17.1,17.4,17.0,17.2
2017-12-19,BANPU,ผู้สื่อข่าว รายงาน ว่า บริษัท บ้าน ปู จำกัด มห...,18.7,17.8,18.8,17.7,18.7,17.6,17.7,17.5,17.7,17.3,17.7,17.3,17.5,17.3,17.3,17.1,17.2


# TF-IDF Vetorization

In [7]:
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

vertorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_df=0.9, 
                             min_df=2, 
                             max_features=1000)

tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])

df_tfidf_train = pd.DataFrame.from_records(tfidf_train.toarray())
df_tfidf_test = pd.DataFrame.from_records(tfidf_test.toarray())

df_tfidf_train = df_tfidf_train.set_index(df_train.index)
df_tfidf_test = df_tfidf_test.set_index(df_test.index)

len(df_tfidf_train), len(df_tfidf_test)

# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_train = pd.concat([x_train, df_tfidf_train], axis=1)

x_test = df_test.drop(['Text'], axis=1)
x_test = pd.concat([x_test, df_tfidf_test], axis=1)

# Label Encoding
le = preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_train.head(2)
x_test.head(2)
le.classes_

(17906, 953)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,990,991,992,993,994,995,996,997,998,999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-16,2,23.9,23.9,24.0,23.8,23.9,24.4,24.4,23.9,23.9,...,0.017004,0.0,0.0,0.021164,0.0,0.028626,0.0,0.0,0.072264,0.0
2015-01-19,2,24.0,24.0,24.0,23.7,23.9,23.9,24.0,23.8,23.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,990,991,992,993,994,995,996,997,998,999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-18,2,18.7,17.6,17.7,17.5,17.7,17.3,17.7,17.3,17.5,...,0.035358,0.0,0.0,0.220045,0.035818,0.0,0.04259,0.0,0.0,0.0
2017-12-19,2,18.7,17.8,18.8,17.7,18.7,17.6,17.7,17.5,17.7,...,0.047471,0.0,0.0,0.059086,0.0,0.02664,0.0,0.0,0.033625,0.0


array(['ADVANC', 'AOT', 'BANPU', 'BBL', 'BDMS', 'BH', 'CK', 'CPALL',
       'CPF', 'CPN', 'HMPRO', 'IRPC', 'JAS', 'KBANK', 'MINT', 'PTT',
       'SCB', 'SCC', 'THAI', 'TRUE', 'TU'], dtype=object)

# Create x_train and y_train

In [8]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

((17906, 1017), (17906, 1))

# Evaluate Each Stcok

In [9]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'].values,(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

In [10]:
def ensemble_evaluator(bagging, ada_dt, ada_rf,  xgb, stack, stack_da, df_test, le, feature_importances, feature_importances_da):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        # Directional Accuracy
        changes = x_tmp[Horizon] - x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
        
        y_tmp = x_tmp[Horizon].values.reshape(-1,1)
        x_tmp = x_tmp.drop([Horizon], axis=1)        
        
        # Prediction
        y_pred = np.concatenate((
                                 bagging.predict(x_tmp).reshape(-1,1),
                                 ada_dt.predict(x_tmp).reshape(-1,1),
                                 ada_rf.predict(x_tmp).reshape(-1,1),
                                 xgb.predict(xgboost.DMatrix(x_tmp)).reshape(-1,1)), 
                                axis=1)
        
        df_pred = pd.DataFrame.from_records(y_pred).round(2)
        df_pred.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred.head()

        # Directional Accuracy Pred
        close_t = np.reshape(x_tmp['Close(t)'].values, (-1, 1))
        y_changes = np.concatenate((
                                    np.array(y_pred[:,0]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,1]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,2]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,3]).reshape(-1,1)-close_t,
                                   ), axis=1)
        y_pred_da = []
        for row in y_changes:
            tmp_row = []
            for change in row:
                tmp_row.append(1 if change>=0 else 0)
            y_pred_da.append(tmp_row)

        df_pred_da = pd.DataFrame.from_records(y_pred_da)
        df_pred_da.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred_da.head()
        
        
        
        df_pred['weight'] = (
                     df_pred['Bagging_DT']*feature_importances[0] + 
                     df_pred['Ada_DT']*feature_importances[1] + 
                     df_pred['Ada_RF']*feature_importances[2] + 
                     df_pred['XGB']*feature_importances[3]
        )
        
        df_pred_da['vote'] = (
                     df_pred_da['Bagging_DT']*feature_importances_da[0] + 
                     df_pred_da['Ada_DT']*feature_importances_da[1] + 
                     df_pred_da['Ada_RF']*feature_importances_da[2] + 
                     df_pred_da['XGB']*feature_importances_da[3]
        ).round(0).astype(int)

        y_pred = stack.predict(y_pred).reshape(-1,1)
        y_pred_da = stack_da.predict(y_pred_da).reshape(-1,1).round(0).astype(int)
#         y_pred = df_pred['weight'].values.reshape(-1,1)
#         y_pred_da = df_pred_da['vote'].values.reshape(-1,1)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [112]:
ensemble_evaluator( 
                   bagging,
                   adaboost_dt_regr, 
                   adaboost_rf_regr, 
                   xgb,
                   stack, stack_da,
                   x_test, le, feature_importances, feature_importances_da)

BANPU 	RMSE: 0.59	 MAE: 0.43 	MAPE: 2.03 	DA: 0.43
IRPC 	RMSE: 0.14	 MAE: 0.12 	MAPE: 1.60 	DA: 0.50
PTT 	RMSE: 7.40	 MAE: 5.61 	MAPE: 1.18 	DA: 0.53
BBL 	RMSE: 2.59	 MAE: 2.06 	MAPE: 0.98 	DA: 0.47
KBANK 	RMSE: 3.30	 MAE: 2.62 	MAPE: 1.13 	DA: 0.49
SCB 	RMSE: 2.15	 MAE: 1.69 	MAPE: 1.09 	DA: 0.53
AOT 	RMSE: 1.40	 MAE: 1.02 	MAPE: 1.45 	DA: 0.49
THAI 	RMSE: 0.28	 MAE: 0.22 	MAPE: 1.33 	DA: 0.51
CPF 	RMSE: 0.30	 MAE: 0.24 	MAPE: 0.99 	DA: 0.43
MINT 	RMSE: 0.61	 MAE: 0.45 	MAPE: 1.04 	DA: 0.67
TU 	RMSE: 0.36	 MAE: 0.25 	MAPE: 1.24 	DA: 0.53
SCC 	RMSE: 4.54	 MAE: 3.99 	MAPE: 0.81 	DA: 0.59
CPN 	RMSE: 1.49	 MAE: 1.21 	MAPE: 1.43 	DA: 0.52
CK 	RMSE: 0.40	 MAE: 0.30 	MAPE: 1.10 	DA: 0.64
CPALL 	RMSE: 1.27	 MAE: 0.99 	MAPE: 1.27 	DA: 0.43
HMPRO 	RMSE: 0.35	 MAE: 0.28 	MAPE: 2.01 	DA: 0.52
BDMS 	RMSE: 0.29	 MAE: 0.21 	MAPE: 0.96 	DA: 0.52
BH 	RMSE: 3.31	 MAE: 2.47 	MAPE: 1.27 	DA: 0.51
ADVANC 	RMSE: 1.91	 MAE: 1.48 	MAPE: 0.77 	DA: 0.55
JAS 	RMSE: 0.20	 MAE: 0.15 	MAPE: 2.05 	DA: 0.40
TRUE 	RM

# Linear Regression

In [11]:
from sklearn import linear_model

lineregr = linear_model.LinearRegression()
lineregr.fit(x_train, y_train)

evaluator(lineregr, x_test, le)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 1.00	 MAE: 0.74 	MAPE: 8.20 	DA: 0.59
IRPC 	RMSE: 0.91	 MAE: 0.73 	MAPE: 10.84 	DA: 0.52
PTT 	RMSE: 6.44	 MAE: 4.66 	MAPE: 4.94 	DA: 0.66
BBL 	RMSE: 1.76	 MAE: 1.47 	MAPE: 1.88 	DA: 0.57
KBANK 	RMSE: 3.48	 MAE: 2.70 	MAPE: 2.46 	DA: 0.43
SCB 	RMSE: 2.08	 MAE: 1.64 	MAPE: 3.00 	DA: 0.57
AOT 	RMSE: 1.51	 MAE: 1.16 	MAPE: 4.24 	DA: 0.55
THAI 	RMSE: 0.73	 MAE: 0.50 	MAPE: 4.97 	DA: 0.56
CPF 	RMSE: 0.88	 MAE: 0.63 	MAPE: 3.50 	DA: 0.54
MINT 	RMSE: 1.09	 MAE: 0.78 	MAPE: 2.47 	DA: 0.61
TU 	RMSE: 1.02	 MAE: 0.78 	MAPE: 4.55 	DA: 0.53
SCC 	RMSE: 4.42	 MAE: 3.53 	MAPE: 1.61 	DA: 0.63
CPN 	RMSE: 1.45	 MAE: 1.15 	MAPE: 2.25 	DA: 0.50
CK 	RMSE: 1.03	 MAE: 0.72 	MAPE: 3.73 	DA: 0.64
CPALL 	RMSE: 1.23	 MAE: 0.95 	MAPE: 2.38 	DA: 0.65
HMPRO 	RMSE: 1.05	 MAE: 0.75 	MAPE: 7.29 	DA: 0.72
BDMS 	RMSE: 0.95	 MAE: 0.65 	MAPE: 3.69 	DA: 0.55
BH 	RMSE: 2.94	 MAE: 2.27 	MAPE: 1.88 	DA: 0.58
ADVANC 	RMSE: 2.20	 MAE: 1.73 	MAPE: 2.12 	DA: 0.47
JAS 	RMSE: 0.95	 MAE: 0.66 	MAPE: 11.27 	DA: 0.60
TRUE 	

# Support Vector Regressor

In [None]:
# from sklearn.svm import SVR
# svr = SVR()
# svr.fit(x_train, y_train)

# evaluator(svr, x_test, le)

# Decistion Tree Regressor

In [12]:
from sklearn import tree

decis_tree_regr = tree.DecisionTreeRegressor(max_depth=10)
decis_tree_regr.fit(x_train, y_train.values.ravel())
evaluator(decis_tree_regr, x_test, le)

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

BANPU 	RMSE: 0.72	 MAE: 0.52 	MAPE: 2.47 	DA: 0.37
IRPC 	RMSE: 0.15	 MAE: 0.12 	MAPE: 1.70 	DA: 0.48
PTT 	RMSE: 10.92	 MAE: 8.81 	MAPE: 1.87 	DA: 0.47
BBL 	RMSE: 3.82	 MAE: 3.01 	MAPE: 1.44 	DA: 0.45
KBANK 	RMSE: 4.86	 MAE: 3.86 	MAPE: 1.66 	DA: 0.49
SCB 	RMSE: 2.64	 MAE: 1.88 	MAPE: 1.22 	DA: 0.55
AOT 	RMSE: 1.66	 MAE: 1.22 	MAPE: 1.76 	DA: 0.47
THAI 	RMSE: 0.28	 MAE: 0.23 	MAPE: 1.40 	DA: 0.49
CPF 	RMSE: 0.29	 MAE: 0.21 	MAPE: 0.87 	DA: 0.54
MINT 	RMSE: 0.80	 MAE: 0.60 	MAPE: 1.38 	DA: 0.61
TU 	RMSE: 0.34	 MAE: 0.24 	MAPE: 1.19 	DA: 0.69
SCC 	RMSE: 5.68	 MAE: 4.47 	MAPE: 0.91 	DA: 0.52
CPN 	RMSE: 1.46	 MAE: 1.23 	MAPE: 1.47 	DA: 0.48
CK 	RMSE: 0.45	 MAE: 0.31 	MAPE: 1.16 	DA: 0.67
CPALL 	RMSE: 2.62	 MAE: 2.10 	MAPE: 2.68 	DA: 0.41
HMPRO 	RMSE: 0.37	 MAE: 0.28 	MAPE: 2.05 	DA: 0.41
BDMS 	RMSE: 0.26	 MAE: 0.20 	MAPE: 0.93 	DA: 0.43
BH 	RMSE: 4.37	 MAE: 3.08 	MAPE: 1.57 	DA: 0.40
ADVANC 	RMSE: 3.79	 MAE: 2.42 	MAPE: 1.26 	DA: 0.49
JAS 	RMSE: 0.21	 MAE: 0.14 	MAPE: 1.87 	DA: 0.50
TRUE 	R

# Random Forest Regrssor

In [13]:
from sklearn import ensemble

rnd_forest_regr = ensemble.RandomForestRegressor(n_jobs=-1)
rnd_forest_regr.fit(x_train, y_train.values.ravel())

evaluator(rnd_forest_regr, x_test, le)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

BANPU 	RMSE: 0.52	 MAE: 0.37 	MAPE: 1.75 	DA: 0.59
IRPC 	RMSE: 0.14	 MAE: 0.12 	MAPE: 1.61 	DA: 0.43
PTT 	RMSE: 6.96	 MAE: 5.57 	MAPE: 1.18 	DA: 0.57
BBL 	RMSE: 2.19	 MAE: 1.62 	MAPE: 0.78 	DA: 0.57
KBANK 	RMSE: 3.61	 MAE: 2.80 	MAPE: 1.21 	DA: 0.51
SCB 	RMSE: 2.13	 MAE: 1.66 	MAPE: 1.08 	DA: 0.53
AOT 	RMSE: 1.41	 MAE: 1.07 	MAPE: 1.53 	DA: 0.51
THAI 	RMSE: 0.29	 MAE: 0.24 	MAPE: 1.40 	DA: 0.40
CPF 	RMSE: 0.28	 MAE: 0.21 	MAPE: 0.88 	DA: 0.63
MINT 	RMSE: 0.58	 MAE: 0.45 	MAPE: 1.04 	DA: 0.72
TU 	RMSE: 0.32	 MAE: 0.25 	MAPE: 1.22 	DA: 0.61
SCC 	RMSE: 4.88	 MAE: 3.77 	MAPE: 0.77 	DA: 0.48
CPN 	RMSE: 1.29	 MAE: 1.05 	MAPE: 1.24 	DA: 0.54
CK 	RMSE: 0.42	 MAE: 0.30 	MAPE: 1.09 	DA: 0.53
CPALL 	RMSE: 1.10	 MAE: 0.86 	MAPE: 1.11 	DA: 0.46
HMPRO 	RMSE: 0.31	 MAE: 0.24 	MAPE: 1.75 	DA: 0.50
BDMS 	RMSE: 0.26	 MAE: 0.20 	MAPE: 0.92 	DA: 0.57
BH 	RMSE: 3.72	 MAE: 2.91 	MAPE: 1.49 	DA: 0.40
ADVANC 	RMSE: 2.95	 MAE: 2.18 	MAPE: 1.13 	DA: 0.55
JAS 	RMSE: 0.18	 MAE: 0.13 	MAPE: 1.74 	DA: 0.50
TRUE 	RM

# Bagging Regressor

In [14]:
bagging = ensemble.BaggingRegressor(base_estimator=None,
                                    n_estimators=30,n_jobs=-1)
bagging.fit(x_train, y_train.values.ravel())
evaluator(bagging, x_test, le)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=30, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

BANPU 	RMSE: 0.57	 MAE: 0.41 	MAPE: 1.94 	DA: 0.46
IRPC 	RMSE: 0.14	 MAE: 0.11 	MAPE: 1.57 	DA: 0.46
PTT 	RMSE: 7.08	 MAE: 5.39 	MAPE: 1.14 	DA: 0.60
BBL 	RMSE: 1.98	 MAE: 1.58 	MAPE: 0.76 	DA: 0.53
KBANK 	RMSE: 3.22	 MAE: 2.64 	MAPE: 1.14 	DA: 0.60
SCB 	RMSE: 2.01	 MAE: 1.56 	MAPE: 1.01 	DA: 0.64
AOT 	RMSE: 1.42	 MAE: 1.02 	MAPE: 1.46 	DA: 0.43
THAI 	RMSE: 0.27	 MAE: 0.22 	MAPE: 1.30 	DA: 0.47
CPF 	RMSE: 0.26	 MAE: 0.20 	MAPE: 0.81 	DA: 0.52
MINT 	RMSE: 0.70	 MAE: 0.55 	MAPE: 1.26 	DA: 0.61
TU 	RMSE: 0.33	 MAE: 0.23 	MAPE: 1.14 	DA: 0.64
SCC 	RMSE: 4.52	 MAE: 3.63 	MAPE: 0.74 	DA: 0.54
CPN 	RMSE: 1.48	 MAE: 1.18 	MAPE: 1.39 	DA: 0.54
CK 	RMSE: 0.41	 MAE: 0.29 	MAPE: 1.09 	DA: 0.53
CPALL 	RMSE: 1.22	 MAE: 0.95 	MAPE: 1.23 	DA: 0.43
HMPRO 	RMSE: 0.29	 MAE: 0.24 	MAPE: 1.71 	DA: 0.48
BDMS 	RMSE: 0.27	 MAE: 0.20 	MAPE: 0.93 	DA: 0.36
BH 	RMSE: 3.55	 MAE: 2.76 	MAPE: 1.41 	DA: 0.47
ADVANC 	RMSE: 2.28	 MAE: 1.90 	MAPE: 0.99 	DA: 0.34
JAS 	RMSE: 0.18	 MAE: 0.14 	MAPE: 1.88 	DA: 0.45
TRUE 	RM

# AdaBoost Regressor

In [15]:
adaboost_dt_regr = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(),
                                           learning_rate=1, 
                                           n_estimators=10, 
                                           loss='linear')
adaboost_dt_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_dt_regr, x_test, le)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1, loss='linear', n_estimators=10,
         random_state=None)

BANPU 	RMSE: 0.53	 MAE: 0.40 	MAPE: 1.87 	DA: 0.43
IRPC 	RMSE: 0.14	 MAE: 0.11 	MAPE: 1.55 	DA: 0.46
PTT 	RMSE: 7.48	 MAE: 5.87 	MAPE: 1.24 	DA: 0.55
BBL 	RMSE: 2.72	 MAE: 2.17 	MAPE: 1.04 	DA: 0.51
KBANK 	RMSE: 3.30	 MAE: 2.62 	MAPE: 1.12 	DA: 0.49
SCB 	RMSE: 2.20	 MAE: 1.73 	MAPE: 1.13 	DA: 0.51
AOT 	RMSE: 1.48	 MAE: 1.06 	MAPE: 1.52 	DA: 0.51
THAI 	RMSE: 0.28	 MAE: 0.22 	MAPE: 1.31 	DA: 0.47
CPF 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.14 	DA: 0.41
MINT 	RMSE: 0.62	 MAE: 0.45 	MAPE: 1.04 	DA: 0.61
TU 	RMSE: 0.36	 MAE: 0.26 	MAPE: 1.24 	DA: 0.53
SCC 	RMSE: 4.49	 MAE: 3.91 	MAPE: 0.79 	DA: 0.59
CPN 	RMSE: 1.73	 MAE: 1.41 	MAPE: 1.68 	DA: 0.46
CK 	RMSE: 0.41	 MAE: 0.28 	MAPE: 1.05 	DA: 0.56
CPALL 	RMSE: 1.35	 MAE: 1.07 	MAPE: 1.37 	DA: 0.50
HMPRO 	RMSE: 0.34	 MAE: 0.26 	MAPE: 1.86 	DA: 0.54
BDMS 	RMSE: 0.27	 MAE: 0.20 	MAPE: 0.92 	DA: 0.55
BH 	RMSE: 3.34	 MAE: 2.48 	MAPE: 1.27 	DA: 0.56
ADVANC 	RMSE: 1.96	 MAE: 1.56 	MAPE: 0.81 	DA: 0.57
JAS 	RMSE: 0.22	 MAE: 0.16 	MAPE: 2.12 	DA: 0.36
TRUE 	RM

In [16]:
adaboost_rf_regr = ensemble.AdaBoostRegressor(base_estimator=ensemble.RandomForestRegressor(n_jobs=-1),
                                           learning_rate=1, 
                                           n_estimators=10, 
                                           loss='linear')
adaboost_rf_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_rf_regr, x_test, le)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=1, loss='linear', n_estimators=10,
         random_state=None)

BANPU 	RMSE: 0.51	 MAE: 0.38 	MAPE: 1.80 	DA: 0.48
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.44 	DA: 0.43
PTT 	RMSE: 6.65	 MAE: 5.01 	MAPE: 1.06 	DA: 0.60
BBL 	RMSE: 2.07	 MAE: 1.67 	MAPE: 0.80 	DA: 0.51
KBANK 	RMSE: 3.17	 MAE: 2.57 	MAPE: 1.11 	DA: 0.64
SCB 	RMSE: 1.99	 MAE: 1.51 	MAPE: 0.98 	DA: 0.55
AOT 	RMSE: 1.47	 MAE: 1.02 	MAPE: 1.46 	DA: 0.47
THAI 	RMSE: 0.24	 MAE: 0.19 	MAPE: 1.12 	DA: 0.49
CPF 	RMSE: 0.27	 MAE: 0.20 	MAPE: 0.83 	DA: 0.48
MINT 	RMSE: 0.65	 MAE: 0.50 	MAPE: 1.15 	DA: 0.67
TU 	RMSE: 0.33	 MAE: 0.25 	MAPE: 1.24 	DA: 0.53
SCC 	RMSE: 4.61	 MAE: 3.72 	MAPE: 0.76 	DA: 0.54
CPN 	RMSE: 1.40	 MAE: 1.12 	MAPE: 1.33 	DA: 0.54
CK 	RMSE: 0.40	 MAE: 0.28 	MAPE: 1.05 	DA: 0.53
CPALL 	RMSE: 0.93	 MAE: 0.74 	MAPE: 0.95 	DA: 0.41
HMPRO 	RMSE: 0.29	 MAE: 0.24 	MAPE: 1.71 	DA: 0.43
BDMS 	RMSE: 0.25	 MAE: 0.18 	MAPE: 0.86 	DA: 0.48
BH 	RMSE: 3.71	 MAE: 2.75 	MAPE: 1.41 	DA: 0.47
ADVANC 	RMSE: 2.21	 MAE: 1.74 	MAPE: 0.90 	DA: 0.45
JAS 	RMSE: 0.17	 MAE: 0.13 	MAPE: 1.70 	DA: 0.55
TRUE 	RM

# Gradient Boosting Regressor

In [None]:
gbr = ensemble.GradientBoostingRegressor(n_estimators=100, 
                                         learning_rate=0.1,
                                         max_depth=6,
                                         min_samples_split=2,
                                         loss='ls',
                                        )
gbr.fit(x_train, y_train.values.ravel())

evaluator(gbr, x_test, le)

# XGBoost Regressor

In [17]:
import xgboost
from sklearn.model_selection import train_test_split

d_train, d_valid, y_d_train, y_d_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=10)
len(d_train), len(d_valid)

d_train = xgboost.DMatrix(d_train, label=y_d_train)
d_valid = xgboost.DMatrix(d_valid, label=y_d_valid)

(16115, 1791)

Parameters
http://xgboost.readthedocs.io/en/latest//parameter.html

In [18]:
params = {
    'booster':'dart',
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 1,
    'objective': 'reg:tweedie',
    'eval_metric': 'mae',
    'reg_lambda': 0.8,
    'reg_alpha': 0.2,
    'silent': 1,
    'sample_type':"weighted"
}

xgb = xgboost.train(params, d_train, 
                    num_boost_round=5000, 
                    evals=[(d_train, 'train'), (d_valid, 'valid')], 
                    early_stopping_rounds=50,
                    verbose_eval=100
                   )

[0]	train-mae:109.797	valid-mae:106.956
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[100]	train-mae:7.76727	valid-mae:7.59879
[200]	train-mae:1.09736	valid-mae:1.21573
[300]	train-mae:1.02099	valid-mae:1.21064
[400]	train-mae:0.966042	valid-mae:1.20632
[500]	train-mae:0.920058	valid-mae:1.20394
Stopping. Best iteration:
[482]	train-mae:0.928511	valid-mae:1.20271



In [19]:
evaluator(xgb, x_test, le, isXGB=True)

BANPU 	RMSE: 0.52	 MAE: 0.40 	MAPE: 1.89 	DA: 0.33
IRPC 	RMSE: 0.13	 MAE: 0.11 	MAPE: 1.47 	DA: 0.41
PTT 	RMSE: 7.91	 MAE: 5.75 	MAPE: 1.21 	DA: 0.55
BBL 	RMSE: 1.74	 MAE: 1.54 	MAPE: 0.74 	DA: 0.45
KBANK 	RMSE: 2.96	 MAE: 2.45 	MAPE: 1.05 	DA: 0.68
SCB 	RMSE: 2.13	 MAE: 1.62 	MAPE: 1.05 	DA: 0.60
AOT 	RMSE: 1.62	 MAE: 1.15 	MAPE: 1.63 	DA: 0.40
THAI 	RMSE: 0.25	 MAE: 0.21 	MAPE: 1.23 	DA: 0.42
CPF 	RMSE: 0.25	 MAE: 0.19 	MAPE: 0.79 	DA: 0.57
MINT 	RMSE: 0.68	 MAE: 0.50 	MAPE: 1.16 	DA: 0.50
TU 	RMSE: 0.31	 MAE: 0.24 	MAPE: 1.15 	DA: 0.58
SCC 	RMSE: 4.53	 MAE: 3.56 	MAPE: 0.72 	DA: 0.59
CPN 	RMSE: 1.09	 MAE: 0.88 	MAPE: 1.05 	DA: 0.54
CK 	RMSE: 0.34	 MAE: 0.22 	MAPE: 0.83 	DA: 0.71
CPALL 	RMSE: 1.00	 MAE: 0.84 	MAPE: 1.07 	DA: 0.33
HMPRO 	RMSE: 0.26	 MAE: 0.21 	MAPE: 1.50 	DA: 0.59
BDMS 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.76 	DA: 0.57
BH 	RMSE: 3.00	 MAE: 2.25 	MAPE: 1.15 	DA: 0.51
ADVANC 	RMSE: 2.70	 MAE: 2.18 	MAPE: 1.13 	DA: 0.40
JAS 	RMSE: 0.18	 MAE: 0.14 	MAPE: 1.90 	DA: 0.48
TRUE 	RM

# Save ML models

In [20]:
pickle.dump(decis_tree_regr, open('models/decis_tree_regr_news_all.pkl', 'wb'))
pickle.dump(rnd_forest_regr, open('models/rnd_forest_regr_news_all.pkl', 'wb'))
pickle.dump(bagging, open('models/bagging_regr_news_all.pkl', 'wb'))
pickle.dump(adaboost_dt_regr, open('models/adaboost_dt_regr_news_all.pkl', 'wb'))
pickle.dump(adaboost_rf_regr, open('models/adaboost_rf_regr_news_all.pkl', 'wb'))
pickle.dump(xgb, open('models/xgb_news_all.pkl', 'wb'))

# Ensemble Stacking

In [21]:
dt = pickle.load(open('models/decis_tree_regr_news_all.pkl', 'rb'))
rf = pickle.load(open('models/rnd_forest_regr_news_all.pkl', 'rb'))
bagging = pickle.load(open('models/bagging_regr_news_all.pkl', 'rb'))
ada_dt = pickle.load(open('models/adaboost_dt_regr_news_all.pkl', 'rb'))
ada_rf = pickle.load(open('models/adaboost_rf_regr_news_all.pkl', 'rb'))
xgb = pickle.load(open('models/xgb_news_all.pkl', 'rb'))

In [22]:
x_train_stack = np.concatenate((
                         bagging.predict(x_train).reshape(-1,1),
                         ada_dt.predict(x_train).reshape(-1,1),
                         ada_rf.predict(x_train).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix(x_train)).reshape(-1,1)), axis=1)

In [23]:
y_test = x_test[Horizon].values.reshape(-1,1)

x_test_stack = np.concatenate((
                         bagging.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_rf.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix((x_test.drop(['Close(t+1)'], axis=1)))).reshape(-1,1)), axis=1)

In [24]:
stack = ensemble.RandomForestRegressor(n_jobs=-1)
stack.fit(x_train_stack, y_train.values.ravel())

y_pred_stack = stack.predict(x_test_stack).reshape(-1,1)

for i in range(x_test_stack.shape[1]):
    RMSE = np.sqrt(mean_squared_error(y_test, x_test_stack[:,i]))
    MAE = mean_absolute_error(y_test, x_test_stack[:,i])
    MAPE = mean_absolute_percentage_error(y_test, x_test_stack[:,i].reshape(-1,1))
    print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))
    
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_stack))
MAE = mean_absolute_error(y_test, y_pred_stack)
MAPE = mean_absolute_percentage_error(y_test, y_pred_stack)
print("\nRMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

RMSE: 2.36 	MAE: 1.23 	MAPE: 1.26
RMSE: 2.45 	MAE: 1.29 	MAPE: 1.33
RMSE: 2.30 	MAE: 1.19 	MAPE: 1.20
RMSE: 9.24 	MAE: 4.73 	MAPE: 2.60

RMSE: 2.41 	MAE: 1.25 	MAPE: 1.30


In [25]:
feature_importances = stack.feature_importances_
feature_importances

array([0.02431273, 0.86737258, 0.03057883, 0.07773587])

# Stack DA

In [26]:
close_t = np.reshape(x_train['Close(t)'].values, (-1, 1))
changes = y_train.values.reshape(-1,1) - close_t
y_train_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_train_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_train_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_train_stack_da.append(tmp_row)

In [27]:
close_t = np.reshape(x_test['Close(t)'].values, (-1, 1))
changes = y_test - close_t
y_test_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_test_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_test_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_test_stack_da.append(tmp_row)

x_test_stack_da = np.array(x_test_stack_da)

In [30]:
stack_da = ensemble.GradientBoostingRegressor()
stack_da.fit(x_train_stack_da, y_train_da.ravel())

y_stack_da = stack_da.predict(x_test_stack_da).reshape(-1,1).round(0).astype(int)

for i in range(x_test_stack_da.shape[1]):
    acc = accuracy_score(y_test_da, x_test_stack_da[:,i])
    print("Accuracy: %.4f" % (acc))
    
acc = accuracy_score(y_test_da, y_stack_da)
print("\nAccuracy: %.4f" % (acc))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

Accuracy: 0.5068
Accuracy: 0.5100
Accuracy: 0.5121
Accuracy: 0.4124

Accuracy: 0.5414


In [31]:
feature_importances_da = stack_da.feature_importances_
feature_importances_da

array([0.12667901, 0.61525446, 0.13835091, 0.11971561])

# Visualize Error

In [110]:
def visualize(ticker):
    x_tmp = x_test.loc[x_test['Ticker'] == le.transform([ticker])[0]].copy()

    y_tmp = x_tmp[Horizon].values.reshape(-1,1)
    x_tmp = x_tmp.drop([Horizon], axis=1)

    y_pred = xgb.predict(xgboost.DMatrix((x_tmp))).reshape(-1,1)

    df = np.concatenate((y_tmp,
                         y_pred), axis=1)

    df = pd.DataFrame.from_records(df)
    df.columns = ['actual', 'predicted']
    df['AE'] = 100*(df['actual']-df['predicted']).abs()/df['actual']



    trace_1 = go.Scatter(
        x = x_tmp.index,
        y = df.actual,
        mode = 'lines',
        name = 'Actual',
        line = dict(width = 4)
    )

    trace_2 = go.Scatter(
        x = x_tmp.index,
        y = df.predicted,
        mode = 'lines',
        name = 'Predicted',
        line = dict(width = 4)
    )

    ln = make_pipeline(PolynomialFeatures(4), Ridge())
    ln = LinearRegression()
    ln.fit(df.index.values.reshape(-1,1), df.AE)


    trace_3 = go.Scatter(
        x = x_tmp.index,
        y = ln.predict(df.index.values.reshape(-1,1)),
        mode = 'lines',
        name = 'Trend Line',
        line = dict(width = 4)
    )
    trace_4 = go.Scatter(
        x = x_tmp.index,
        y = df.AE,
        mode = 'lines',
        name = 'Errors',
        line = dict(width = 4)
    )

    fig = tools.make_subplots(rows=2, cols=1, subplot_titles=(ticker+': Actual vs. Predicted', 'Absolute Error'))

    fig.append_trace(trace_1, 1, 1)
    fig.append_trace(trace_2, 1, 1)
    fig.append_trace(trace_3, 2, 1)
    fig.append_trace(trace_4, 2, 1)

    fig['layout'].update(height=1000, width=1200,  
                         paper_bgcolor='rgba(0,0,0,0)', 
                         plot_bgcolor='rgba(0,0,0,0)', 
                         font=dict(size=16, color='#bfbfbf'))

    fig['layout']['yaxis1'].update(title='Price')
    fig['layout']['yaxis2'].update(title='Absolute Error')

    iplot(fig, filename='line-mode')

In [111]:
visualize('SCB')

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]



# LSTM

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

Using TensorFlow backend.


In [17]:
train_X = x_train.values
val_X = x_valid.values
# test_X = x_test.values

train_y = y_train.values
val_y = y_valid.values
# test_y = y_test.values

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], 1)
# test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape)
print(val_X.shape, val_y.shape)
# print(test_X.shape, test_y.shape)

(17396, 1017, 1) (17396, 1)
(3070, 1017, 1) (3070, 1)


In [18]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(train_X.shape[1], 1)))
model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [19]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

checkpoint = ModelCheckpoint(filepath="model/LSTM.h5", 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=False, 
                             mode='auto', 
                             period=1
                            )

earlystopping = EarlyStopping(monitor='val_loss', 
                      min_delta=0, 
                      patience=100,
                      verbose=1, 
                      mode='auto')

# access via $ tensorboard --logdir=./logs
tensorboard = TensorBoard(log_dir='./logs')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=1000,
          batch_size=32,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
evaluator(model, x_test, le, isLSTM=True)

In [None]:
from keras.layers import Bidirectional

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(2, train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=10000,
          batch_size=1024,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
'hey'

In [None]:
evaluator(model, x_test, le, isLSTM=True)

In [None]:
data_dim = 16
timesteps = 8
num_classes = 10
np.random.random((10, timesteps, data_dim)).shape

In [None]:
train_X = x_train.values
val_X = x_valid.values
train_X.shape

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
train_X.shape