# Regression with Twitter (all)

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import sys
sys.version

'3.5.2 (default, Nov 23 2017, 16:37:01) \n[GCC 5.4.0 20160609]'

In [33]:
import pandas as pd
import os
import copy
import numpy as np
import xgboost

from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(3)
len(df_price)

df_twitter = pd.read_csv('data/twitter_all.csv')
df_twitter['Date'] = pd.to_datetime(df_twitter['Date'], format='%Y-%m-%d')
df_twitter = df_twitter.set_index('Date')
df_twitter = df_twitter[:'2018-2-8']
df_twitter.index = df_twitter.index.date
df_twitter = df_twitter.sort_index()
df_twitter.tail(3)
len(df_twitter)

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-16,THAI,15.6,15.9,15.6,15.6,2907700
2018-02-16,TRUE,6.4,6.4,6.3,6.3,35851300
2018-02-16,TU,20.0,20.5,20.0,20.1,7299300


26331

Unnamed: 0,Ticker,Text
2018-02-08,TU,มธ ขาด คน มา นั่ง ด้วย
2018-02-08,TU,มธ รังสิต ใคร รับ นวด บ้าง หอ ใน หอน อก ห้อง ใ...
2018-02-08,TU,วัน ที่ ก พน้อง เอง ก็ อยาก มี โมเม้น ไป มธ รั...


12741

# Lag & Horizon Construction

In [34]:
N_lags = 3
N_horizon = 1

In [35]:
df_train = []
df_test = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_twitter.loc[df_twitter['Ticker'] == stock]
    prev_date = None
    prev_text = None
    
#     pbar = tqdm_notebook(total=len(df_stock))
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags <= N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']
        
#         pbar.update(1)
#     pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
                           'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.80)
    test_size = len(news_stocks) - train_size
    train, test = news_stocks.iloc[:train_size], news_stocks.iloc[train_size:]
    print(stock, ':\t',len(train), len(test))    
    df_train.append(train)
    df_test.append(test)
    

df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)

BANPU :	 89 23
IRPC :	 51 13
PTT :	 347 87
BBL :	 251 63
KBANK :	 202 51
SCB :	 595 149
AOT :	 255 64
THAI :	 6 2
CPF :	 143 36
MINT :	 68 17
TU :	 846 212
SCC :	 78 20
CPN :	 112 29
CK :	 251 63
CPALL :	 116 30
HMPRO :	 9 3
BDMS :	 80 21
BH :	 115 29
ADVANC :	 104 26
JAS :	 164 41
TRUE :	 427 107


In [36]:
# df_train.to_csv('data/df_train_news_all.csv')
# df_test.to_csv('data/df_test_news_all.csv')

# df_train = pd.read_csv('data/df_train_news_all.csv')
# df_test = pd.read_csv('data/df_test_news_all.csv')

# df_train = df_train.set_index('Date')
# df_test = df_test.set_index('Date')

len(df_train), len(df_test) 

df_train.head(3)
df_test.head(3)

(4309, 1086)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2014-03-04,BANPU,วิตก ยูเครน น้ำมันดิบ พุ่ง วันนี้ หุ้น พลังงาน...,26.5,26.5,26.75,26.25,26.5,26.0,26.75,26.0,26.25,26.0,26.25,25.5,26.0,25.25,26.25,25.0,26.0
2014-03-21,BANPU,โบ รก แนะ หุ้น เด่น มี ลุ้น เด้ง ขึ้น วันนี้ โ...,26.0,26.0,26.25,25.75,26.25,25.75,26.0,25.5,26.0,26.25,26.5,25.75,25.75,26.0,26.5,26.0,26.0
2014-03-24,BANPU,คัด หุ้น เทคนิค เด่น โบ รก ลุ้น วิ่ง ชน แนว ต้...,26.25,26.25,26.5,25.75,26.0,26.0,26.25,25.75,26.25,25.75,26.0,25.5,26.0,26.25,26.5,25.75,25.75


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-02-16,BANPU,ลุ้น รีบาวด์ เคาะ หุ้น ร้อน ปันผล กำไร โต ลุ้น...,19.5,19.7,19.9,19.5,19.8,19.4,19.6,19.3,19.6,19.1,19.6,19.0,19.2,18.9,19.1,18.8,18.9
2017-02-24,BANPU,คง มุมมอง ต่อราคา ถ่านหิน หนุน กำไร ปี โต กว่า...,19.6,19.5,19.8,19.4,19.5,19.3,19.4,19.2,19.4,19.5,19.5,19.3,19.4,19.8,19.8,19.3,19.4
2017-03-03,BANPU,หุ้น อันดับ ที่ มี คน ค้นหา ข่าว มาก ที่สุด ปร...,19.7,20.1,20.1,19.8,20.1,20.0,20.2,19.9,20.1,19.7,19.8,19.6,19.8,19.7,19.8,19.6,19.6


# TF-IDF Vetorization

In [37]:
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

vertorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_df=0.9, 
                             min_df=2, 
                             max_features=3000)

tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])

df_tfidf_train = pd.DataFrame.from_records(tfidf_train.toarray())
df_tfidf_test = pd.DataFrame.from_records(tfidf_test.toarray())

df_tfidf_train = df_tfidf_train.set_index(df_train.index)
df_tfidf_test = df_tfidf_test.set_index(df_test.index)

len(df_tfidf_train), len(df_tfidf_test)

# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_train = pd.concat([x_train, df_tfidf_train], axis=1)

x_test = df_test.drop(['Text'], axis=1)
x_test = pd.concat([x_test, df_tfidf_test], axis=1)

# Label Encoding
le = preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_train.head(2)
x_test.head(2)
le.classes_

(4309, 1086)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-03-04,2,26.5,26.5,26.75,26.25,26.5,26.0,26.75,26.0,26.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-03-21,2,26.0,26.0,26.25,25.75,26.25,25.75,26.0,25.5,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-02-16,2,19.5,19.7,19.9,19.5,19.8,19.4,19.6,19.3,19.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-02-24,2,19.6,19.5,19.8,19.4,19.5,19.3,19.4,19.2,19.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


array(['ADVANC', 'AOT', 'BANPU', 'BBL', 'BDMS', 'BH', 'CK', 'CPALL',
       'CPF', 'CPN', 'HMPRO', 'IRPC', 'JAS', 'KBANK', 'MINT', 'PTT',
       'SCB', 'SCC', 'THAI', 'TRUE', 'TU'], dtype=object)

# Create x_train and y_train

In [38]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

((4309, 3017), (4309, 1))

# Evaluate Each Stcok

In [39]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'].values,(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

In [40]:
def ensemble_evaluator(bagging, ada_dt, ada_rf,  xgb, stack, stack_da, df_test, le, feature_importances, feature_importances_da):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        # Directional Accuracy
        changes = x_tmp[Horizon] - x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
        
        y_tmp = x_tmp[Horizon].values.reshape(-1,1)
        x_tmp = x_tmp.drop([Horizon], axis=1)        
        
        # Prediction
        y_pred = np.concatenate((
                                 bagging.predict(x_tmp).reshape(-1,1),
                                 ada_dt.predict(x_tmp).reshape(-1,1),
                                 ada_rf.predict(x_tmp).reshape(-1,1),
                                 xgb.predict(xgboost.DMatrix(x_tmp)).reshape(-1,1)), 
                                axis=1)
        
        df_pred = pd.DataFrame.from_records(y_pred).round(2)
        df_pred.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred.head()

        # Directional Accuracy Pred
        close_t = np.reshape(x_tmp['Close(t)'].values, (-1, 1))
        y_changes = np.concatenate((
                                    np.array(y_pred[:,0]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,1]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,2]).reshape(-1,1)-close_t,
                                    np.array(y_pred[:,3]).reshape(-1,1)-close_t,
                                   ), axis=1)
        y_pred_da = []
        for row in y_changes:
            tmp_row = []
            for change in row:
                tmp_row.append(1 if change>=0 else 0)
            y_pred_da.append(tmp_row)

        df_pred_da = pd.DataFrame.from_records(y_pred_da)
        df_pred_da.columns = ['Bagging_DT', 'Ada_DT', 'Ada_RF', 'XGB']
        df_pred_da.head()
        
        
        
        df_pred['weight'] = (
                     df_pred['Bagging_DT']*feature_importances[0] + 
                     df_pred['Ada_DT']*feature_importances[1] + 
                     df_pred['Ada_RF']*feature_importances[2] + 
                     df_pred['XGB']*feature_importances[3]
        )
        
        df_pred_da['vote'] = (
                     df_pred_da['Bagging_DT']*feature_importances_da[0] + 
                     df_pred_da['Ada_DT']*feature_importances_da[1] + 
                     df_pred_da['Ada_RF']*feature_importances_da[2] + 
                     df_pred_da['XGB']*feature_importances_da[3]
        ).round(0).astype(int)

        y_pred = stack.predict(y_pred).reshape(-1,1)
        y_pred_da = stack_da.predict(y_pred_da).reshape(-1,1).round(0).astype(int)
#         y_pred = df_pred['weight'].values.reshape(-1,1)
#         y_pred_da = df_pred_da['vote'].values.reshape(-1,1)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp.reshape(-1,1), y_pred.reshape(-1,1))
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [98]:
ensemble_evaluator( 
                   bagging,
                   adaboost_dt_regr, 
                   adaboost_rf_regr, 
                   xgb,
                   stack, stack_da,
                   x_test, le, feature_importances, feature_importances_da)

BANPU 	RMSE: 0.30	 MAE: 0.24 	MAPE: 1.28 	DA: 0.57
IRPC 	RMSE: 0.11	 MAE: 0.09 	MAPE: 1.49 	DA: 0.54
PTT 	RMSE: 8.34	 MAE: 6.72 	MAPE: 1.64 	DA: 0.47
BBL 	RMSE: 1.94	 MAE: 1.54 	MAPE: 0.80 	DA: 0.56
KBANK 	RMSE: 3.37	 MAE: 2.61 	MAPE: 1.23 	DA: 0.45
SCB 	RMSE: 1.91	 MAE: 1.41 	MAPE: 0.93 	DA: 0.56
AOT 	RMSE: 1.21	 MAE: 0.95 	MAPE: 1.53 	DA: 0.58
THAI 	RMSE: 0.29	 MAE: 0.24 	MAPE: 0.93 	DA: 1.00
CPF 	RMSE: 0.45	 MAE: 0.34 	MAPE: 1.31 	DA: 0.47
MINT 	RMSE: 0.50	 MAE: 0.42 	MAPE: 1.00 	DA: 0.88
TU 	RMSE: 0.29	 MAE: 0.22 	MAPE: 1.08 	DA: 0.57
SCC 	RMSE: 5.93	 MAE: 4.96 	MAPE: 1.01 	DA: 0.70
CPN 	RMSE: 1.73	 MAE: 1.29 	MAPE: 1.72 	DA: 0.59
CK 	RMSE: 0.41	 MAE: 0.33 	MAPE: 1.23 	DA: 0.57
CPALL 	RMSE: 1.59	 MAE: 1.27 	MAPE: 1.81 	DA: 0.57
HMPRO 	RMSE: 0.25	 MAE: 0.23 	MAPE: 2.12 	DA: 0.67
BDMS 	RMSE: 0.29	 MAE: 0.24 	MAPE: 1.15 	DA: 0.57
BH 	RMSE: 4.14	 MAE: 3.12 	MAPE: 1.54 	DA: 0.48
ADVANC 	RMSE: 2.72	 MAE: 2.03 	MAPE: 1.13 	DA: 0.46
JAS 	RMSE: 0.21	 MAE: 0.15 	MAPE: 1.85 	DA: 0.51
TRUE 	RM

# Linear Regression

In [57]:
# from sklearn import linear_model

# lineregr = linear_model.LinearRegression()
# lineregr.fit(x_train, y_train)

# evaluator(lineregr, x_test, le)

# Support Vector Regressor

In [58]:
# from sklearn.svm import SVR
# svr = SVR()
# svr.fit(x_train, y_train)

# evaluator(svr, x_test, le)

# Decistion Tree Regressor

In [43]:
from sklearn import tree

decis_tree_regr = tree.DecisionTreeRegressor(max_depth=None)
decis_tree_regr.fit(x_train, y_train.values.ravel())
evaluator(decis_tree_regr, x_test, le)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

BANPU 	RMSE: 0.68	 MAE: 0.47 	MAPE: 2.67 	DA: 0.65
IRPC 	RMSE: 0.18	 MAE: 0.13 	MAPE: 2.18 	DA: 0.31
PTT 	RMSE: 6.67	 MAE: 5.15 	MAPE: 1.23 	DA: 0.60
BBL 	RMSE: 3.14	 MAE: 2.21 	MAPE: 1.14 	DA: 0.48
KBANK 	RMSE: 4.54	 MAE: 3.30 	MAPE: 1.57 	DA: 0.45
SCB 	RMSE: 2.48	 MAE: 1.94 	MAPE: 1.28 	DA: 0.56
AOT 	RMSE: 1.43	 MAE: 1.15 	MAPE: 1.88 	DA: 0.53
THAI 	RMSE: 0.59	 MAE: 0.53 	MAPE: 2.17 	DA: 0.50
CPF 	RMSE: 0.68	 MAE: 0.49 	MAPE: 1.91 	DA: 0.47
MINT 	RMSE: 0.60	 MAE: 0.51 	MAPE: 1.22 	DA: 0.71
TU 	RMSE: 0.45	 MAE: 0.35 	MAPE: 1.74 	DA: 0.53
SCC 	RMSE: 11.18	 MAE: 10.10 	MAPE: 2.05 	DA: 0.55
CPN 	RMSE: 11.44	 MAE: 3.50 	MAPE: 4.41 	DA: 0.66
CK 	RMSE: 0.48	 MAE: 0.37 	MAPE: 1.38 	DA: 0.65
CPALL 	RMSE: 2.09	 MAE: 1.69 	MAPE: 2.39 	DA: 0.60
HMPRO 	RMSE: 0.57	 MAE: 0.43 	MAPE: 3.74 	DA: 1.00
BDMS 	RMSE: 0.40	 MAE: 0.27 	MAPE: 1.30 	DA: 0.62
BH 	RMSE: 4.24	 MAE: 3.22 	MAPE: 1.61 	DA: 0.52
ADVANC 	RMSE: 2.89	 MAE: 2.27 	MAPE: 1.26 	DA: 0.58
JAS 	RMSE: 0.23	 MAE: 0.19 	MAPE: 2.31 	DA: 0.56
TRUE 

# Random Forest Regrssor

In [44]:
from sklearn import ensemble

rnd_forest_regr = ensemble.RandomForestRegressor(n_jobs=-1)
rnd_forest_regr.fit(x_train, y_train.values.ravel())

evaluator(rnd_forest_regr, x_test, le)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

BANPU 	RMSE: 0.34	 MAE: 0.29 	MAPE: 1.62 	DA: 0.57
IRPC 	RMSE: 0.14	 MAE: 0.11 	MAPE: 1.81 	DA: 0.46
PTT 	RMSE: 7.06	 MAE: 5.29 	MAPE: 1.27 	DA: 0.47
BBL 	RMSE: 2.04	 MAE: 1.55 	MAPE: 0.81 	DA: 0.57
KBANK 	RMSE: 2.80	 MAE: 2.23 	MAPE: 1.06 	DA: 0.47
SCB 	RMSE: 1.75	 MAE: 1.36 	MAPE: 0.90 	DA: 0.52
AOT 	RMSE: 1.27	 MAE: 0.95 	MAPE: 1.54 	DA: 0.59
THAI 	RMSE: 0.18	 MAE: 0.16 	MAPE: 0.67 	DA: 1.00
CPF 	RMSE: 0.50	 MAE: 0.40 	MAPE: 1.57 	DA: 0.42
MINT 	RMSE: 0.43	 MAE: 0.38 	MAPE: 0.89 	DA: 0.59
TU 	RMSE: 0.29	 MAE: 0.21 	MAPE: 1.06 	DA: 0.58
SCC 	RMSE: 6.42	 MAE: 5.46 	MAPE: 1.11 	DA: 0.70
CPN 	RMSE: 1.48	 MAE: 1.19 	MAPE: 1.58 	DA: 0.38
CK 	RMSE: 0.35	 MAE: 0.26 	MAPE: 0.96 	DA: 0.63
CPALL 	RMSE: 1.63	 MAE: 1.26 	MAPE: 1.81 	DA: 0.47
HMPRO 	RMSE: 0.16	 MAE: 0.13 	MAPE: 1.10 	DA: 1.00
BDMS 	RMSE: 0.34	 MAE: 0.24 	MAPE: 1.13 	DA: 0.33
BH 	RMSE: 3.92	 MAE: 2.97 	MAPE: 1.47 	DA: 0.48
ADVANC 	RMSE: 2.19	 MAE: 1.86 	MAPE: 1.06 	DA: 0.42
JAS 	RMSE: 0.19	 MAE: 0.15 	MAPE: 1.82 	DA: 0.63
TRUE 	RM

# Bagging Regressor

In [45]:
bagging = ensemble.BaggingRegressor(base_estimator=None,
                                    n_estimators=30,n_jobs=-1)
bagging.fit(x_train, y_train.values.ravel())
evaluator(bagging, x_test, le)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=30, n_jobs=-1, oob_score=False, random_state=None,
         verbose=0, warm_start=False)

BANPU 	RMSE: 0.34	 MAE: 0.29 	MAPE: 1.62 	DA: 0.52
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.27 	DA: 0.62
PTT 	RMSE: 7.93	 MAE: 5.40 	MAPE: 1.28 	DA: 0.47
BBL 	RMSE: 1.64	 MAE: 1.31 	MAPE: 0.68 	DA: 0.57
KBANK 	RMSE: 2.90	 MAE: 2.22 	MAPE: 1.05 	DA: 0.43
SCB 	RMSE: 1.72	 MAE: 1.32 	MAPE: 0.87 	DA: 0.51
AOT 	RMSE: 1.22	 MAE: 0.91 	MAPE: 1.48 	DA: 0.56
THAI 	RMSE: 0.30	 MAE: 0.30 	MAPE: 1.20 	DA: 1.00
CPF 	RMSE: 0.51	 MAE: 0.40 	MAPE: 1.58 	DA: 0.42
MINT 	RMSE: 0.40	 MAE: 0.31 	MAPE: 0.74 	DA: 0.65
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.05 	DA: 0.57
SCC 	RMSE: 6.88	 MAE: 5.80 	MAPE: 1.18 	DA: 0.55
CPN 	RMSE: 1.64	 MAE: 1.26 	MAPE: 1.65 	DA: 0.45
CK 	RMSE: 0.36	 MAE: 0.26 	MAPE: 0.97 	DA: 0.63
CPALL 	RMSE: 1.49	 MAE: 1.21 	MAPE: 1.71 	DA: 0.53
HMPRO 	RMSE: 0.13	 MAE: 0.12 	MAPE: 1.16 	DA: 0.67
BDMS 	RMSE: 0.33	 MAE: 0.24 	MAPE: 1.16 	DA: 0.57
BH 	RMSE: 4.15	 MAE: 3.05 	MAPE: 1.51 	DA: 0.45
ADVANC 	RMSE: 2.35	 MAE: 1.96 	MAPE: 1.10 	DA: 0.38
JAS 	RMSE: 0.20	 MAE: 0.15 	MAPE: 1.81 	DA: 0.56
TRUE 	RM

# AdaBoost Regressor

In [59]:
adaboost_dt_regr = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(),
                                           learning_rate=1, 
                                           n_estimators=10, 
                                           loss='linear')
adaboost_dt_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_dt_regr, x_test, le)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1, loss='linear', n_estimators=10,
         random_state=None)

BANPU 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.53 	DA: 0.57
IRPC 	RMSE: 0.15	 MAE: 0.12 	MAPE: 1.95 	DA: 0.54
PTT 	RMSE: 7.34	 MAE: 5.76 	MAPE: 1.40 	DA: 0.47
BBL 	RMSE: 2.00	 MAE: 1.58 	MAPE: 0.82 	DA: 0.56
KBANK 	RMSE: 3.64	 MAE: 2.82 	MAPE: 1.34 	DA: 0.45
SCB 	RMSE: 1.86	 MAE: 1.36 	MAPE: 0.90 	DA: 0.56
AOT 	RMSE: 1.28	 MAE: 0.96 	MAPE: 1.55 	DA: 0.58
THAI 	RMSE: 0.36	 MAE: 0.30 	MAPE: 1.14 	DA: 1.00
CPF 	RMSE: 0.47	 MAE: 0.34 	MAPE: 1.34 	DA: 0.47
MINT 	RMSE: 0.42	 MAE: 0.34 	MAPE: 0.80 	DA: 0.88
TU 	RMSE: 0.29	 MAE: 0.23 	MAPE: 1.13 	DA: 0.57
SCC 	RMSE: 8.56	 MAE: 7.60 	MAPE: 1.54 	DA: 0.70
CPN 	RMSE: 1.77	 MAE: 1.31 	MAPE: 1.74 	DA: 0.59
CK 	RMSE: 0.40	 MAE: 0.31 	MAPE: 1.15 	DA: 0.57
CPALL 	RMSE: 1.62	 MAE: 1.27 	MAPE: 1.81 	DA: 0.57
HMPRO 	RMSE: 0.32	 MAE: 0.28 	MAPE: 2.52 	DA: 0.67
BDMS 	RMSE: 0.29	 MAE: 0.22 	MAPE: 1.08 	DA: 0.57
BH 	RMSE: 4.13	 MAE: 3.10 	MAPE: 1.53 	DA: 0.48
ADVANC 	RMSE: 2.74	 MAE: 2.06 	MAPE: 1.15 	DA: 0.46
JAS 	RMSE: 0.22	 MAE: 0.17 	MAPE: 2.01 	DA: 0.51
TRUE 	RM

In [60]:
adaboost_rf_regr = ensemble.AdaBoostRegressor(base_estimator=ensemble.RandomForestRegressor(n_jobs=-1),
                                           learning_rate=1, 
                                           n_estimators=10, 
                                           loss='linear')
adaboost_rf_regr.fit(x_train, y_train.values.ravel())

evaluator(adaboost_rf_regr, x_test, le)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=1, loss='linear', n_estimators=10,
         random_state=None)

BANPU 	RMSE: 0.32	 MAE: 0.28 	MAPE: 1.53 	DA: 0.52
IRPC 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.65 	DA: 0.62
PTT 	RMSE: 7.62	 MAE: 5.51 	MAPE: 1.31 	DA: 0.43
BBL 	RMSE: 1.76	 MAE: 1.41 	MAPE: 0.73 	DA: 0.52
KBANK 	RMSE: 2.87	 MAE: 2.21 	MAPE: 1.05 	DA: 0.41
SCB 	RMSE: 1.74	 MAE: 1.37 	MAPE: 0.91 	DA: 0.44
AOT 	RMSE: 1.17	 MAE: 0.85 	MAPE: 1.38 	DA: 0.59
THAI 	RMSE: 0.25	 MAE: 0.25 	MAPE: 1.01 	DA: 1.00
CPF 	RMSE: 0.48	 MAE: 0.37 	MAPE: 1.47 	DA: 0.56
MINT 	RMSE: 0.47	 MAE: 0.40 	MAPE: 0.94 	DA: 0.59
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.05 	DA: 0.54
SCC 	RMSE: 5.58	 MAE: 4.44 	MAPE: 0.90 	DA: 0.45
CPN 	RMSE: 1.46	 MAE: 1.11 	MAPE: 1.47 	DA: 0.62
CK 	RMSE: 0.35	 MAE: 0.25 	MAPE: 0.91 	DA: 0.68
CPALL 	RMSE: 1.42	 MAE: 1.16 	MAPE: 1.66 	DA: 0.43
HMPRO 	RMSE: 0.18	 MAE: 0.15 	MAPE: 1.28 	DA: 0.67
BDMS 	RMSE: 0.32	 MAE: 0.24 	MAPE: 1.14 	DA: 0.38
BH 	RMSE: 4.31	 MAE: 3.07 	MAPE: 1.51 	DA: 0.48
ADVANC 	RMSE: 2.26	 MAE: 1.83 	MAPE: 1.03 	DA: 0.42
JAS 	RMSE: 0.24	 MAE: 0.18 	MAPE: 2.20 	DA: 0.59
TRUE 	RM

# Gradient Boosting Regressor

In [48]:
gbr = ensemble.GradientBoostingRegressor(n_estimators=100, 
                                         learning_rate=0.1,
                                         max_depth=6,
                                         min_samples_split=2,
                                         loss='ls',
                                        )
gbr.fit(x_train, y_train.values.ravel())

evaluator(gbr, x_test, le)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.56	 MAE: 0.39 	MAPE: 2.12 	DA: 0.52
IRPC 	RMSE: 0.14	 MAE: 0.10 	MAPE: 1.71 	DA: 0.62
PTT 	RMSE: 7.59	 MAE: 5.49 	MAPE: 1.30 	DA: 0.46
BBL 	RMSE: 1.77	 MAE: 1.41 	MAPE: 0.73 	DA: 0.60
KBANK 	RMSE: 2.93	 MAE: 2.16 	MAPE: 1.03 	DA: 0.45
SCB 	RMSE: 1.63	 MAE: 1.26 	MAPE: 0.84 	DA: 0.52
AOT 	RMSE: 1.23	 MAE: 0.94 	MAPE: 1.51 	DA: 0.59
THAI 	RMSE: 0.18	 MAE: 0.16 	MAPE: 0.67 	DA: 1.00
CPF 	RMSE: 0.52	 MAE: 0.41 	MAPE: 1.62 	DA: 0.42
MINT 	RMSE: 0.46	 MAE: 0.37 	MAPE: 0.88 	DA: 0.71
TU 	RMSE: 0.29	 MAE: 0.22 	MAPE: 1.10 	DA: 0.56
SCC 	RMSE: 5.62	 MAE: 4.69 	MAPE: 0.95 	DA: 0.60
CPN 	RMSE: 1.83	 MAE: 1.22 	MAPE: 1.58 	DA: 0.48
CK 	RMSE: 0.38	 MAE: 0.27 	MAPE: 1.01 	DA: 0.68
CPALL 	RMSE: 1.22	 MAE: 0.98 	MAPE: 1.42 	DA: 0.50
HMPRO 	RMSE: 0.18	 MAE: 0.12 	MAPE: 0.96 	DA: 1.00
BDMS 	RMSE: 0.27	 MAE: 0.22 	MAPE: 1.04 	DA: 0.57
BH 	RMSE: 4.02	 MAE: 2.88 	MAPE: 1.41 	DA: 0.55
ADVANC 	RMSE: 3.93	 MAE: 2.64 	MAPE: 1.44 	DA: 0.38
JAS 	RMSE: 0.20	 MAE: 0.16 	MAPE: 2.01 	DA: 0.46
TRUE 	RM

# XGBoost Regressor

In [62]:
import xgboost
from sklearn.model_selection import train_test_split

d_train, d_valid, y_d_train, y_d_valid = train_test_split(x_train, y_train, test_size=0.1, random_state=10)
len(d_train), len(d_valid)

d_train = xgboost.DMatrix(d_train, label=y_d_train)
d_valid = xgboost.DMatrix(d_valid, label=y_d_valid)

(3878, 431)

Parameters
http://xgboost.readthedocs.io/en/latest//parameter.html

In [63]:
params = {
    'booster':'dart',
    'max_depth': 4,
    'learning_rate': 0.05,
    'subsample': 1,
    'objective': 'reg:tweedie',
    'eval_metric': 'mae',
    'reg_lambda': 0.8,
    'reg_alpha': 0.2,
    'silent': 1,
    'sample_type':"weighted"
}

xgb = xgboost.train(params, d_train, 
                    num_boost_round=5000, 
                    evals=[(d_train, 'train'), (d_valid, 'valid')], 
                    early_stopping_rounds=50,
                    verbose_eval=100
                   )

[0]	train-mae:111.315	valid-mae:112.966
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[100]	train-mae:7.31763	valid-mae:7.29744
[200]	train-mae:1.12561	valid-mae:1.53498
Stopping. Best iteration:
[226]	train-mae:1.10719	valid-mae:1.53211



In [64]:
evaluator(xgb, x_test, le, isXGB=True)

BANPU 	RMSE: 0.30	 MAE: 0.27 	MAPE: 1.48 	DA: 0.57
IRPC 	RMSE: 0.10	 MAE: 0.09 	MAPE: 1.50 	DA: 0.77
PTT 	RMSE: 10.26	 MAE: 6.45 	MAPE: 1.54 	DA: 0.49
BBL 	RMSE: 1.72	 MAE: 1.36 	MAPE: 0.71 	DA: 0.62
KBANK 	RMSE: 3.14	 MAE: 2.37 	MAPE: 1.12 	DA: 0.45
SCB 	RMSE: 1.68	 MAE: 1.28 	MAPE: 0.85 	DA: 0.54
AOT 	RMSE: 1.21	 MAE: 0.93 	MAPE: 1.50 	DA: 0.48
THAI 	RMSE: 0.18	 MAE: 0.18 	MAPE: 0.69 	DA: 1.00
CPF 	RMSE: 0.52	 MAE: 0.44 	MAPE: 1.74 	DA: 0.42
MINT 	RMSE: 0.46	 MAE: 0.37 	MAPE: 0.90 	DA: 0.71
TU 	RMSE: 0.27	 MAE: 0.20 	MAPE: 1.00 	DA: 0.55
SCC 	RMSE: 6.63	 MAE: 5.57 	MAPE: 1.13 	DA: 0.65
CPN 	RMSE: 1.80	 MAE: 1.37 	MAPE: 1.77 	DA: 0.52
CK 	RMSE: 0.36	 MAE: 0.27 	MAPE: 0.98 	DA: 0.60
CPALL 	RMSE: 1.43	 MAE: 1.08 	MAPE: 1.58 	DA: 0.47
HMPRO 	RMSE: 0.10	 MAE: 0.09 	MAPE: 0.85 	DA: 1.00
BDMS 	RMSE: 0.23	 MAE: 0.18 	MAPE: 0.87 	DA: 0.62
BH 	RMSE: 3.63	 MAE: 2.86 	MAPE: 1.41 	DA: 0.55
ADVANC 	RMSE: 3.62	 MAE: 2.54 	MAPE: 1.39 	DA: 0.42
JAS 	RMSE: 0.23	 MAE: 0.19 	MAPE: 2.24 	DA: 0.51
TRUE 	R

# Save ML models

In [65]:
import pickle
pickle.dump(decis_tree_regr, open('models/decis_tree_regr_twitter_all.pkl', 'wb'))
pickle.dump(rnd_forest_regr, open('models/rnd_forest_regr_twitter_all.pkl', 'wb'))
pickle.dump(bagging, open('models/bagging_regr_twitter_all.pkl', 'wb'))
pickle.dump(adaboost_dt_regr, open('models/adaboost_dt_regr_twitter_all.pkl', 'wb'))
pickle.dump(adaboost_rf_regr, open('models/adaboost_rf_regr_twitter_all.pkl', 'wb'))
pickle.dump(xgb, open('models/xgb_twitter_all.pkl', 'wb'))

# Ensemble Stacking

In [66]:
dt = pickle.load(open('models/decis_tree_regr_twitter_all.pkl', 'rb'))
rf = pickle.load(open('models/rnd_forest_regr_twitter_all.pkl', 'rb'))
bagging = pickle.load(open('models/bagging_regr_twitter_all.pkl', 'rb'))
ada_dt = pickle.load(open('models/adaboost_dt_regr_twitter_all.pkl', 'rb'))
ada_rf = pickle.load(open('models/adaboost_rf_regr_twitter_all.pkl', 'rb'))
xgb = pickle.load(open('models/xgb_twitter_all.pkl', 'rb'))

In [89]:
x_train_stack = np.concatenate((
                         bagging.predict(x_train).reshape(-1,1),
                         ada_dt.predict(x_train).reshape(-1,1),
                         ada_rf.predict(x_train).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix(x_train)).reshape(-1,1)), axis=1)

In [90]:
y_test = x_test[Horizon].values.reshape(-1,1)

x_test_stack = np.concatenate((
                         bagging.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_dt.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         ada_rf.predict(x_test.drop(['Close(t+1)'], axis=1)).reshape(-1,1),
                         xgb.predict(xgboost.DMatrix((x_test.drop(['Close(t+1)'], axis=1)))).reshape(-1,1)), axis=1)

In [92]:
stack = ensemble.RandomForestRegressor(n_jobs=-1)
stack.fit(x_train_stack, y_train.values.ravel())

y_pred_stack = stack.predict(x_test_stack).reshape(-1,1)

for i in range(x_test_stack.shape[1]):
    RMSE = np.sqrt(mean_squared_error(y_test, x_test_stack[:,i]))
    MAE = mean_absolute_error(y_test, x_test_stack[:,i])
    MAPE = mean_absolute_percentage_error(y_test, x_test_stack[:,i].reshape(-1,1))
    print("RMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))
    
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_stack))
MAE = mean_absolute_error(y_test, y_pred_stack)
MAPE = mean_absolute_percentage_error(y_test, y_pred_stack)
print("\nRMSE: %.2f \tMAE: %.2f \tMAPE: %.2f" % (RMSE, MAE, MAPE))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

RMSE: 3.26 	MAE: 1.04 	MAPE: 1.00
RMSE: 2.81 	MAE: 1.38 	MAPE: 1.33
RMSE: 2.66 	MAE: 1.24 	MAPE: 1.22
RMSE: 3.35 	MAE: 1.35 	MAPE: 1.28

RMSE: 2.45 	MAE: 1.25 	MAPE: 1.22


In [93]:
feature_importances = stack.feature_importances_
feature_importances

array([0.06368828, 0.51748973, 0.38883383, 0.02998816])

# Stack DA

In [94]:
close_t = np.reshape(x_train['Close(t)'].values, (-1, 1))
changes = y_train.values.reshape(-1,1) - close_t
y_train_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_train_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_train_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_train_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_train_stack_da.append(tmp_row)

In [95]:
close_t = np.reshape(x_test['Close(t)'].values, (-1, 1))
changes = y_test - close_t
y_test_da = np.array([1 if change >= 0 else 0 for change in changes]).reshape(-1,1)

x_changes = np.concatenate((
                            np.array(x_test_stack[:,0]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,1]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,2]).reshape(-1,1)-close_t,
                            np.array(x_test_stack[:,3]).reshape(-1,1)-close_t,
                           ), axis=1)
x_test_stack_da = []
for row in x_changes:
    tmp_row = []
    for change in row:
        tmp_row.append(1 if change>=0 else 0)
    x_test_stack_da.append(tmp_row)

x_test_stack_da = np.array(x_test_stack_da)

In [96]:
stack_da = ensemble.GradientBoostingRegressor()
stack_da.fit(x_train_stack_da, y_train_da.ravel())

y_stack_da = stack_da.predict(x_test_stack_da).reshape(-1,1).round(0).astype(int)

for i in range(x_test_stack_da.shape[1]):
    acc = accuracy_score(y_test_da, x_test_stack_da[:,i])
    print("Accuracy: %.4f" % (acc))
    
acc = accuracy_score(y_test_da, y_stack_da)
print("\nAccuracy: %.4f" % (acc))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

Accuracy: 0.5755
Accuracy: 0.5552
Accuracy: 0.5249
Accuracy: 0.5497

Accuracy: 0.5552


In [97]:
feature_importances_da = stack_da.feature_importances_
feature_importances_da

array([0.28697064, 0.52419033, 0.01115664, 0.17768239])

# LSTM

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

Using TensorFlow backend.


In [17]:
train_X = x_train.values
val_X = x_valid.values
# test_X = x_test.values

train_y = y_train.values
val_y = y_valid.values
# test_y = y_test.values

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], 1)
# test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape)
print(val_X.shape, val_y.shape)
# print(test_X.shape, test_y.shape)

(17396, 1017, 1) (17396, 1)
(3070, 1017, 1) (3070, 1)


In [18]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(train_X.shape[1], 1)))
model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [19]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

checkpoint = ModelCheckpoint(filepath="model/LSTM.h5", 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=False, 
                             mode='auto', 
                             period=1
                            )

earlystopping = EarlyStopping(monitor='val_loss', 
                      min_delta=0, 
                      patience=100,
                      verbose=1, 
                      mode='auto')

# access via $ tensorboard --logdir=./logs
tensorboard = TensorBoard(log_dir='./logs')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=1000,
          batch_size=32,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
evaluator(model, x_test, le, isLSTM=True)

In [None]:
from keras.layers import Bidirectional

In [None]:
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(2, train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
model.fit(x=train_X, 
          y=train_y,
          epochs=10000,
          batch_size=1024,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

In [None]:
'hey'

In [None]:
evaluator(model, x_test, le, isLSTM=True)

In [None]:
data_dim = 16
timesteps = 8
num_classes = 10
np.random.random((10, timesteps, data_dim)).shape

In [None]:
train_X = x_train.values
val_X = x_valid.values
train_X.shape

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
train_X.shape