# Regression with news (all) with price

In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [3]:
import numpy as np
import pandas as pd
import os
import copy
import sklearn
import xgboost
import plotly.graph_objs as go
import matplotlib.pyplot as plt

from pythainlp.corpus import stopwords
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
from sklearn import tree, ensemble, svm, grid_search
from copy import deepcopy
from scipy.stats.stats import pearsonr 

% matplotlib inline

target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

In [5]:
df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(1)
len(df_price)

df_kaohoon = pd.read_csv('data/kaohoon_all.csv')
df_kaohoon['Date'] = pd.to_datetime(df_kaohoon['Date'], format='%Y-%m-%d')
df_kaohoon = df_kaohoon.set_index('Date')
df_kaohoon = df_kaohoon[:'2018-2-8']
df_kaohoon.index = df_kaohoon.index.date
df_kaohoon.tail(1)
len(df_kaohoon)

df_moneych = pd.read_csv('data/moneychanel_all.csv')
df_moneych['Date'] = pd.to_datetime(df_moneych['Date'], format='%Y-%m-%d')
df_moneych = df_moneych.set_index('Date')
df_moneych = df_moneych[:'2018-2-8']
df_moneych.index = df_moneych.index.date
df_moneych.tail(1)
len(df_moneych)

df_news = pd.concat([df_moneych, df_kaohoon])
'Total:', len(df_news.index)

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-16,TU,20.0,20.5,20.0,20.1,7299300


26331

Unnamed: 0,Ticker,Text
2018-02-08,PTT,ผู้สื่อข่าว รายงาน ว่า ดัชนี ปิด ตลาด วันนี้ ท...


118355

Unnamed: 0,Ticker,Text
2018-02-08,IRPC,เปิด ตลาด ภาค เช้า พฤหัสฯ กพ วอ ลุ่ม ขาย มุ่ง ...


5084

('Total:', 123439)

# Lag & Horizon Construction

In [None]:
N_lags = 3
N_horizon = 1

df_train = []
df_test = []
df_val = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_news.loc[df_news['Ticker'] == stock]
    prev_date = None
    prev_text = None
    
    pbar = tqdm_notebook(total=len(df_stock))
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags < N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']
        
        pbar.update(1)
    pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
#                            'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.80)
    test_size = int(len(news_stocks) * 0.10)
    
    train = news_stocks.iloc[:train_size]
    test = news_stocks.iloc[train_size: train_size + test_size]
    val = news_stocks.iloc[train_size + test_size: ]

    print(stock, ':\t',len(train), len(test), len(val))    
    df_train.append(train)
    df_test.append(test)
    df_val.append(val)


df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)
df_val = pd.concat(df_val, axis=0)

len(df_train), len(df_test), len(df_val) 
df_train.head(1)
df_test.head(1)
df_val.head(1)

In [4]:
# df_train.to_csv('data/df_train_news.csv')
# df_test.to_csv('data/df_test_news.csv')
# df_val.to_csv('data/df_val_news.csv')

In [5]:
df_train = pd.read_csv('data/df_train_news.csv')
df_train['Date'] = pd.to_datetime(df_train['Date'], format='%Y-%m-%d')
df_train = df_train.set_index('Date')

df_test = pd.read_csv('data/df_test_news.csv')
df_test['Date'] = pd.to_datetime(df_test['Date'], format='%Y-%m-%d')
df_test = df_test.set_index('Date')

df_val = pd.read_csv('data/df_val_news.csv')
df_val['Date'] = pd.to_datetime(df_val['Date'], format='%Y-%m-%d')
df_val = df_val.set_index('Date')

In [5]:
# df_train = df_train[df_train['Ticker'] != 'BANPU']
# df_train = df_train[df_train['Ticker'] != 'IRPC']
# df_train = df_train[df_train['Ticker'] != 'BBL']
# df_train = df_train[df_train['Ticker'] != 'KBANK']
# df_train = df_train[df_train['Ticker'] != 'THAI']
# df_train = df_train[df_train['Ticker'] != 'MINT']
# df_train = df_train[df_train['Ticker'] != 'TU']
# df_train = df_train[df_train['Ticker'] != 'CPN']
# df_train = df_train[df_train['Ticker'] != 'CPF']
# df_train = df_train[df_train['Ticker'] != 'HMPRO']
# df_train = df_train[df_train['Ticker'] != 'JAS']
# df_train = df_train[df_train['Ticker'] != 'TRUE']

# target_stocks = ['PTT','SCB','AOT','SCC','CK','CPALL','BH','ADVANC']

In [6]:
df_train = df_train[['Ticker', 'Text', 'Close(t+1)', 'Close(t)', 'Close(t-1)', 'Close(t-2)']]
df_test = df_test[['Ticker', 'Text', 'Close(t+1)', 'Close(t)', 'Close(t-1)', 'Close(t-2)']]
df_val = df_val[['Ticker', 'Text', 'Close(t+1)', 'Close(t)', 'Close(t-1)', 'Close(t-2)']]

In [7]:
df_train.head(1)
df_test.head(1)
df_val.head(1)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Close(t),Close(t-1),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-03,BANPU,โบ รก ชี้ แรง ซื้อ หุ้น น้ำมัน หนุน ดัชนี ขึ้น...,26.25,25.5,24.3,24.3


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Close(t),Close(t-1),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-06-13,BANPU,หลักทรัพย์ ปริมาณ หุ้น ที่ ขาย ซอ ร์ต หุ้น มูล...,18.1,17.9,17.6,17.5


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Close(t),Close(t-1),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-10-07,BANPU,หลักทรัพย์ ปริมาณ หุ้น ที่ ขาย ซอ ร์ต หุ้น มูล...,17.4,17.5,17.6,17.5


In [8]:
# df_train = df_train[df_train['Ticker'] == 'BANPU']
# df_test = df_test[df_test['Ticker'] == 'BANPU']
# df_val = df_val[df_val['Ticker'] == 'BANPU']

In [9]:
df_train.shape
df_test.shape
df_val.shape

(20466, 6)

(2549, 6)

(2577, 6)

In [55]:
# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_test = df_test.drop(['Text'], axis=1)
x_val = df_val.drop(['Text'], axis=1)

# Label Encoding
le = sklearn.preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_val['Ticker'] = le.transform(x_val['Ticker'])

x_train.head(1)
# x_test.head(2)
# x_val.head(2)
print(", ".join(le.classes_))

Unnamed: 0_level_0,Ticker,Close(t+1),Close(t),Close(t-1),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-02-03,2,26.25,25.5,24.3,24.3


ADVANC, AOT, BANPU, BBL, BDMS, BH, CK, CPALL, CPF, CPN, HMPRO, IRPC, JAS, KBANK, MINT, PTT, SCB, SCC, THAI, TRUE, TU


# Create x_train and y_train

In [54]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

y_val = x_val[[Horizon]]
x_val = x_val.drop(['Close(t+1)'], axis=1).copy()
x_val.shape, y_val.shape

((20466, 4), (20466, 1))

((2577, 4), (2577, 1))

# Evaluate Each Stcok

In [12]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [13]:
def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, R2s, DAs = [], [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
#         # Directional Accuracy Pred
        changes = y_pred.reshape(-1,1) -  x_tmp['Close(t)'].values.reshape(-1,1)
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(sklearn.metrics.mean_squared_error(y_tmp, y_pred))
        MAE = sklearn.metrics.mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, np.squeeze(y_pred))
#         R2 = sklearn.metrics.r2_score(y_tmp, y_pred)
        DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
#         R2s.append(R2)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
#     print('mean R2:', round(np.mean(R2s),2))
    print('mean DA:', round(np.mean(DAs),4))

In [14]:
def ensemble_evaluator(df_test):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        # Prediction
        
        y_dt = dt.predict(x_tmp.as_matrix())
        y_rf = rf.predict(x_tmp.as_matrix())
        y_ada = ada.predict(x_tmp.as_matrix())
        y_gb = gb.predict(x_tmp.as_matrix())
        y_xgb = xgb.predict(xgboost.DMatrix(x_tmp))
        y_knn = knn.predict(x_tmp.as_matrix())
        y_bay = bay.predict(x_tmp.as_matrix())

#         y_pred = (y_dt+y_rf+y_ada+y_gb+y_xgb+y_knn+y_bay)/7
        
        y_all = np.concatenate((
            y_dt.reshape(-1,1),
            y_rf.reshape(-1,1),
            y_ada.reshape(-1,1),
            y_gb.reshape(-1,1),
            y_xgb.reshape(-1,1),
            y_knn.reshape(-1,1),
            y_bay.reshape(-1,1)
        ), axis=1)
        y_pred = np.zeros((y_all.shape[0],1))
        for i in range(y_all.shape[1]):
            y_pred += y_all[:,i].reshape(-1,1)*gb_stack.feature_importances_[i]
        y_pred = np.squeeze(y_pred)

        # Directional Accuracy Pred
        changes = y_pred.reshape(-1,1) -  x_tmp['Close(t)'].values.reshape(-1,1)
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(sklearn.metrics.mean_squared_error(y_tmp, y_pred))
        MAE = sklearn.metrics.mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp,  np.squeeze(y_pred))
        DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [85]:
ensemble_evaluator(
    x_test
)

BANPU 	RMSE: 0.23	 MAE: 0.19 	MAPE: 1.09 	DA: 0.49
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.22 	DA: 0.48
PTT 	RMSE: 2.37	 MAE: 1.58 	MAPE: 0.41 	DA: 0.68
BBL 	RMSE: 1.36	 MAE: 1.06 	MAPE: 0.57 	DA: 0.54
KBANK 	RMSE: 2.01	 MAE: 1.58 	MAPE: 0.80 	DA: 0.53
SCB 	RMSE: 1.24	 MAE: 0.99 	MAPE: 0.66 	DA: 0.60
AOT 	RMSE: 0.81	 MAE: 0.59 	MAPE: 1.18 	DA: 0.62
THAI 	RMSE: 0.46	 MAE: 0.34 	MAPE: 1.77 	DA: 0.51
CPF 	RMSE: 0.36	 MAE: 0.28 	MAPE: 1.09 	DA: 0.56
MINT 	RMSE: 0.51	 MAE: 0.41 	MAPE: 1.02 	DA: 0.56
TU 	RMSE: 0.26	 MAE: 0.19 	MAPE: 0.98 	DA: 0.59
SCC 	RMSE: 4.40	 MAE: 3.59 	MAPE: 0.71 	DA: 0.56
CPN 	RMSE: 0.94	 MAE: 0.72 	MAPE: 1.02 	DA: 0.58
CK 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.00 	DA: 0.49
CPALL 	RMSE: 0.54	 MAE: 0.41 	MAPE: 0.65 	DA: 0.46
HMPRO 	RMSE: 0.17	 MAE: 0.11 	MAPE: 1.04 	DA: 0.53
BDMS 	RMSE: 0.23	 MAE: 0.17 	MAPE: 0.86 	DA: 0.53
BH 	RMSE: 2.70	 MAE: 1.90 	MAPE: 0.96 	DA: 0.45
ADVANC 	RMSE: 1.68	 MAE: 1.23 	MAPE: 0.68 	DA: 0.58
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.03 	DA: 0.62
TRUE 	RM

In [13]:
df_describe = pd.concat([y_train.describe(), x_test[['Close(t+1)']].describe(), y_val.describe()], axis=1)
df_describe.columns = ['train', 'test', 'val']
df_describe

Unnamed: 0,train,test,val
count,20466.0,2549.0,2577.0
mean,119.161747,112.913084,120.061855
std,137.589574,135.759812,140.902738
min,2.74,5.0,5.55
25%,19.2,18.6,18.5
50%,41.75,46.0,58.5
75%,181.0,181.0,194.0
max,550.0,526.0,506.0


# Linear Regression

In [15]:
target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
                 'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

In [16]:
regr = sklearn.linear_model.LinearRegression()
regr.fit(x_train, y_train)

evaluator(regr, x_test, le)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.92 	DA: 0.42
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.19 	DA: 0.69
PTT 	RMSE: 2.69	 MAE: 1.96 	MAPE: 0.51 	DA: 0.44
BBL 	RMSE: 1.36	 MAE: 1.11 	MAPE: 0.60 	DA: 0.32
KBANK 	RMSE: 2.18	 MAE: 1.70 	MAPE: 0.85 	DA: 0.39
SCB 	RMSE: 1.42	 MAE: 1.14 	MAPE: 0.75 	DA: 0.37
AOT 	RMSE: 0.88	 MAE: 0.68 	MAPE: 1.35 	DA: 0.28
THAI 	RMSE: 0.44	 MAE: 0.32 	MAPE: 1.68 	DA: 0.52
CPF 	RMSE: 0.38	 MAE: 0.29 	MAPE: 1.13 	DA: 0.56
MINT 	RMSE: 0.49	 MAE: 0.38 	MAPE: 0.95 	DA: 0.61
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.09 	DA: 0.57
SCC 	RMSE: 4.36	 MAE: 3.52 	MAPE: 0.70 	DA: 0.48
CPN 	RMSE: 1.02	 MAE: 0.78 	MAPE: 1.09 	DA: 0.43
CK 	RMSE: 0.33	 MAE: 0.26 	MAPE: 0.93 	DA: 0.40
CPALL 	RMSE: 0.47	 MAE: 0.32 	MAPE: 0.51 	DA: 0.31
HMPRO 	RMSE: 0.17	 MAE: 0.12 	MAPE: 1.07 	DA: 0.66
BDMS 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.82 	DA: 0.38
BH 	RMSE: 2.79	 MAE: 1.81 	MAPE: 0.91 	DA: 0.30
ADVANC 	RMSE: 1.87	 MAE: 1.40 	MAPE: 0.77 	DA: 0.42
JAS 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.19 	DA: 0.52
TRUE 	RM

# Decision Tree Regressor

In [32]:
param_dt = {
    'criterion':['mse'],
    'max_depth':[8,10,12,16,20,24,30],
    'max_features':[None],
    'min_samples_leaf':[1,2,3,4,5,6],
    'min_samples_split':[2,3,4,5,6],
    'random_state': [100]
}
grids = grid_search.ParameterGrid(param_dt)

min_MAPE = 9999999
best_param = {}
dt = None
for param in tqdm_notebook(grids):
    model = tree.DecisionTreeRegressor(**param)
    _=model.fit(x_train, y_train)
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        dt = deepcopy(model)
min_MAPE
best_param
dt
evaluator(dt, x_test, le)




1.1873259217718013

{'criterion': 'mse',
 'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 3,
 'random_state': 100}

DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           presort=False, random_state=100, splitter='best')

BANPU 	RMSE: 0.30	 MAE: 0.25 	MAPE: 1.47 	DA: 0.47
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.46 	DA: 0.48
PTT 	RMSE: 2.99	 MAE: 2.11 	MAPE: 0.55 	DA: 0.55
BBL 	RMSE: 1.63	 MAE: 1.29 	MAPE: 0.70 	DA: 0.48
KBANK 	RMSE: 2.84	 MAE: 2.16 	MAPE: 1.09 	DA: 0.50
SCB 	RMSE: 1.48	 MAE: 1.19 	MAPE: 0.79 	DA: 0.54
AOT 	RMSE: 0.98	 MAE: 0.71 	MAPE: 1.40 	DA: 0.56
THAI 	RMSE: 0.43	 MAE: 0.32 	MAPE: 1.66 	DA: 0.56
CPF 	RMSE: 0.39	 MAE: 0.31 	MAPE: 1.18 	DA: 0.56
MINT 	RMSE: 0.53	 MAE: 0.42 	MAPE: 1.03 	DA: 0.51
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.08 	DA: 0.59
SCC 	RMSE: 5.78	 MAE: 4.71 	MAPE: 0.93 	DA: 0.38
CPN 	RMSE: 1.35	 MAE: 0.96 	MAPE: 1.37 	DA: 0.54
CK 	RMSE: 0.35	 MAE: 0.29 	MAPE: 1.03 	DA: 0.39
CPALL 	RMSE: 0.73	 MAE: 0.54 	MAPE: 0.87 	DA: 0.48
HMPRO 	RMSE: 0.21	 MAE: 0.14 	MAPE: 1.31 	DA: 0.66
BDMS 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.03 	DA: 0.54
BH 	RMSE: 3.02	 MAE: 2.13 	MAPE: 1.07 	DA: 0.52
ADVANC 	RMSE: 2.00	 MAE: 1.51 	MAPE: 0.83 	DA: 0.54
JAS 	RMSE: 0.13	 MAE: 0.10 	MAPE: 1.19 	DA: 0.48
TRUE 	RM

# Random Forest Regrssor

In [33]:
grids = sklearn.grid_search.ParameterGrid({
    'criterion':['mse'],
    'max_depth':[8,10,12,16,20,24,30],
    'max_features':[None],
    'min_samples_leaf':[1,2,3,4],
    'min_samples_split':[2,3,4],
    'random_state': [100],
    'n_jobs':[-1]
})

min_MAPE = 9999999
best_param = {}
rf = None
for param in tqdm_notebook(grids):
    model = ensemble.RandomForestRegressor(**param)
    _=model.fit(x_train, np.squeeze(y_train.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), dt.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        rf = deepcopy(model)
min_MAPE
best_param
rf
evaluator(rf, x_test, le)




1.1873259217718013

{'criterion': 'mse',
 'max_depth': 8,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_jobs': -1,
 'random_state': 100}

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=8,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

BANPU 	RMSE: 0.35	 MAE: 0.29 	MAPE: 1.69 	DA: 0.47
IRPC 	RMSE: 0.16	 MAE: 0.13 	MAPE: 2.34 	DA: 0.40
PTT 	RMSE: 2.81	 MAE: 2.10 	MAPE: 0.55 	DA: 0.59
BBL 	RMSE: 1.55	 MAE: 1.25 	MAPE: 0.68 	DA: 0.45
KBANK 	RMSE: 2.32	 MAE: 1.88 	MAPE: 0.95 	DA: 0.42
SCB 	RMSE: 1.50	 MAE: 1.20 	MAPE: 0.80 	DA: 0.48
AOT 	RMSE: 0.91	 MAE: 0.69 	MAPE: 1.36 	DA: 0.59
THAI 	RMSE: 0.49	 MAE: 0.40 	MAPE: 2.12 	DA: 0.42
CPF 	RMSE: 0.43	 MAE: 0.34 	MAPE: 1.30 	DA: 0.57
MINT 	RMSE: 0.51	 MAE: 0.40 	MAPE: 1.00 	DA: 0.53
TU 	RMSE: 0.31	 MAE: 0.25 	MAPE: 1.31 	DA: 0.62
SCC 	RMSE: 4.91	 MAE: 3.99 	MAPE: 0.79 	DA: 0.45
CPN 	RMSE: 1.13	 MAE: 0.90 	MAPE: 1.28 	DA: 0.59
CK 	RMSE: 0.42	 MAE: 0.35 	MAPE: 1.26 	DA: 0.34
CPALL 	RMSE: 0.77	 MAE: 0.61 	MAPE: 0.97 	DA: 0.50
HMPRO 	RMSE: 0.22	 MAE: 0.17 	MAPE: 1.58 	DA: 0.49
BDMS 	RMSE: 0.29	 MAE: 0.24 	MAPE: 1.23 	DA: 0.41
BH 	RMSE: 2.79	 MAE: 1.86 	MAPE: 0.93 	DA: 0.53
ADVANC 	RMSE: 1.95	 MAE: 1.46 	MAPE: 0.80 	DA: 0.49
JAS 	RMSE: 0.13	 MAE: 0.09 	MAPE: 1.08 	DA: 0.57
TRUE 	RM

# AdaBoost Regressor

In [35]:
grids = sklearn.grid_search.ParameterGrid({
    'base_estimator':[dt],
    'n_estimators':[10,25,50,75,100,150],
    'learning_rate':[1,0.3,0.1],
    'loss':['linear', 'square', 'exponential'],
    'random_state': [100]
})

min_MAPE = 9999999
best_param = {}
ada = None
for param in tqdm_notebook(grids):
    model = ensemble.AdaBoostRegressor(**param)
    _=model.fit(x_train, np.squeeze(y_train.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        ada = deepcopy(model)
min_MAPE
best_param




1.0882075908441693

{'base_estimator': DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best'),
 'learning_rate': 0.1,
 'loss': 'square',
 'n_estimators': 50,
 'random_state': 100}

In [36]:
ada
evaluator(ada, x_test, le)

AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           presort=False, random_state=100, splitter='best'),
         learning_rate=0.1, loss='square', n_estimators=50,
         random_state=100)

BANPU 	RMSE: 0.25	 MAE: 0.20 	MAPE: 1.15 	DA: 0.47
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.29 	DA: 0.44
PTT 	RMSE: 2.71	 MAE: 1.97 	MAPE: 0.51 	DA: 0.60
BBL 	RMSE: 1.44	 MAE: 1.16 	MAPE: 0.63 	DA: 0.52
KBANK 	RMSE: 2.20	 MAE: 1.76 	MAPE: 0.89 	DA: 0.47
SCB 	RMSE: 1.43	 MAE: 1.14 	MAPE: 0.75 	DA: 0.54
AOT 	RMSE: 0.83	 MAE: 0.62 	MAPE: 1.23 	DA: 0.64
THAI 	RMSE: 0.43	 MAE: 0.32 	MAPE: 1.67 	DA: 0.50
CPF 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.12 	DA: 0.57
MINT 	RMSE: 0.51	 MAE: 0.40 	MAPE: 0.99 	DA: 0.47
TU 	RMSE: 0.28	 MAE: 0.20 	MAPE: 1.07 	DA: 0.49
SCC 	RMSE: 4.80	 MAE: 4.04 	MAPE: 0.80 	DA: 0.38
CPN 	RMSE: 1.08	 MAE: 0.81 	MAPE: 1.15 	DA: 0.53
CK 	RMSE: 0.36	 MAE: 0.30 	MAPE: 1.07 	DA: 0.35
CPALL 	RMSE: 0.61	 MAE: 0.43 	MAPE: 0.69 	DA: 0.56
HMPRO 	RMSE: 0.19	 MAE: 0.13 	MAPE: 1.23 	DA: 0.49
BDMS 	RMSE: 0.24	 MAE: 0.17 	MAPE: 0.88 	DA: 0.60
BH 	RMSE: 2.73	 MAE: 1.86 	MAPE: 0.94 	DA: 0.49
ADVANC 	RMSE: 1.91	 MAE: 1.44 	MAPE: 0.79 	DA: 0.54
JAS 	RMSE: 0.12	 MAE: 0.08 	MAPE: 1.02 	DA: 0.54
TRUE 	RM

# Gradient Boosting Regressor

In [39]:
ensemble.GradientBoostingRegressor()

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [40]:
grids = sklearn.grid_search.ParameterGrid({
    'criterion':['friedman_mse', 'mse'],
    'n_estimators':[10,25,50,75,100],
    'learning_rate':[0.5],
    'max_depth':[5,10,15,20,25,30],
    'random_state': [100],
    'presort':[True, False]
})

min_MAPE = 9999999
best_param = {}
gb = None
for param in tqdm_notebook(grids):
    model = ensemble.GradientBoostingRegressor(**param)
    _=model.fit(x_train, np.squeeze(y_train.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        gb = deepcopy(model)
min_MAPE
best_param
gb
evaluator(gb, x_test, le)




1.0583020747143799

{'criterion': 'mse',
 'learning_rate': 0.5,
 'max_depth': 10,
 'n_estimators': 50,
 'presort': False,
 'random_state': 100}

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.5, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort=False, random_state=100,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.26	 MAE: 0.20 	MAPE: 1.15 	DA: 0.55
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.32 	DA: 0.51
PTT 	RMSE: 2.58	 MAE: 1.58 	MAPE: 0.41 	DA: 0.72
BBL 	RMSE: 1.53	 MAE: 1.06 	MAPE: 0.58 	DA: 0.68
KBANK 	RMSE: 1.89	 MAE: 1.42 	MAPE: 0.72 	DA: 0.61
SCB 	RMSE: 1.35	 MAE: 0.95 	MAPE: 0.63 	DA: 0.58
AOT 	RMSE: 0.85	 MAE: 0.59 	MAPE: 1.16 	DA: 0.69
THAI 	RMSE: 0.51	 MAE: 0.37 	MAPE: 1.93 	DA: 0.54
CPF 	RMSE: 0.41	 MAE: 0.31 	MAPE: 1.19 	DA: 0.53
MINT 	RMSE: 0.62	 MAE: 0.46 	MAPE: 1.15 	DA: 0.50
TU 	RMSE: 0.29	 MAE: 0.20 	MAPE: 1.07 	DA: 0.55
SCC 	RMSE: 5.45	 MAE: 4.02 	MAPE: 0.80 	DA: 0.51
CPN 	RMSE: 1.17	 MAE: 0.84 	MAPE: 1.19 	DA: 0.56
CK 	RMSE: 0.35	 MAE: 0.28 	MAPE: 1.00 	DA: 0.44
CPALL 	RMSE: 0.62	 MAE: 0.46 	MAPE: 0.72 	DA: 0.52
HMPRO 	RMSE: 0.19	 MAE: 0.12 	MAPE: 1.09 	DA: 0.54
BDMS 	RMSE: 0.24	 MAE: 0.18 	MAPE: 0.89 	DA: 0.53
BH 	RMSE: 2.87	 MAE: 2.10 	MAPE: 1.06 	DA: 0.42
ADVANC 	RMSE: 1.82	 MAE: 1.13 	MAPE: 0.63 	DA: 0.60
JAS 	RMSE: 0.12	 MAE: 0.09 	MAPE: 1.08 	DA: 0.44
TRUE 	RM

# XGBoost Regressor

```python 
grids = sklearn.grid_search.ParameterGrid({
    'booster':['gblinear'],
    'n_estimators':[25,50,75,100],
    'learning_rate':[0.5, 0.1],
    'max_depth':[4,6,8,10,12],
    'subsample': [0.8, 1],
    'objective': ['reg:linear', 'reg:gamma','rank:pairwise','reg:tweedie'],
    'eval_metric': ['rmse'],
    'nthread': [6]
    'reg_lambda': [0.8],
    'reg_alpha': [0.2],
})
```

In [44]:
d_train = xgboost.DMatrix(x_train, label=y_train)
d_valid = xgboost.DMatrix(x_val, label=y_val)

grids = sklearn.grid_search.ParameterGrid({
    'booster':['gblinear','dart'],
    'n_estimators':[50,75,100],
#     'learning_rate':[0.3, 0.1, 0.05],
    'max_depth':[8,10,12,14,16],
    'subsample': [1],
    'objective': ['reg:linear','reg:tweedie'],
    'eval_metric': ['rmse'],
#     'nthread': [8]
#     'reg_lambda': [0.8],
#     'reg_alpha': [0.2],
})

min_MAPE = 9999999
best_param = {}
xgb = None
for param in tqdm_notebook(grids):
    model = xgboost.train(
        param, 
        d_train, 
        num_boost_round=500, 
        evals=[(d_train, 'train'), (d_valid, 'valid')], 
        early_stopping_rounds=10,
        verbose_eval=False
    )
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(d_valid))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        xgb = deepcopy(model)
min_MAPE
best_param
evaluator(xgb, x_test, le, isXGB=True)




1.104463241995271

{'booster': 'dart',
 'eval_metric': 'rmse',
 'max_depth': 12,
 'n_estimators': 50,
 'objective': 'reg:linear',
 'subsample': 1}

BANPU 	RMSE: 0.25	 MAE: 0.21 	MAPE: 1.20 	DA: 0.53
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.23 	DA: 0.54
PTT 	RMSE: 2.68	 MAE: 1.79 	MAPE: 0.46 	DA: 0.63
BBL 	RMSE: 1.44	 MAE: 1.13 	MAPE: 0.62 	DA: 0.65
KBANK 	RMSE: 2.00	 MAE: 1.58 	MAPE: 0.80 	DA: 0.56
SCB 	RMSE: 1.40	 MAE: 1.11 	MAPE: 0.73 	DA: 0.55
AOT 	RMSE: 0.79	 MAE: 0.61 	MAPE: 1.21 	DA: 0.58
THAI 	RMSE: 0.46	 MAE: 0.34 	MAPE: 1.82 	DA: 0.52
CPF 	RMSE: 0.39	 MAE: 0.31 	MAPE: 1.18 	DA: 0.50
MINT 	RMSE: 0.52	 MAE: 0.40 	MAPE: 0.98 	DA: 0.58
TU 	RMSE: 0.29	 MAE: 0.21 	MAPE: 1.12 	DA: 0.55
SCC 	RMSE: 4.65	 MAE: 3.66 	MAPE: 0.73 	DA: 0.55
CPN 	RMSE: 1.14	 MAE: 0.88 	MAPE: 1.25 	DA: 0.57
CK 	RMSE: 0.34	 MAE: 0.29 	MAPE: 1.02 	DA: 0.48
CPALL 	RMSE: 0.63	 MAE: 0.49 	MAPE: 0.77 	DA: 0.47
HMPRO 	RMSE: 0.21	 MAE: 0.14 	MAPE: 1.25 	DA: 0.54
BDMS 	RMSE: 0.23	 MAE: 0.18 	MAPE: 0.88 	DA: 0.45
BH 	RMSE: 2.93	 MAE: 2.12 	MAPE: 1.06 	DA: 0.44
ADVANC 	RMSE: 1.94	 MAE: 1.40 	MAPE: 0.77 	DA: 0.52
JAS 	RMSE: 0.12	 MAE: 0.09 	MAPE: 1.14 	DA: 0.48
TRUE 	RM

In [None]:
# xgboost.plot_importance(xgb, height=0.3)

# KNeighbors

In [21]:
grids = sklearn.grid_search.ParameterGrid({
    'algorithm': ['ball_tree'],
    'n_neighbors': [1,3,5,7,9,11,13,15],
    'n_jobs':[-1]
})

min_MAPE = 9999999
best_param = {}
knn = None
for param in tqdm_notebook(grids):
    model = sklearn.neighbors.KNeighborsRegressor(**param)
    _=model.fit(x_train, np.squeeze(y_train.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        knn = deepcopy(model)
min_MAPE
best_param
knn
evaluator(knn, x_test, le)




1.3172243561583186

{'algorithm': 'ball_tree', 'n_jobs': -1, 'n_neighbors': 1}

KNeighborsRegressor(algorithm='ball_tree', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
          weights='uniform')

PTT 	RMSE: 2.84	 MAE: 1.40 	MAPE: 0.37 	DA: 0.77

mean RMSE: 2.84
mean MAE: 1.4
mean MAPE: 0.37
mean DA: 0.7682


# Bayesian Ridge

In [16]:
grids = sklearn.grid_search.ParameterGrid({
    'n_iter':[100,300,600],
})

min_MAPE = 9999999
best_param = {}
bay = None
for param in tqdm_notebook(grids):
    model = sklearn.linear_model.BayesianRidge(**param)
    _=model.fit(x_train, np.squeeze(y_train.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(x_val))
    if MAPE < min_MAPE:
        min_MAPE = deepcopy(MAPE)
        best_param = deepcopy(param)
        bay = deepcopy(model)
min_MAPE
best_param
bay
evaluator(bay, x_test, le)




1.0640612328514238

{'n_iter': 100}

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=100,
       normalize=False, tol=0.001, verbose=False)

BANPU 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.92 	DA: 0.42
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.19 	DA: 0.69
PTT 	RMSE: 2.68	 MAE: 1.96 	MAPE: 0.51 	DA: 0.44
BBL 	RMSE: 1.36	 MAE: 1.11 	MAPE: 0.60 	DA: 0.32
KBANK 	RMSE: 2.18	 MAE: 1.70 	MAPE: 0.85 	DA: 0.38
SCB 	RMSE: 1.42	 MAE: 1.14 	MAPE: 0.75 	DA: 0.37
AOT 	RMSE: 0.88	 MAE: 0.68 	MAPE: 1.35 	DA: 0.28
THAI 	RMSE: 0.44	 MAE: 0.32 	MAPE: 1.68 	DA: 0.52
CPF 	RMSE: 0.38	 MAE: 0.29 	MAPE: 1.13 	DA: 0.56
MINT 	RMSE: 0.49	 MAE: 0.38 	MAPE: 0.95 	DA: 0.61
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.09 	DA: 0.57
SCC 	RMSE: 4.36	 MAE: 3.52 	MAPE: 0.70 	DA: 0.48
CPN 	RMSE: 1.02	 MAE: 0.78 	MAPE: 1.09 	DA: 0.42
CK 	RMSE: 0.33	 MAE: 0.26 	MAPE: 0.93 	DA: 0.40
CPALL 	RMSE: 0.47	 MAE: 0.32 	MAPE: 0.51 	DA: 0.31
HMPRO 	RMSE: 0.17	 MAE: 0.12 	MAPE: 1.07 	DA: 0.66
BDMS 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.82 	DA: 0.38
BH 	RMSE: 2.79	 MAE: 1.81 	MAPE: 0.91 	DA: 0.30
ADVANC 	RMSE: 1.87	 MAE: 1.40 	MAPE: 0.77 	DA: 0.42
JAS 	RMSE: 0.12	 MAE: 0.10 	MAPE: 1.19 	DA: 0.52
TRUE 	RM

In [140]:
target_stocks = ['THAI']


clf = regr
isXGB = False

MAPEs = []
for stock in target_stocks:
    x_tmp = x_test.loc[x_test['Ticker'] == le.transform([stock])[0]].copy()
    x_tmp = x_tmp.sort_index()
    y_tmp = x_tmp[Horizon].values
    
    x_tmp['MA5'] = x_tmp['Close(t)'].rolling(window=5).mean()
    x_tmp['MA3'] = x_tmp['Close(t)'].rolling(window=3).mean()
    x_tmp['MA1'] = x_tmp['Close(t)'].rolling(window=1).mean()
    
    x_tmp = x_tmp.fillna(method='bfill')

    changes = x_tmp[Horizon] -  x_tmp['Close(t)']
    y_true_da = []
    for change in changes:
        y_true_da.append(1 if change >= 0 else 0)

    y_regr = clf.predict(x_tmp.drop(['Close(t+1)', 'MA1', 'MA3', 'MA5'], axis=1))

    y_pred = x_tmp['MA5'].values.reshape(-1,1)

    changes = y_pred.reshape(-1,1) - x_tmp['Close(t)'].values.reshape(-1,1)
    y_pred_da = []
    for change in changes:
        y_pred_da.append(1 if change >= 0 else 0)

    MAPE = mean_absolute_percentage_error(y_tmp, np.squeeze(y_pred))
    DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)

    print(stock, "\tMAPE: %.2f \tDA: %.2f" % (MAPE, DA))

THAI 	MAPE: 2.31 	DA: 0.50


In [137]:
y_regr = np.squeeze(y_regr)

In [144]:
trace0 = go.Scatter(
    x = x_tmp.index,
    y = y_tmp,
    mode='lines+markers',
    name = 'Actual',
    line = dict(
#         color = ('rgb(205, 12, 24)'),
        color = ('rgb(22, 96, 167)'),
        width = 2)
)
trace1 = go.Scatter(
    x = x_tmp.index,
    y = y_regr,
    mode='lines+markers',
    name = 'Baseline',
    line = dict(
#         color = ('rgb(22, 96, 167)'),
        dash = 'dash',
        width = 2)
)

trace2 = go.Scatter(
    x = x_tmp.index,
    y = x_tmp['MA5'],
    mode='lines',
    name = 'MA5',
    line = dict(
#         color = ('rgb(22, 96, 167)'),
        dash = 'dot',
        shape='spline',
        width = 1.7)
)

trace3 = go.Scatter(
    x = x_tmp.index,
    y = x_tmp['MA3'],
    mode='lines',
    name = 'MA3',
    line = dict(
#         color = ('rgb(22, 96, 167)'),
        dash = 'dot',
        shape='spline',
        width = 1.7)
)

trace4 = go.Scatter(
    x = x_tmp.index,
    y = x_tmp['MA1'],
    mode='lines',
    name = 'MA1',
    line = dict(
#         color = ('rgb(22, 96, 167)'),
        dash = 'dot',
        shape='spline',
        width = 1.7
    )
)

trace5 = go.Scatter(
    x = x_tmp.index,
    y = y_ensemble,
    mode='lines+markers',
    name = 'Ensemble',
    line = dict(
        color = ('rgb(22, 96, 167)'),
        width = 1.7
    )
)

data = [trace0, trace2, trace3, trace4]

# Edit the layout
layout = dict(title = '',
              xaxis = dict(title = 'Date'),
              yaxis = dict(title = 'THAI: Stock Price (Baht)'),
              font=dict(size=16),
              height=800,
              width=1600
             )
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [131]:
# THAI
y_ensemble = [18.65686081, 18.59490353, 18.99476844, 19.91732867, 19.89860269,
       19.49855347, 19.49716636, 19.20848431, 19.51467055, 19.5036313 ,
       20.32836417, 20.38392905, 20.2727162 , 20.25953937, 20.40871005,
       20.65961292, 20.53987603, 21.37565549, 21.26743029, 21.49266367,
       21.2012527 , 21.30908291, 21.35185617, 21.30492923, 20.07994631,
       19.83040921, 20.28330966, 19.72363869, 19.35484223, 19.30332741,
       19.20888462, 19.13798367, 19.13798367, 19.41597093, 19.40334324,
       19.74779092, 19.64709672, 19.6332697 , 19.46672466, 19.87120507,
       19.94755775, 20.03441437, 20.02876651, 19.52267065, 19.49839693,
       19.74512779, 19.44507747, 18.91885666, 18.92324108, 18.91465264,
       18.72524137, 18.79779986, 18.76431772, 18.68834958, 18.680613  ,
       18.60573295, 18.92006402, 18.82297524, 19.08170694, 18.95410351,
       19.0205283 , 18.97474323, 18.86238688, 18.62642265, 18.71360639,
       18.62388315, 18.65936432, 17.64675391, 17.08595563, 17.16744556,
       16.7762114 , 16.70600056, 16.76421064, 16.77768497, 16.74223496,
       16.84002283, 16.92019228, 16.91174473, 16.81739935, 16.7448841 ,
       17.18438366, 17.18438366, 17.28362644, 17.2879184 , 17.18850329,
       18.04652012, 17.85143492, 17.84175554, 17.88860441, 17.87615403,
       17.37068962, 17.62893528, 17.55320173, 18.28221771, 18.17569867,
       18.34674071, 18.22991763, 18.23085596, 18.47246414, 18.45476908,
       18.45476908, 19.09142893, 18.82590941, 18.87301484, 18.95887807,
       18.68180209]

In [86]:
# PTT
y_ensemble = [392.69693141, 393.41893551, 392.18058725, 391.97888938,
       390.59441308, 390.55397275, 390.55958894, 389.7176166 ,
       389.97441252, 387.66788834, 387.60394887, 387.79886985,
       387.81471613, 388.84371271, 388.72819374, 388.84390474,
       388.64475558, 388.72419785, 389.99140307, 389.81678123,
       389.776041  , 389.2333899 , 389.18317786, 386.90162369,
       386.90274549, 386.88173874, 385.52439524, 382.19210428,
       382.06313232, 387.15259969, 392.08333113, 388.31681254,
       392.77235678, 393.10488289, 393.09917092, 393.0880607 ,
       393.06623396, 393.33572466, 393.49458621, 395.10459272,
       394.31817443, 393.53791241, 393.52283607, 394.26362443,
       394.22012314, 392.20168805, 392.15808744, 393.42622221,
       390.35856499, 391.93977067, 391.98631589, 383.13344356,
       385.84241997, 385.92737566, 384.98979291, 385.09731556,
       382.70113095, 384.08929127, 384.15572737, 381.64281901,
       385.52864651, 383.38441693, 380.79872375, 379.83433066,
       380.47034704, 384.07876601, 376.78176609, 378.63856599,
       379.91240641, 379.82435767, 379.24112833, 380.42349433,
       380.40910041, 376.65077522, 377.07684036, 375.37260501,
       370.61520608, 370.75602101, 374.00709286, 374.1059552 ,
       374.20679303, 373.99491   , 373.97648314, 370.20436391,
       374.90214301, 374.98758043, 375.04673994, 374.75917438,
       373.79383044, 373.34862822, 376.18522401, 374.49394623,
       374.45354121, 374.63893812, 374.51661709, 375.88124214,
       380.05844033, 379.93954775, 378.06067254, 377.90025498,
       378.73680619, 385.59496098, 386.89729997, 388.33139907,
       388.50809835, 387.65863944, 387.53641219, 385.68720627,
       385.58241181, 381.88629641, 381.79920867, 380.94192688,
       380.90905199, 381.75540968, 381.66290714, 381.70297237,
       382.37886197, 382.58278611, 384.46322115, 382.43996984,
       381.97947315, 382.00447473, 381.98652794, 384.48928423,
       383.70362499, 385.55866212, 390.01011597, 389.89414866,
       392.95068717, 391.90061781, 394.17593966, 393.4257888 ,
       394.30614511, 394.26834773, 395.64901066, 394.28760002,
       394.50581045, 400.33437083, 401.01079839, 400.98594143,
       398.70303431, 398.78294306, 400.13243975, 399.99488741,
       398.35126001, 399.73650573, 399.66806858, 397.46737248,
       397.5112825 , 406.26228419, 405.16008814]

# Stacking Ensemble

In [74]:
stack = np.concatenate((
    dt.predict(x_val).reshape(-1,1),
    rf.predict(x_val).reshape(-1,1),
    ada.predict(x_val).reshape(-1,1),
    gb.predict(x_val).reshape(-1,1),
    xgb.predict(d_valid).reshape(-1,1),
    knn.predict(x_val).reshape(-1,1),
    bay.predict(x_val).reshape(-1,1),
), axis=1)

stack.shape

(2577, 7)

In [77]:
params = {
    'criterion':['friedman_mse'],
    'n_estimators':[50,75,100],
    'learning_rate':[0.1],
    'max_depth':[10,15,20,25],
    'random_state': [100],
    'presort':[True, False]
}

grid = sklearn.model_selection.GridSearchCV(
    ensemble.GradientBoostingRegressor(),
    params,
    n_jobs=-1
)
grid.fit(stack, np.squeeze(y_val.values))
grid.best_score_
grid.best_params_

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'criterion': ['friedman_mse'], 'n_estimators': [50, 75, 100], 'max_depth': [10, 15, 20, 25], 'learning_rate': [0.1], 'random_state': [100], 'presort': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

0.9918595068033216

{'criterion': 'friedman_mse',
 'learning_rate': 0.1,
 'max_depth': 10,
 'n_estimators': 50,
 'presort': False,
 'random_state': 100}

In [78]:
gb_stack = ensemble.GradientBoostingRegressor(**grid.best_params_)
gb_stack.fit(stack, np.squeeze(y_val.values))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=10, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort=False, random_state=100,
             subsample=1.0, verbose=0, warm_start=False)

In [79]:
models = ['dt', 'rf', 'ada', 'gb', 'xgb', 'knn', 'bay']

In [80]:
for model, weight in zip(models, gb_stack.feature_importances_):
    print(round(weight,4), model)

0.0797 dt
0.1311 rf
0.1108 ada
0.1613 gb
0.1014 xgb
0.2145 knn
0.2011 bay


# LSTM

In [None]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, LSTM

In [None]:
# train_X = x_train.values
# val_X = x_valid.values
# # test_X = x_test.values

# train_y = y_train.values
# val_y = y_valid.values
# # test_y = y_test.values

# train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
# val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], 1)
# # test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

# print(train_X.shape, train_y.shape)
# print(val_X.shape, val_y.shape)
# # print(test_X.shape, test_y.shape)

In [None]:
# model = Sequential()
# model.add(LSTM(64, return_sequences=True, input_shape=(train_X.shape[1], 1)))
# model.add(Dropout(0.3))
# model.add(LSTM(128, return_sequences=True))
# model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# # model.add(Dropout(0.4))
# model.add(LSTM(128))
# model.add(Dense(1))
# model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

# checkpoint = ModelCheckpoint(filepath="model/LSTM.h5", 
#                              monitor='val_loss', 
#                              verbose=1, 
#                              save_best_only=True, 
#                              save_weights_only=False, 
#                              mode='auto', 
#                              period=1
#                             )

# earlystopping = EarlyStopping(monitor='val_loss', 
#                       min_delta=0, 
#                       patience=100,
#                       verbose=1, 
#                       mode='auto')

# # access via $ tensorboard --logdir=./logs
# tensorboard = TensorBoard(log_dir='./logs')

In [None]:
# model.fit(x=train_X, 
#           y=train_y,
#           epochs=1000,
#           batch_size=32,
#           validation_data=(val_X, val_y),
#           verbose=1,
#           shuffle=False,
#           callbacks=[checkpoint, earlystopping, tensorboard]
#          )

In [None]:
# evaluator(model, x_test, le, isLSTM=True)

In [None]:
# from keras.layers import Bidirectional

In [None]:
# model = Sequential()
# model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(2, train_X.shape[2])))
# model.add(Dropout(0.2))
# model.add(Bidirectional(LSTM(128, return_sequences=True)))
# model.add(Dropout(0.3))
# model.add(Bidirectional(LSTM(128, return_sequences=True)))
# model.add(Dropout(0.4))
# model.add(Bidirectional(LSTM(128)))
# model.add(Dense(1))
# model.compile(loss='mean_absolute_error', optimizer='adam')

In [None]:
# model.fit(x=train_X, 
#           y=train_y,
#           epochs=10000,
#           batch_size=1024,
#           validation_data=(val_X, val_y),
#           verbose=1,
#           shuffle=False,
#           callbacks=[checkpoint, earlystopping, tensorboard]
#          )

In [None]:
# evaluator(model, x_test, le, isLSTM=True)

In [None]:
# data_dim = 16
# timesteps = 8
# num_classes = 10
# np.random.random((10, timesteps, data_dim)).shape

In [None]:
# train_X = x_train.values
# val_X = x_valid.values
# train_X.shape

# train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
# train_X.shape