# Regression with news (all) with price + text

In [1]:
# Notebook display setup.
# `display`/`HTML` are imported from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated in newer IPython releases.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Echo every top-level expression of a cell, not only the last one.
# Cells below rely on this (and use `_ = ...` to silence unwanted echoes).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Initialise plotly for inline rendering in the notebook.
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [3]:
import numpy as np
import pandas as pd
import os
import copy
import sklearn
import xgboost
import plotly.graph_objs as go
import matplotlib.pyplot as plt

from pythainlp.corpus import stopwords
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
from sklearn import tree, ensemble, svm, grid_search, decomposition, cluster
from copy import deepcopy

% matplotlib inline

# Tickers studied throughout the notebook; the order drives per-stock processing
# and the order of the printed per-stock results.
target_stocks = [
    'BANPU', 'IRPC', 'PTT', 'BBL', 'KBANK', 'SCB', 'AOT',
    'THAI', 'CPF', 'MINT', 'TU', 'SCC', 'CPN', 'CK',
    'CPALL', 'HMPRO', 'BDMS', 'BH', 'ADVANC', 'JAS', 'TRUE',
]

In [5]:
# --- Prices -----------------------------------------------------------------
# Daily price table, restricted to the tickers under study and indexed by
# plain `datetime.date` objects.
df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
# .copy() makes the filtered frame independent of the raw table, so the column
# assignment on the next line does not write into a slice (SettingWithCopyWarning).
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)].copy()
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(1)
len(df_price)

# --- News -------------------------------------------------------------------
# Last news date kept (label-based slice on the parsed Date index).
NEWS_CUTOFF = '2018-2-8'


def _load_news(csv_path, cutoff=NEWS_CUTOFF):
    """Read one news CSV, keep rows up to `cutoff`, and index it by `datetime.date`.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file that has a 'Date' column formatted as YYYY-MM-DD.
    cutoff : str
        Upper bound of the label slice applied to the parsed Date index.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with the index converted from timestamps to dates.
    """
    news = pd.read_csv(csv_path)
    news['Date'] = pd.to_datetime(news['Date'], format='%Y-%m-%d')
    news = news.set_index('Date')
    news = news[:cutoff]
    news.index = news.index.date
    return news


df_kaohoon = _load_news('data/kaohoon_all.csv')
df_kaohoon.tail(1)
len(df_kaohoon)

df_moneych = _load_news('data/moneychanel_all.csv')
df_moneych.tail(1)
len(df_moneych)

# Both sources stacked as-is (moneych rows first, then kaohoon rows); no re-sorting.
df_news = pd.concat([df_moneych, df_kaohoon])
'Total:', len(df_news.index)

# Lag & Horizon Construction

In [4]:
N_lags = 3      # trading days of price history attached to each sample (t, t-1, t-2)
N_horizon = 1   # future trading days whose close price is recorded as target(s)

TRAIN_FRACTION = 0.80   # leading share of each stock's samples used for training
TEST_FRACTION = 0.10    # next share used as "test"; the remainder becomes "val"
CLOSE_POS = 4           # position of the close price inside a df_price row
ONE_DAY = timedelta(days=1)


def _trading_day_rows(stock_prices, price_range, start_date, step, n_rows):
    """Collect `n_rows` price rows of one ticker by walking the calendar day by day.

    Parameters
    ----------
    stock_prices : pandas.DataFrame
        Price rows of a single ticker, indexed by date.
    price_range : tuple or None
        (first_date, last_date) covered by `stock_prices`; None when it is empty.
    start_date : datetime.date
        First calendar day to inspect.
    step : datetime.timedelta
        +1 day to walk forward (horizon) or -1 day to walk backward (lags).
    n_rows : int
        Number of days-with-a-price-row to collect; days without a row are skipped.

    Returns
    -------
    list or None
        Row value arrays, nearest day first. None when the walk runs past the end
        of the available price data in the walking direction before `n_rows` rows
        are found (the unbounded walk would otherwise never terminate).
    """
    rows = []
    day = start_date
    walking_forward = step > timedelta(0)
    while len(rows) < n_rows:
        if price_range is None:
            return None
        first_day, last_day = price_range
        if (walking_forward and day > last_day) or (not walking_forward and day < first_day):
            return None
        matches = stock_prices.loc[stock_prices.index == day].values
        day += step
        if len(matches) == 0:
            continue  # no price row for this calendar day
        rows.append(matches[0])
    return rows


df_train = []
df_test = []
df_val = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_news.loc[df_news['Ticker'] == stock]
    # Filter the price table once per stock instead of rescanning it for every lookup.
    stock_prices = df_price.loc[df_price['Ticker'] == stock]
    if len(stock_prices) > 0:
        price_range = (min(stock_prices.index), max(stock_prices.index))
    else:
        price_range = None
    prev_date = None
    prev_text = None

    pbar = tqdm_notebook(total=len(df_stock))
    # Consecutive rows sharing a date are merged into one text; a sample for a date
    # is emitted only when a row with a different date follows, so the final date
    # group of each stock is never emitted.
    # NOTE(review): df_news is a plain concatenation of two sources; if it is not
    # date-sorted, one date can produce several samples and the positional split
    # below is not strictly chronological -- confirm this is intended.
    for date, row in df_stock.iterrows():
        if prev_date is None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon: next N_horizon days with a price row after the news date
            horizon_rows = _trading_day_rows(
                stock_prices, price_range, prev_date + ONE_DAY, ONE_DAY, N_horizon)
            # lag: the news date itself and earlier days, N_lags rows in total
            lag_rows = _trading_day_rows(
                stock_prices, price_range, prev_date, -ONE_DAY, N_lags)

            # Samples whose window falls outside the available price data are skipped.
            if horizon_rows is not None and lag_rows is not None:
                prices = [price_row[CLOSE_POS] for price_row in horizon_rows]  # Close price next day(s)
                for price_row in lag_rows:
                    # every non-string value except the row's last column
                    prices.extend(val for val in price_row[:-1] if type(val) != str)
                news_stocks.append([prev_date, stock, prev_text] + prices)

            prev_date = date
            prev_text = row['Text']
        else:
            prev_text += ' ' + row['Text']

        pbar.update(1)
    pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    # This list must match N_horizon / N_lags above; extend it when they change.
    news_stocks.columns = ['Date', 'Ticker', 'Text',
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
#                            'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')

    # Positional split per stock: first 80% train, next 10% test, remainder val.
    train_size = int(len(news_stocks) * TRAIN_FRACTION)
    test_size = int(len(news_stocks) * TEST_FRACTION)

    train = news_stocks.iloc[:train_size]
    test = news_stocks.iloc[train_size: train_size + test_size]
    val = news_stocks.iloc[train_size + test_size: ]

    print(stock, ':\t',len(train), len(test), len(val))
    df_train.append(train)
    df_test.append(test)
    df_val.append(val)


# Stack the per-stock splits into one frame per split.
df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)
df_val = pd.concat(df_val, axis=0)

len(df_train), len(df_test), len(df_val)
df_train.head(1)
df_test.head(1)
df_val.head(1)

BANPU :	 960 120 120


IRPC :	 979 122 123


PTT :	 1211 151 152


BBL :	 1031 128 130


KBANK :	 1116 139 141


SCB :	 1126 140 142


AOT :	 1080 135 136


THAI :	 855 106 108


CPF :	 954 119 120


MINT :	 954 119 120


TU :	 690 86 87


SCC :	 1043 130 131


CPN :	 883 110 111


CK :	 996 124 125


CPALL :	 987 123 124


HMPRO :	 912 114 115


BDMS :	 887 110 112


BH :	 861 107 109


ADVANC :	 1138 142 143


JAS :	 846 105 107


TRUE :	 957 119 121



(20466, 2549, 2577)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2015-02-03,BANPU,โบ รก ชี้ แรง ซื้อ หุ้น น้ำมัน หนุน ดัชนี ขึ้น...,26.25,24.4,25.5,24.4,25.5,24.2,24.3,24.1,24.3,24.1,24.4,24.0,24.3


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-06-13,BANPU,หลักทรัพย์ ปริมาณ หุ้น ที่ ขาย ซอ ร์ต หุ้น มูล...,18.1,17.6,17.9,17.6,17.9,17.6,17.7,17.5,17.6,17.5,17.9,17.4,17.5


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-10-07,BANPU,หลักทรัพย์ ปริมาณ หุ้น ที่ ขาย ซอ ร์ต หุ้น มูล...,17.4,17.6,17.6,17.3,17.5,17.5,17.6,17.4,17.6,17.6,17.7,17.3,17.5


In [6]:
# Persist the freshly built splits, but only when no cached copy exists yet.
# The following cell reloads the splits from these files, so on a fresh checkout they
# must be created first; existing files are never overwritten.
for split_path, split_frame in (('data/df_train_news.csv', df_train),
                                ('data/df_test_news.csv', df_test),
                                ('data/df_val_news.csv', df_val)):
    if not os.path.exists(split_path):
        split_frame.to_csv(split_path)

In [7]:
def _load_split(csv_path):
    """Read one persisted split and index it by its parsed 'Date' column."""
    frame = pd.read_csv(csv_path)
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    return frame.set_index('Date')


# Reload the three splits from disk; the index becomes a timestamp index.
df_train = _load_split('data/df_train_news.csv')
df_test = _load_split('data/df_test_news.csv')
df_val = _load_split('data/df_val_news.csv')

In [7]:
# df_train = df_train[df_train['Ticker'] != 'BANPU']
# df_train = df_train[df_train['Ticker'] != 'IRPC']
# df_train = df_train[df_train['Ticker'] != 'BBL']
# df_train = df_train[df_train['Ticker'] != 'KBANK']
# df_train = df_train[df_train['Ticker'] != 'THAI']
# df_train = df_train[df_train['Ticker'] != 'MINT']
# df_train = df_train[df_train['Ticker'] != 'TU']
# df_train = df_train[df_train['Ticker'] != 'CPN']
# df_train = df_train[df_train['Ticker'] != 'CPF']
# df_train = df_train[df_train['Ticker'] != 'HMPRO']
# df_train = df_train[df_train['Ticker'] != 'JAS']
# df_train = df_train[df_train['Ticker'] != 'TRUE']

# df_test = df_test[df_test['Ticker'] != 'BANPU']
# df_test = df_test[df_test['Ticker'] != 'IRPC']
# df_test = df_test[df_test['Ticker'] != 'BBL']
# df_test = df_test[df_test['Ticker'] != 'KBANK']
# df_test = df_test[df_test['Ticker'] != 'THAI']
# df_test = df_test[df_test['Ticker'] != 'MINT']
# df_test = df_test[df_test['Ticker'] != 'TU']
# df_test = df_test[df_test['Ticker'] != 'CPN']
# df_test = df_test[df_test['Ticker'] != 'CPF']
# df_test = df_test[df_test['Ticker'] != 'HMPRO']
# df_test = df_test[df_test['Ticker'] != 'JAS']
# df_test = df_test[df_test['Ticker'] != 'TRUE']

# df_val = df_val[df_val['Ticker'] != 'BANPU']
# df_val = df_val[df_val['Ticker'] != 'IRPC']
# df_val = df_val[df_val['Ticker'] != 'BBL']
# df_val = df_val[df_val['Ticker'] != 'KBANK']
# df_val = df_val[df_val['Ticker'] != 'THAI']
# df_val = df_val[df_val['Ticker'] != 'MINT']
# df_val = df_val[df_val['Ticker'] != 'TU']
# df_val = df_val[df_val['Ticker'] != 'CPN']
# df_val = df_val[df_val['Ticker'] != 'CPF']
# df_val = df_val[df_val['Ticker'] != 'HMPRO']
# df_val = df_val[df_val['Ticker'] != 'JAS']
# df_val = df_val[df_val['Ticker'] != 'TRUE']

# target_stocks = ['PTT','SCB','AOT','SCC','CK','CPALL','BH','ADVANC']

In [8]:
# Keep the ticker, the news text, the target and the three lagged closes;
# the open/high/low columns are dropped from every split.
model_columns = ['Ticker', 'Text', 'Close(t+1)', 'Close(t)', 'Close(t-1)', 'Close(t-2)']
df_train, df_test, df_val = (split[model_columns] for split in (df_train, df_test, df_val))

In [9]:
# Sanity check: (rows, columns) of each split after column selection.
# Each bare expression is echoed separately because of ast_node_interactivity = "all".
df_train.shape
df_test.shape
df_val.shape

(20466, 6)

(2549, 6)

(2577, 6)

# TF-IDF Vectorization

In [10]:
# Thai shtopwords
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

**max_df** is used for removing terms that appear too frequently, also known as "corpus-specific stop words".

**min_df** is used for removing terms that appear too infrequently

In [11]:
# TF-IDF vectorizer over the news text, using the stop-word list built above.
# Float thresholds are document-frequency proportions: terms in more than 50% of
# documents (max_df) or in fewer than 3% of documents (min_df) are ignored.
vertorizer = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words=stop_words,
    max_df=0.50,  
    min_df=0.03,
)

In [12]:
# Learn the vocabulary on the training text only, then project test/val onto it.
tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])
tfidf_val = vertorizer.transform(df_val['Text'])


def _dense_tfidf_frame(sparse_matrix, like):
    """Densify a TF-IDF matrix into a DataFrame that carries the index of `like`."""
    dense = pd.DataFrame.from_records(sparse_matrix.toarray())
    return dense.set_index(like.index)


df_tfidf_train = _dense_tfidf_frame(tfidf_train, df_train)
df_tfidf_test = _dense_tfidf_frame(tfidf_test, df_test)
df_tfidf_val = _dense_tfidf_frame(tfidf_val, df_val)

df_tfidf_train.shape, df_tfidf_test.shape, df_tfidf_val.shape


def _swap_text_for_tfidf(frame, tfidf_frame):
    """Drop the raw 'Text' column and append the TF-IDF columns side by side."""
    without_text = frame.drop(['Text'], axis=1)
    return pd.concat([without_text, tfidf_frame], axis=1)


# replace Text with TF-IDF vector
x_train = _swap_text_for_tfidf(df_train, df_tfidf_train)
x_test = _swap_text_for_tfidf(df_test, df_tfidf_test)
x_val = _swap_text_for_tfidf(df_val, df_tfidf_val)

# Label Encoding: tickers become integer codes, fitted on the training split only.
le = sklearn.preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_val['Ticker'] = le.transform(x_val['Ticker'])
le.classes_

((20466, 1019), (2549, 1019), (2577, 1019))

array(['ADVANC', 'AOT', 'BANPU', 'BBL', 'BDMS', 'BH', 'CK', 'CPALL',
       'CPF', 'CPN', 'HMPRO', 'IRPC', 'JAS', 'KBANK', 'MINT', 'PTT',
       'SCB', 'SCC', 'THAI', 'TRUE', 'TU'], dtype=object)

# Create x_train and y_train

In [13]:
# Name of the target column. It is used both to select the target and to drop it from
# the features, so changing it cannot leave the target inside the feature matrix.
Horizon = 'Close(t+1)'

# Training split: y is a one-column DataFrame, x keeps every remaining column.
y_train = x_train[[Horizon]]
x_train = x_train.drop([Horizon], axis=1).copy()
x_train.shape, y_train.shape

# Validation split, handled the same way.
# x_test is left untouched here and still contains the target column.
y_val = x_val[[Horizon]]
x_val = x_val.drop([Horizon], axis=1).copy()
x_val.shape, y_val.shape

((20466, 1023), (20466, 1))

((2577, 1023), (2577, 1))

# Feature Selection

In [14]:
# sel = sklearn.feature_selection.VarianceThreshold(threshold=(.999 * (1 - .999)))
# sel.fit_transform(x_train).shape

In [15]:
# sel = sklearn.feature_selection.SelectFromModel(regr, prefit=True)
# x_new = sel.transform(x_train)
# x_new

In [16]:
# Feature agglomeration: merge the feature columns into 20 clusters.
# fit_transform fits `ward` on the training features; only the resulting shape is shown.
ward = sklearn.cluster.FeatureAgglomeration(n_clusters=20)
ward.fit_transform(x_train).shape

(20466, 20)

In [17]:
# PCA with 20 components, fitted on the training features; only the resulting shape is shown.
# NOTE(review): x_train mixes raw close prices, an integer ticker code and TF-IDF weights
# without scaling, so the leading components are presumably dominated by the price
# columns -- confirm this is intended.
pca = sklearn.decomposition.PCA(20, random_state=100)
pca.fit_transform(x_train).shape

(20466, 20)

## Apply reduction

In [18]:
# Reduced feature matrix = agglomerated features followed by the PCA components,
# placed side by side (column-wise) for the training and validation splits.
# Alternatives tried by the author: ward only, PCA only, or a deepcopy of the
# unreduced x_train / x_val.
sel_x_train = np.hstack([ward.transform(x_train), pca.transform(x_train)])
sel_x_val = np.hstack([ward.transform(x_val), pca.transform(x_val)])

sel_x_train.shape, sel_x_val.shape

((20466, 40), (2577, 40))

## Evaluate features

In [109]:
# Sweep the number of PCA components (5, 10, ..., 50) and print the validation MAPE of
# a fixed random forest for each setting.
# All objects use sweep-local names so that the fitted `ward` / `pca` and the
# `sel_x_train` / `sel_x_val` matrices used by the model cells are NOT overwritten by
# the last iteration of this experiment.

# The agglomeration step does not depend on the swept value: fit and apply it once.
sweep_ward = sklearn.cluster.FeatureAgglomeration(n_clusters=20)
_ = sweep_ward.fit(x_train)
sweep_ward_train = sweep_ward.transform(x_train)
sweep_ward_val = sweep_ward.transform(x_val)

for n_components in range(5, 51, 5):
    sweep_pca = sklearn.decomposition.PCA(n_components, random_state=100)
    _ = sweep_pca.fit(x_train)

    sweep_x_train = np.concatenate((sweep_ward_train, sweep_pca.transform(x_train)), axis=1)
    sweep_x_val = np.concatenate((sweep_ward_val, sweep_pca.transform(x_val)), axis=1)

    sweep_rf = ensemble.RandomForestRegressor(**{'criterion': 'mse',
     'max_depth': 12,
     'max_features': None,
     'min_samples_leaf': 2,
     'min_samples_split': 2,
     'n_jobs': 8,
     'random_state': 100})
    _ = sweep_rf.fit(sweep_x_train, np.squeeze(y_train.values))
    # Output format: "<n_components>, <validation MAPE rounded to 4 decimals>"
    print(str(n_components)+',',
          round(mean_absolute_percentage_error(np.squeeze(y_val.values),
                                               sweep_rf.predict(sweep_x_val)), 4))

5, 1.1901
10, 1.205
15, 1.2256
20, 1.1991
25, 1.2181
30, 1.217
35, 1.2003
40, 1.1985
45, 1.2273
50, 1.2095


# Linear Regression

In [115]:
# Ordinary least squares baseline on the reduced features.
# The bare `regr.fit(...)` expression echoes the fitted estimator.
regr = sklearn.linear_model.LinearRegression()
regr.fit(sel_x_train, y_train)
# Per-stock metrics on the test split; `evaluator` is defined outside this section.
evaluator(regr, x_test, le, ward=ward, pca=pca)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 0.32	 MAE: 0.25 	MAPE: 1.49 	DA: 0.42
IRPC 	RMSE: 0.21	 MAE: 0.17 	MAPE: 2.90 	DA: 0.47
PTT 	RMSE: 2.71	 MAE: 1.98 	MAPE: 0.51 	DA: 0.46
BBL 	RMSE: 1.40	 MAE: 1.11 	MAPE: 0.60 	DA: 0.39
KBANK 	RMSE: 2.20	 MAE: 1.71 	MAPE: 0.86 	DA: 0.53
SCB 	RMSE: 1.42	 MAE: 1.14 	MAPE: 0.76 	DA: 0.56
AOT 	RMSE: 0.87	 MAE: 0.68 	MAPE: 1.35 	DA: 0.45
THAI 	RMSE: 0.45	 MAE: 0.34 	MAPE: 1.77 	DA: 0.54
CPF 	RMSE: 0.43	 MAE: 0.34 	MAPE: 1.32 	DA: 0.55
MINT 	RMSE: 0.51	 MAE: 0.41 	MAPE: 1.01 	DA: 0.45
TU 	RMSE: 0.31	 MAE: 0.25 	MAPE: 1.29 	DA: 0.52
SCC 	RMSE: 4.37	 MAE: 3.50 	MAPE: 0.69 	DA: 0.49
CPN 	RMSE: 1.04	 MAE: 0.81 	MAPE: 1.13 	DA: 0.48
CK 	RMSE: 0.38	 MAE: 0.31 	MAPE: 1.09 	DA: 0.55
CPALL 	RMSE: 0.53	 MAE: 0.38 	MAPE: 0.60 	DA: 0.46
HMPRO 	RMSE: 0.25	 MAE: 0.19 	MAPE: 1.80 	DA: 0.46
BDMS 	RMSE: 0.29	 MAE: 0.23 	MAPE: 1.15 	DA: 0.51
BH 	RMSE: 2.84	 MAE: 1.86 	MAPE: 0.94 	DA: 0.39
ADVANC 	RMSE: 1.88	 MAE: 1.40 	MAPE: 0.77 	DA: 0.49
JAS 	RMSE: 0.22	 MAE: 0.18 	MAPE: 2.20 	DA: 0.51
TRUE 	RM

# Decision Tree Regressor

In [104]:
# Candidate hyper-parameters for the decision tree (every combination is tried).
param_dt = dict(
    criterion=['friedman_mse'],
    max_depth=[8, 10, 15],
    max_features=[None],
    min_samples_leaf=[1, 2],
    min_samples_split=[2, 4],
    random_state=[100],
)
grids = sklearn.grid_search.ParameterGrid(param_dt)

# Keep the combination with the lowest MAPE on the validation split.
min_MAPE, best_param, dt = 9999999, {}, None
y_val_flat = np.squeeze(y_val.values)
for param in tqdm_notebook(grids):
    model = tree.DecisionTreeRegressor(**param)
    _ = model.fit(sel_x_train, y_train)
    MAPE = mean_absolute_percentage_error(y_val_flat, model.predict(sel_x_val))
    if not (MAPE < min_MAPE):
        continue
    min_MAPE, best_param, dt = deepcopy(MAPE), deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.3071729132787637

{'criterion': 'friedman_mse',
 'max_depth': 10,
 'max_features': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'random_state': 100}

In [21]:
# Refit the decision tree with the best parameters reported by the grid search,
# then print per-stock metrics on the test split.
dt_params = {
    'criterion': 'friedman_mse',
    'max_depth': 10,
    'max_features': None,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'random_state': 100,
}
dt = tree.DecisionTreeRegressor(**dt_params)
dt.fit(sel_x_train, y_train)
evaluator(dt, x_test, le, ward=ward, pca=pca)

DecisionTreeRegressor(criterion='friedman_mse', max_depth=10,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=100,
           splitter='best')

BANPU 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.71 	DA: 0.47
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.46 	DA: 0.49
PTT 	RMSE: 3.48	 MAE: 2.46 	MAPE: 0.64 	DA: 0.50
BBL 	RMSE: 1.77	 MAE: 1.36 	MAPE: 0.74 	DA: 0.56
KBANK 	RMSE: 3.43	 MAE: 2.57 	MAPE: 1.30 	DA: 0.44
SCB 	RMSE: 1.54	 MAE: 1.22 	MAPE: 0.80 	DA: 0.56
AOT 	RMSE: 1.00	 MAE: 0.76 	MAPE: 1.51 	DA: 0.58
THAI 	RMSE: 0.47	 MAE: 0.35 	MAPE: 1.86 	DA: 0.54
CPF 	RMSE: 0.43	 MAE: 0.33 	MAPE: 1.28 	DA: 0.49
MINT 	RMSE: 0.53	 MAE: 0.42 	MAPE: 1.04 	DA: 0.51
TU 	RMSE: 0.29	 MAE: 0.21 	MAPE: 1.12 	DA: 0.57
SCC 	RMSE: 6.14	 MAE: 4.79 	MAPE: 0.95 	DA: 0.48
CPN 	RMSE: 1.69	 MAE: 1.29 	MAPE: 1.84 	DA: 0.55
CK 	RMSE: 0.35	 MAE: 0.28 	MAPE: 0.98 	DA: 0.52
CPALL 	RMSE: 0.80	 MAE: 0.61 	MAPE: 0.97 	DA: 0.59
HMPRO 	RMSE: 0.22	 MAE: 0.15 	MAPE: 1.39 	DA: 0.58
BDMS 	RMSE: 0.29	 MAE: 0.22 	MAPE: 1.13 	DA: 0.54
BH 	RMSE: 3.46	 MAE: 2.18 	MAPE: 1.09 	DA: 0.63
ADVANC 	RMSE: 2.34	 MAE: 1.83 	MAPE: 1.00 	DA: 0.53
JAS 	RMSE: 0.15	 MAE: 0.12 	MAPE: 1.42 	DA: 0.53
TRUE 	RM

In [105]:
# Echo the selected decision tree and print its per-stock test metrics.
# NOTE(review): this call omits ward=/pca=, unlike the other evaluator call for `dt`;
# the recorded metrics are identical, so evaluator presumably falls back to the same
# transformers -- confirm against evaluator's definition.
dt
evaluator(dt, x_test, le)

DecisionTreeRegressor(criterion='friedman_mse', max_depth=10,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=100,
           splitter='best')

BANPU 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.71 	DA: 0.47
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.46 	DA: 0.49
PTT 	RMSE: 3.48	 MAE: 2.46 	MAPE: 0.64 	DA: 0.50
BBL 	RMSE: 1.77	 MAE: 1.36 	MAPE: 0.74 	DA: 0.56
KBANK 	RMSE: 3.43	 MAE: 2.57 	MAPE: 1.30 	DA: 0.44
SCB 	RMSE: 1.54	 MAE: 1.22 	MAPE: 0.80 	DA: 0.56
AOT 	RMSE: 1.00	 MAE: 0.76 	MAPE: 1.51 	DA: 0.58
THAI 	RMSE: 0.47	 MAE: 0.35 	MAPE: 1.86 	DA: 0.54
CPF 	RMSE: 0.43	 MAE: 0.33 	MAPE: 1.28 	DA: 0.49
MINT 	RMSE: 0.53	 MAE: 0.42 	MAPE: 1.04 	DA: 0.51
TU 	RMSE: 0.29	 MAE: 0.21 	MAPE: 1.12 	DA: 0.57
SCC 	RMSE: 6.14	 MAE: 4.79 	MAPE: 0.95 	DA: 0.48
CPN 	RMSE: 1.69	 MAE: 1.29 	MAPE: 1.84 	DA: 0.55
CK 	RMSE: 0.35	 MAE: 0.28 	MAPE: 0.98 	DA: 0.52
CPALL 	RMSE: 0.80	 MAE: 0.61 	MAPE: 0.97 	DA: 0.59
HMPRO 	RMSE: 0.22	 MAE: 0.15 	MAPE: 1.39 	DA: 0.58
BDMS 	RMSE: 0.29	 MAE: 0.22 	MAPE: 1.13 	DA: 0.54
BH 	RMSE: 3.46	 MAE: 2.18 	MAPE: 1.09 	DA: 0.63
ADVANC 	RMSE: 2.34	 MAE: 1.83 	MAPE: 1.00 	DA: 0.53
JAS 	RMSE: 0.15	 MAE: 0.12 	MAPE: 1.42 	DA: 0.53
TRUE 	RM

# Random Forest Regressor

In [109]:
# Candidate hyper-parameters for the random forest (every combination is tried).
rf_grid_spec = {
    'criterion': ['mse'],
    'max_depth': [8, 12, 16],
    'max_features': [None],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [2, 3, 4],
    'random_state': [100],
    'n_jobs': [8],
}
grids = sklearn.grid_search.ParameterGrid(rf_grid_spec)

# Keep the combination with the lowest MAPE on the validation split.
min_MAPE, best_param, rf = 9999999, {}, None
y_train_flat = np.squeeze(y_train.values)
y_val_flat = np.squeeze(y_val.values)
for param in tqdm_notebook(grids):
    model = ensemble.RandomForestRegressor(**param)
    _ = model.fit(sel_x_train, y_train_flat)
    MAPE = mean_absolute_percentage_error(y_val_flat, model.predict(sel_x_val))
    if not (MAPE < min_MAPE):
        continue
    min_MAPE, best_param, rf = deepcopy(MAPE), deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.1991307625211338

{'criterion': 'mse',
 'max_depth': 12,
 'max_features': None,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'n_jobs': 8,
 'random_state': 100}

In [22]:
# Refit the random forest with the best parameters reported by the grid search,
# then print per-stock metrics on the test split.
rf_params = {
    'criterion': 'mse',
    'max_depth': 12,
    'max_features': None,
    'min_samples_leaf': 2,
    'min_samples_split': 2,
    'n_jobs': 8,
    'random_state': 100,
}
rf = ensemble.RandomForestRegressor(**rf_params)
rf.fit(sel_x_train, np.squeeze(y_train.values))
evaluator(rf, x_test, le, ward=ward, pca=pca)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

BANPU 	RMSE: 0.25	 MAE: 0.19 	MAPE: 1.14 	DA: 0.60
IRPC 	RMSE: 0.09	 MAE: 0.08 	MAPE: 1.32 	DA: 0.51
PTT 	RMSE: 3.03	 MAE: 2.27 	MAPE: 0.59 	DA: 0.56
BBL 	RMSE: 1.74	 MAE: 1.36 	MAPE: 0.74 	DA: 0.48
KBANK 	RMSE: 2.46	 MAE: 2.01 	MAPE: 1.01 	DA: 0.49
SCB 	RMSE: 1.50	 MAE: 1.19 	MAPE: 0.79 	DA: 0.53
AOT 	RMSE: 0.93	 MAE: 0.72 	MAPE: 1.42 	DA: 0.56
THAI 	RMSE: 0.43	 MAE: 0.33 	MAPE: 1.72 	DA: 0.57
CPF 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.12 	DA: 0.61
MINT 	RMSE: 0.57	 MAE: 0.46 	MAPE: 1.14 	DA: 0.39
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.08 	DA: 0.58
SCC 	RMSE: 4.93	 MAE: 3.99 	MAPE: 0.79 	DA: 0.53
CPN 	RMSE: 1.18	 MAE: 0.96 	MAPE: 1.37 	DA: 0.65
CK 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.01 	DA: 0.46
CPALL 	RMSE: 0.73	 MAE: 0.57 	MAPE: 0.90 	DA: 0.52
HMPRO 	RMSE: 0.21	 MAE: 0.14 	MAPE: 1.31 	DA: 0.54
BDMS 	RMSE: 0.36	 MAE: 0.21 	MAPE: 1.06 	DA: 0.46
BH 	RMSE: 2.86	 MAE: 1.93 	MAPE: 0.96 	DA: 0.62
ADVANC 	RMSE: 2.00	 MAE: 1.52 	MAPE: 0.83 	DA: 0.57
JAS 	RMSE: 0.12	 MAE: 0.09 	MAPE: 1.10 	DA: 0.40
TRUE 	RM

In [110]:
# Echo the selected random forest and print its per-stock test metrics.
# NOTE(review): ward=/pca= are not passed here, unlike the other evaluator call for `rf`;
# the recorded metrics are identical -- confirm evaluator's defaults.
rf
evaluator(rf, x_test, le)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
           oob_score=False, random_state=100, verbose=0, warm_start=False)

BANPU 	RMSE: 0.25	 MAE: 0.19 	MAPE: 1.14 	DA: 0.60
IRPC 	RMSE: 0.09	 MAE: 0.08 	MAPE: 1.32 	DA: 0.51
PTT 	RMSE: 3.03	 MAE: 2.27 	MAPE: 0.59 	DA: 0.56
BBL 	RMSE: 1.74	 MAE: 1.36 	MAPE: 0.74 	DA: 0.48
KBANK 	RMSE: 2.46	 MAE: 2.01 	MAPE: 1.01 	DA: 0.49
SCB 	RMSE: 1.50	 MAE: 1.19 	MAPE: 0.79 	DA: 0.53
AOT 	RMSE: 0.93	 MAE: 0.72 	MAPE: 1.42 	DA: 0.56
THAI 	RMSE: 0.43	 MAE: 0.33 	MAPE: 1.72 	DA: 0.57
CPF 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.12 	DA: 0.61
MINT 	RMSE: 0.57	 MAE: 0.46 	MAPE: 1.14 	DA: 0.39
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.08 	DA: 0.58
SCC 	RMSE: 4.93	 MAE: 3.99 	MAPE: 0.79 	DA: 0.53
CPN 	RMSE: 1.18	 MAE: 0.96 	MAPE: 1.37 	DA: 0.65
CK 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.01 	DA: 0.46
CPALL 	RMSE: 0.73	 MAE: 0.57 	MAPE: 0.90 	DA: 0.52
HMPRO 	RMSE: 0.21	 MAE: 0.14 	MAPE: 1.31 	DA: 0.54
BDMS 	RMSE: 0.36	 MAE: 0.21 	MAPE: 1.06 	DA: 0.46
BH 	RMSE: 2.86	 MAE: 1.93 	MAPE: 0.96 	DA: 0.62
ADVANC 	RMSE: 2.00	 MAE: 1.52 	MAPE: 0.83 	DA: 0.57
JAS 	RMSE: 0.12	 MAE: 0.09 	MAPE: 1.10 	DA: 0.40
TRUE 	RM

# AdaBoost Regressor

In [198]:
# AdaBoost on top of the random forest `rf` selected earlier (single-point grid).
ada_grid_spec = {
    'base_estimator': [rf],
    'n_estimators': [20],
    'learning_rate': [0.3],
    'loss': ['square'],
    'random_state': [100],
}
grids = sklearn.grid_search.ParameterGrid(ada_grid_spec)

# Keep the combination with the lowest MAPE on the validation split.
min_MAPE, best_param, ada = 9999999, {}, None
y_train_flat = np.squeeze(y_train.values)
y_val_flat = np.squeeze(y_val.values)
for param in tqdm_notebook(grids):
    model = ensemble.AdaBoostRegressor(**param)
    _ = model.fit(sel_x_train, y_train_flat)
    MAPE = mean_absolute_percentage_error(y_val_flat, model.predict(sel_x_val))
    if not (MAPE < min_MAPE):
        continue
    min_MAPE, best_param, ada = deepcopy(MAPE), deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.1273315494031821

{'base_estimator': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
            oob_score=False, random_state=100, verbose=0, warm_start=False),
 'learning_rate': 0.3,
 'loss': 'square',
 'n_estimators': 20,
 'random_state': 100}

In [23]:
# Refit AdaBoost (base estimator: the random forest `rf`) with the reported best
# parameters, then print per-stock metrics on the test split.
ada_params = {
    'base_estimator': rf,
    'learning_rate': 0.3,
    'loss': 'square',
    'n_estimators': 20,
    'random_state': 100,
}
ada = ensemble.AdaBoostRegressor(**ada_params)
ada.fit(sel_x_train, np.squeeze(y_train.values))
evaluator(ada, x_test, le, ward=ward, pca=pca)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
         learning_rate=0.3, loss='square', n_estimators=20,
         random_state=100)

BANPU 	RMSE: 0.23	 MAE: 0.18 	MAPE: 1.08 	DA: 0.57
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.23 	DA: 0.49
PTT 	RMSE: 2.87	 MAE: 2.11 	MAPE: 0.55 	DA: 0.56
BBL 	RMSE: 1.48	 MAE: 1.18 	MAPE: 0.64 	DA: 0.50
KBANK 	RMSE: 2.21	 MAE: 1.77 	MAPE: 0.89 	DA: 0.48
SCB 	RMSE: 1.43	 MAE: 1.11 	MAPE: 0.73 	DA: 0.56
AOT 	RMSE: 0.91	 MAE: 0.70 	MAPE: 1.39 	DA: 0.55
THAI 	RMSE: 0.42	 MAE: 0.31 	MAPE: 1.63 	DA: 0.56
CPF 	RMSE: 0.38	 MAE: 0.29 	MAPE: 1.13 	DA: 0.56
MINT 	RMSE: 0.52	 MAE: 0.41 	MAPE: 1.02 	DA: 0.50
TU 	RMSE: 0.28	 MAE: 0.20 	MAPE: 1.04 	DA: 0.58
SCC 	RMSE: 4.88	 MAE: 3.98 	MAPE: 0.79 	DA: 0.48
CPN 	RMSE: 1.08	 MAE: 0.85 	MAPE: 1.20 	DA: 0.58
CK 	RMSE: 0.35	 MAE: 0.28 	MAPE: 1.00 	DA: 0.46
CPALL 	RMSE: 0.62	 MAE: 0.49 	MAPE: 0.78 	DA: 0.50
HMPRO 	RMSE: 0.19	 MAE: 0.13 	MAPE: 1.20 	DA: 0.53
BDMS 	RMSE: 0.24	 MAE: 0.18 	MAPE: 0.91 	DA: 0.47
BH 	RMSE: 2.65	 MAE: 1.79 	MAPE: 0.90 	DA: 0.57
ADVANC 	RMSE: 1.92	 MAE: 1.41 	MAPE: 0.77 	DA: 0.53
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.02 	DA: 0.44
TRUE 	RM

In [199]:
# Echo the selected AdaBoost model and print its per-stock test metrics.
# NOTE(review): ward=/pca= are not passed here, unlike the other evaluator call for `ada`;
# the recorded metrics are identical -- confirm evaluator's defaults.
ada
evaluator(ada, x_test, le)

AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=12,
           max_features=None, max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=8,
           oob_score=False, random_state=100, verbose=0, warm_start=False),
         learning_rate=0.3, loss='square', n_estimators=20,
         random_state=100)

BANPU 	RMSE: 0.23	 MAE: 0.18 	MAPE: 1.08 	DA: 0.57
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.23 	DA: 0.49
PTT 	RMSE: 2.87	 MAE: 2.11 	MAPE: 0.55 	DA: 0.56
BBL 	RMSE: 1.48	 MAE: 1.18 	MAPE: 0.64 	DA: 0.50
KBANK 	RMSE: 2.21	 MAE: 1.77 	MAPE: 0.89 	DA: 0.48
SCB 	RMSE: 1.43	 MAE: 1.11 	MAPE: 0.73 	DA: 0.56
AOT 	RMSE: 0.91	 MAE: 0.70 	MAPE: 1.39 	DA: 0.55
THAI 	RMSE: 0.42	 MAE: 0.31 	MAPE: 1.63 	DA: 0.56
CPF 	RMSE: 0.38	 MAE: 0.29 	MAPE: 1.13 	DA: 0.56
MINT 	RMSE: 0.52	 MAE: 0.41 	MAPE: 1.02 	DA: 0.50
TU 	RMSE: 0.28	 MAE: 0.20 	MAPE: 1.04 	DA: 0.58
SCC 	RMSE: 4.88	 MAE: 3.98 	MAPE: 0.79 	DA: 0.48
CPN 	RMSE: 1.08	 MAE: 0.85 	MAPE: 1.20 	DA: 0.58
CK 	RMSE: 0.35	 MAE: 0.28 	MAPE: 1.00 	DA: 0.46
CPALL 	RMSE: 0.62	 MAE: 0.49 	MAPE: 0.78 	DA: 0.50
HMPRO 	RMSE: 0.19	 MAE: 0.13 	MAPE: 1.20 	DA: 0.53
BDMS 	RMSE: 0.24	 MAE: 0.18 	MAPE: 0.91 	DA: 0.47
BH 	RMSE: 2.65	 MAE: 1.79 	MAPE: 0.90 	DA: 0.57
ADVANC 	RMSE: 1.92	 MAE: 1.41 	MAPE: 0.77 	DA: 0.53
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.02 	DA: 0.44
TRUE 	RM

# Gradient Boosting Regressor

In [207]:
# Gradient boosting search space (currently a single point).
gb_grid_spec = {
    # 'criterion': ['friedman_mse'],
    'n_estimators': [50],
    'learning_rate': [0.3],
    'max_depth': [8],
    'random_state': [100],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
}
grids = sklearn.grid_search.ParameterGrid(gb_grid_spec)

# Keep the combination with the lowest MAPE on the validation split.
min_MAPE, best_param, gb = 9999999, {}, None
y_train_flat = np.squeeze(y_train.values)
y_val_flat = np.squeeze(y_val.values)
for param in tqdm_notebook(grids):
    model = ensemble.GradientBoostingRegressor(**param)
    _ = model.fit(sel_x_train, y_train_flat)
    MAPE = mean_absolute_percentage_error(y_val_flat, model.predict(sel_x_val))
    if not (MAPE < min_MAPE):
        continue
    min_MAPE, best_param, gb = deepcopy(MAPE), deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.2498595228576765

{'criterion': 'friedman_mse',
 'learning_rate': 0.3,
 'max_depth': 8,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50,
 'random_state': 100}

In [24]:
# Refit gradient boosting with the reported best parameters, then print per-stock
# metrics on the test split.
gb_params = {
    'criterion': 'friedman_mse',
    'learning_rate': 0.3,
    'max_depth': 8,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 50,
    'random_state': 100,
}
gb = ensemble.GradientBoostingRegressor(**gb_params)
gb.fit(sel_x_train, np.squeeze(y_train.values))
evaluator(gb, x_test, le, ward=ward, pca=pca)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.3, loss='ls', max_depth=8, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=100,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.25	 MAE: 0.20 	MAPE: 1.17 	DA: 0.53
IRPC 	RMSE: 0.13	 MAE: 0.09 	MAPE: 1.58 	DA: 0.50
PTT 	RMSE: 2.78	 MAE: 1.98 	MAPE: 0.51 	DA: 0.65
BBL 	RMSE: 1.70	 MAE: 1.35 	MAPE: 0.73 	DA: 0.55
KBANK 	RMSE: 2.28	 MAE: 1.86 	MAPE: 0.94 	DA: 0.53
SCB 	RMSE: 1.48	 MAE: 1.18 	MAPE: 0.78 	DA: 0.54
AOT 	RMSE: 0.86	 MAE: 0.62 	MAPE: 1.23 	DA: 0.61
THAI 	RMSE: 0.42	 MAE: 0.29 	MAPE: 1.55 	DA: 0.60
CPF 	RMSE: 0.39	 MAE: 0.30 	MAPE: 1.15 	DA: 0.57
MINT 	RMSE: 0.51	 MAE: 0.43 	MAPE: 1.06 	DA: 0.48
TU 	RMSE: 0.31	 MAE: 0.22 	MAPE: 1.17 	DA: 0.55
SCC 	RMSE: 5.63	 MAE: 4.41 	MAPE: 0.88 	DA: 0.52
CPN 	RMSE: 1.14	 MAE: 0.89 	MAPE: 1.26 	DA: 0.54
CK 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.00 	DA: 0.45
CPALL 	RMSE: 0.61	 MAE: 0.47 	MAPE: 0.75 	DA: 0.42
HMPRO 	RMSE: 0.22	 MAE: 0.14 	MAPE: 1.29 	DA: 0.50
BDMS 	RMSE: 0.25	 MAE: 0.19 	MAPE: 0.95 	DA: 0.45
BH 	RMSE: 2.84	 MAE: 2.01 	MAPE: 1.02 	DA: 0.53
ADVANC 	RMSE: 2.00	 MAE: 1.58 	MAPE: 0.87 	DA: 0.51
JAS 	RMSE: 0.13	 MAE: 0.09 	MAPE: 1.11 	DA: 0.50
TRUE 	RM

In [208]:
# Echo the selected gradient boosting model and print its per-stock test metrics.
# NOTE(review): ward=/pca= are not passed here, unlike the other evaluator call for `gb`;
# the recorded metrics are identical -- confirm evaluator's defaults.
gb
evaluator(gb, x_test, le)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.3, loss='ls', max_depth=8, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=50, presort='auto', random_state=100,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.25	 MAE: 0.20 	MAPE: 1.17 	DA: 0.53
IRPC 	RMSE: 0.13	 MAE: 0.09 	MAPE: 1.58 	DA: 0.50
PTT 	RMSE: 2.78	 MAE: 1.98 	MAPE: 0.51 	DA: 0.65
BBL 	RMSE: 1.70	 MAE: 1.35 	MAPE: 0.73 	DA: 0.55
KBANK 	RMSE: 2.28	 MAE: 1.86 	MAPE: 0.94 	DA: 0.53
SCB 	RMSE: 1.48	 MAE: 1.18 	MAPE: 0.78 	DA: 0.54
AOT 	RMSE: 0.86	 MAE: 0.62 	MAPE: 1.23 	DA: 0.61
THAI 	RMSE: 0.42	 MAE: 0.29 	MAPE: 1.55 	DA: 0.60
CPF 	RMSE: 0.39	 MAE: 0.30 	MAPE: 1.15 	DA: 0.57
MINT 	RMSE: 0.51	 MAE: 0.43 	MAPE: 1.06 	DA: 0.48
TU 	RMSE: 0.31	 MAE: 0.22 	MAPE: 1.17 	DA: 0.55
SCC 	RMSE: 5.63	 MAE: 4.41 	MAPE: 0.88 	DA: 0.52
CPN 	RMSE: 1.14	 MAE: 0.89 	MAPE: 1.26 	DA: 0.54
CK 	RMSE: 0.34	 MAE: 0.28 	MAPE: 1.00 	DA: 0.45
CPALL 	RMSE: 0.61	 MAE: 0.47 	MAPE: 0.75 	DA: 0.42
HMPRO 	RMSE: 0.22	 MAE: 0.14 	MAPE: 1.29 	DA: 0.50
BDMS 	RMSE: 0.25	 MAE: 0.19 	MAPE: 0.95 	DA: 0.45
BH 	RMSE: 2.84	 MAE: 2.01 	MAPE: 1.02 	DA: 0.53
ADVANC 	RMSE: 2.00	 MAE: 1.58 	MAPE: 0.87 	DA: 0.51
JAS 	RMSE: 0.13	 MAE: 0.09 	MAPE: 1.11 	DA: 0.50
TRUE 	RM

# XGBoost Regressor

In [25]:
# Wrap the reduced training / validation features and their targets as xgboost DMatrix objects.
d_train = xgboost.DMatrix(sel_x_train, label=y_train)
d_valid = xgboost.DMatrix(sel_x_val, label=y_val)

In [26]:
# XGBoost search space (currently a single point, linear booster).
xgb_grid_spec = {
    'booster': ['gblinear'],
    'learning_rate': [1],
    # 'max_depth': [4,5,6,7,8],
    'subsample': [1],
    'objective': ['reg:linear'],
    'eval_metric': ['rmse'],
    'nthread': [8],
    'lambda': [0.1],
    'alpha': [0.45],
    'updater': ['coord_descent'],
}
grids = sklearn.grid_search.ParameterGrid(xgb_grid_spec)

# Keep the booster with the lowest MAPE on the validation split.
min_MAPE, best_param, xgb = 9999999, {}, None
y_val_flat = np.squeeze(y_val.values)
# Both sets are monitored; training stops after 10 rounds without improvement.
watchlist = [(d_train, 'train'), (d_valid, 'valid')]
for param in tqdm_notebook(grids):
    model = xgboost.train(
        param,
        d_train,
        num_boost_round=500,
        evals=watchlist,
        early_stopping_rounds=10,
        verbose_eval=True,
    )
    MAPE = mean_absolute_percentage_error(y_val_flat, model.predict(d_valid))
    if not (MAPE < min_MAPE):
        continue
    min_MAPE, best_param, xgb = deepcopy(MAPE), deepcopy(param), deepcopy(model)
min_MAPE
best_param

[0]	train-rmse:68.3109	valid-rmse:67.2115
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 10 rounds.
[1]	train-rmse:4.69258	valid-rmse:2.05195
[2]	train-rmse:4.66639	valid-rmse:2.03387
[3]	train-rmse:4.66639	valid-rmse:2.03385
[4]	train-rmse:4.66639	valid-rmse:2.03384
[5]	train-rmse:4.66639	valid-rmse:2.03383
[6]	train-rmse:4.66638	valid-rmse:2.03382
[7]	train-rmse:4.66635	valid-rmse:2.03349
[8]	train-rmse:4.66635	valid-rmse:2.03348
[9]	train-rmse:4.66635	valid-rmse:2.03347
[10]	train-rmse:4.66635	valid-rmse:2.03345
[11]	train-rmse:4.66634	valid-rmse:2.03344
[12]	train-rmse:4.66634	valid-rmse:2.03343
[13]	train-rmse:4.66605	valid-rmse:2.03246
[14]	train-rmse:4.66574	valid-rmse:2.03084
[15]	train-rmse:4.66566	valid-rmse:2.02959
[16]	train-rmse:4.6657	valid-rmse:2.02892
[17]	train-rmse:4.66579	valid-rmse:2.02855
[18]	train-rmse:4.66588	valid-rmse:2.02835
[19]	train-rmse:4.66596	valid-rmse:2.02823
[20]	t

1.011604401602321

{'alpha': 0.45,
 'booster': 'gblinear',
 'eval_metric': 'rmse',
 'lambda': 0.1,
 'learning_rate': 1,
 'nthread': 8,
 'objective': 'reg:linear',
 'subsample': 1,
 'updater': 'coord_descent'}

In [27]:
# Per-stock test metrics for the selected XGBoost booster; isXGB makes evaluator wrap
# the features in a DMatrix, and ward/pca rebuild the transformed feature layout.
evaluator(xgb, x_test, le, isXGB=True, ward=ward, pca=pca)

BANPU 	RMSE: 0.23	 MAE: 0.17 	MAPE: 1.01 	DA: 0.58
IRPC 	RMSE: 0.08	 MAE: 0.07 	MAPE: 1.18 	DA: 0.69
PTT 	RMSE: 2.68	 MAE: 1.96 	MAPE: 0.51 	DA: 0.45
BBL 	RMSE: 1.34	 MAE: 1.08 	MAPE: 0.59 	DA: 0.34
KBANK 	RMSE: 2.19	 MAE: 1.72 	MAPE: 0.86 	DA: 0.44
SCB 	RMSE: 1.42	 MAE: 1.16 	MAPE: 0.77 	DA: 0.39
AOT 	RMSE: 0.85	 MAE: 0.65 	MAPE: 1.28 	DA: 0.61
THAI 	RMSE: 0.42	 MAE: 0.30 	MAPE: 1.57 	DA: 0.54
CPF 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.13 	DA: 0.60
MINT 	RMSE: 0.49	 MAE: 0.38 	MAPE: 0.93 	DA: 0.54
TU 	RMSE: 0.24	 MAE: 0.17 	MAPE: 0.87 	DA: 0.47
SCC 	RMSE: 4.37	 MAE: 3.50 	MAPE: 0.69 	DA: 0.49
CPN 	RMSE: 1.01	 MAE: 0.78 	MAPE: 1.09 	DA: 0.68
CK 	RMSE: 0.32	 MAE: 0.27 	MAPE: 0.95 	DA: 0.60
CPALL 	RMSE: 0.47	 MAE: 0.31 	MAPE: 0.50 	DA: 0.40
HMPRO 	RMSE: 0.17	 MAE: 0.11 	MAPE: 1.06 	DA: 0.69
BDMS 	RMSE: 0.21	 MAE: 0.17 	MAPE: 0.85 	DA: 0.62
BH 	RMSE: 2.77	 MAE: 1.77 	MAPE: 0.89 	DA: 0.43
ADVANC 	RMSE: 1.87	 MAE: 1.39 	MAPE: 0.76 	DA: 0.46
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.03 	DA: 0.52
TRUE 	RM

In [40]:
# Per-stock test metrics for the XGBoost booster. The booster is trained on the
# selected features (sel_x_train), so ward and pca must be supplied; otherwise
# evaluator would feed it the raw test columns.
evaluator(xgb, x_test, le, isXGB=True, ward=ward, pca=pca)

BANPU 	RMSE: 0.23	 MAE: 0.17 	MAPE: 1.01 	DA: 0.58
IRPC 	RMSE: 0.08	 MAE: 0.07 	MAPE: 1.18 	DA: 0.69
PTT 	RMSE: 2.68	 MAE: 1.96 	MAPE: 0.51 	DA: 0.45
BBL 	RMSE: 1.34	 MAE: 1.08 	MAPE: 0.59 	DA: 0.34
KBANK 	RMSE: 2.19	 MAE: 1.72 	MAPE: 0.86 	DA: 0.44
SCB 	RMSE: 1.42	 MAE: 1.16 	MAPE: 0.77 	DA: 0.39
AOT 	RMSE: 0.85	 MAE: 0.65 	MAPE: 1.28 	DA: 0.61
THAI 	RMSE: 0.42	 MAE: 0.30 	MAPE: 1.57 	DA: 0.54
CPF 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.13 	DA: 0.60
MINT 	RMSE: 0.49	 MAE: 0.38 	MAPE: 0.93 	DA: 0.54
TU 	RMSE: 0.24	 MAE: 0.17 	MAPE: 0.87 	DA: 0.47
SCC 	RMSE: 4.37	 MAE: 3.50 	MAPE: 0.69 	DA: 0.49
CPN 	RMSE: 1.01	 MAE: 0.78 	MAPE: 1.09 	DA: 0.68
CK 	RMSE: 0.32	 MAE: 0.27 	MAPE: 0.95 	DA: 0.60
CPALL 	RMSE: 0.47	 MAE: 0.31 	MAPE: 0.50 	DA: 0.40
HMPRO 	RMSE: 0.17	 MAE: 0.11 	MAPE: 1.06 	DA: 0.69
BDMS 	RMSE: 0.21	 MAE: 0.17 	MAPE: 0.85 	DA: 0.62
BH 	RMSE: 2.77	 MAE: 1.77 	MAPE: 0.89 	DA: 0.43
ADVANC 	RMSE: 1.87	 MAE: 1.39 	MAPE: 0.76 	DA: 0.46
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.03 	DA: 0.52
TRUE 	RM

In [238]:
# xgboost.plot_importance(xgb, height=0.3)

# KNeighbors

In [28]:
# Search space for the k-nearest-neighbours regressor (currently one configuration).
grids = sklearn.grid_search.ParameterGrid({
    'algorithm': ['ball_tree'],
    'weights': ['distance'],
    'n_neighbors': [9],
    'leaf_size': [20],
    'p': [2],
    'n_jobs': [-1]
})

# Fit each candidate on the training split and keep the one with the lowest
# validation MAPE.
min_MAPE, best_param, knn = 9999999, {}, None
for param in tqdm_notebook(grids):
    model = sklearn.neighbors.KNeighborsRegressor(**param).fit(
        sel_x_train, np.squeeze(y_train.values)
    )
    val_pred = model.predict(sel_x_val)
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), val_pred)
    if MAPE < min_MAPE:
        min_MAPE, best_param, knn = MAPE, deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.2119618123338594

{'algorithm': 'ball_tree',
 'leaf_size': 20,
 'n_jobs': -1,
 'n_neighbors': 9,
 'p': 2,
 'weights': 'distance'}

In [29]:
# Show the selected KNN model, then print its per-stock test metrics; evaluator applies
# the ward + pca transforms to the test features before predicting.
knn
evaluator(knn, x_test, le, ward=ward, pca=pca)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=20, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=9, p=2,
          weights='distance')

BANPU 	RMSE: 0.26	 MAE: 0.20 	MAPE: 1.16 	DA: 0.62
IRPC 	RMSE: 0.13	 MAE: 0.10 	MAPE: 1.76 	DA: 0.44
PTT 	RMSE: 2.90	 MAE: 1.96 	MAPE: 0.51 	DA: 0.60
BBL 	RMSE: 1.32	 MAE: 1.06 	MAPE: 0.58 	DA: 0.55
KBANK 	RMSE: 2.27	 MAE: 1.74 	MAPE: 0.88 	DA: 0.66
SCB 	RMSE: 1.47	 MAE: 1.11 	MAPE: 0.73 	DA: 0.58
AOT 	RMSE: 0.92	 MAE: 0.71 	MAPE: 1.42 	DA: 0.53
THAI 	RMSE: 0.52	 MAE: 0.39 	MAPE: 2.07 	DA: 0.61
CPF 	RMSE: 0.45	 MAE: 0.35 	MAPE: 1.32 	DA: 0.50
MINT 	RMSE: 0.52	 MAE: 0.39 	MAPE: 0.96 	DA: 0.59
TU 	RMSE: 0.30	 MAE: 0.21 	MAPE: 1.12 	DA: 0.60
SCC 	RMSE: 4.12	 MAE: 3.42 	MAPE: 0.68 	DA: 0.55
CPN 	RMSE: 0.86	 MAE: 0.70 	MAPE: 1.00 	DA: 0.66
CK 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.03 	DA: 0.49
CPALL 	RMSE: 0.61	 MAE: 0.47 	MAPE: 0.75 	DA: 0.43
HMPRO 	RMSE: 0.25	 MAE: 0.17 	MAPE: 1.60 	DA: 0.54
BDMS 	RMSE: 0.26	 MAE: 0.20 	MAPE: 0.99 	DA: 0.49
BH 	RMSE: 2.89	 MAE: 2.03 	MAPE: 1.02 	DA: 0.52
ADVANC 	RMSE: 1.96	 MAE: 1.45 	MAPE: 0.80 	DA: 0.58
JAS 	RMSE: 0.17	 MAE: 0.13 	MAPE: 1.58 	DA: 0.49
TRUE 	RM

KNeighborsRegressor(algorithm='ball_tree', leaf_size=20, metric='minkowski',
          metric_params=None, n_jobs=-1, n_neighbors=9, p=2,
          weights='distance')

BANPU 	RMSE: 0.26	 MAE: 0.20 	MAPE: 1.16 	DA: 0.62
IRPC 	RMSE: 0.13	 MAE: 0.10 	MAPE: 1.76 	DA: 0.44
PTT 	RMSE: 2.90	 MAE: 1.96 	MAPE: 0.51 	DA: 0.60
BBL 	RMSE: 1.32	 MAE: 1.06 	MAPE: 0.58 	DA: 0.55
KBANK 	RMSE: 2.27	 MAE: 1.74 	MAPE: 0.88 	DA: 0.66
SCB 	RMSE: 1.47	 MAE: 1.11 	MAPE: 0.73 	DA: 0.58
AOT 	RMSE: 0.92	 MAE: 0.71 	MAPE: 1.42 	DA: 0.53
THAI 	RMSE: 0.52	 MAE: 0.39 	MAPE: 2.07 	DA: 0.61
CPF 	RMSE: 0.45	 MAE: 0.35 	MAPE: 1.32 	DA: 0.50
MINT 	RMSE: 0.52	 MAE: 0.39 	MAPE: 0.96 	DA: 0.59
TU 	RMSE: 0.30	 MAE: 0.21 	MAPE: 1.12 	DA: 0.60
SCC 	RMSE: 4.12	 MAE: 3.42 	MAPE: 0.68 	DA: 0.55
CPN 	RMSE: 0.86	 MAE: 0.70 	MAPE: 1.00 	DA: 0.66
CK 	RMSE: 0.37	 MAE: 0.29 	MAPE: 1.03 	DA: 0.49
CPALL 	RMSE: 0.61	 MAE: 0.47 	MAPE: 0.75 	DA: 0.43
HMPRO 	RMSE: 0.25	 MAE: 0.17 	MAPE: 1.60 	DA: 0.54
BDMS 	RMSE: 0.26	 MAE: 0.20 	MAPE: 0.99 	DA: 0.49
BH 	RMSE: 2.89	 MAE: 2.03 	MAPE: 1.02 	DA: 0.52
ADVANC 	RMSE: 1.96	 MAE: 1.45 	MAPE: 0.80 	DA: 0.58
JAS 	RMSE: 0.17	 MAE: 0.13 	MAPE: 1.58 	DA: 0.49
TRUE 	RM

# Bayesian Ridge

In [30]:
# Search space for Bayesian ridge regression: iteration budget x convergence
# tolerance, with the alpha_1 / lambda_1 hyper-priors held fixed.
grids = sklearn.grid_search.ParameterGrid({
    'n_iter': [5, 10, 20],
    'tol': [0.01, 0.1, 1],
    'alpha_1': [0.1],
    'lambda_1': [0.1]
})

# Fit each candidate on the training split and keep the one with the lowest
# validation MAPE.
min_MAPE, best_param, bay = 9999999, {}, None
for param in tqdm_notebook(grids):
    model = sklearn.linear_model.BayesianRidge(**param).fit(
        sel_x_train, np.squeeze(y_train.values)
    )
    val_pred = model.predict(sel_x_val)
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), val_pred)
    if MAPE < min_MAPE:
        min_MAPE, best_param, bay = MAPE, deepcopy(param), deepcopy(model)
min_MAPE
best_param




1.1096049449259355

{'alpha_1': 0.1, 'lambda_1': 0.1, 'n_iter': 5, 'tol': 1}

In [31]:
# Show the selected Bayesian ridge model, then print its per-stock test metrics using
# the ward + pca feature transforms.
bay
evaluator(bay, x_test, le, ward=ward, pca=pca)

BayesianRidge(alpha_1=0.1, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=0.1, lambda_2=1e-06, n_iter=5,
       normalize=False, tol=1, verbose=False)

BANPU 	RMSE: 0.23	 MAE: 0.16 	MAPE: 0.96 	DA: 0.45
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.45 	DA: 0.64
PTT 	RMSE: 2.69	 MAE: 1.97 	MAPE: 0.51 	DA: 0.44
BBL 	RMSE: 1.35	 MAE: 1.10 	MAPE: 0.60 	DA: 0.32
KBANK 	RMSE: 2.18	 MAE: 1.70 	MAPE: 0.85 	DA: 0.42
SCB 	RMSE: 1.42	 MAE: 1.14 	MAPE: 0.75 	DA: 0.46
AOT 	RMSE: 0.88	 MAE: 0.69 	MAPE: 1.36 	DA: 0.27
THAI 	RMSE: 0.44	 MAE: 0.32 	MAPE: 1.66 	DA: 0.54
CPF 	RMSE: 0.38	 MAE: 0.30 	MAPE: 1.15 	DA: 0.57
MINT 	RMSE: 0.49	 MAE: 0.39 	MAPE: 0.96 	DA: 0.56
TU 	RMSE: 0.28	 MAE: 0.21 	MAPE: 1.12 	DA: 0.55
SCC 	RMSE: 4.36	 MAE: 3.51 	MAPE: 0.70 	DA: 0.48
CPN 	RMSE: 1.02	 MAE: 0.78 	MAPE: 1.10 	DA: 0.48
CK 	RMSE: 0.33	 MAE: 0.27 	MAPE: 0.95 	DA: 0.43
CPALL 	RMSE: 0.47	 MAE: 0.32 	MAPE: 0.51 	DA: 0.38
HMPRO 	RMSE: 0.17	 MAE: 0.12 	MAPE: 1.13 	DA: 0.62
BDMS 	RMSE: 0.22	 MAE: 0.17 	MAPE: 0.84 	DA: 0.55
BH 	RMSE: 2.79	 MAE: 1.81 	MAPE: 0.91 	DA: 0.30
ADVANC 	RMSE: 1.87	 MAE: 1.40 	MAPE: 0.77 	DA: 0.42
JAS 	RMSE: 0.14	 MAE: 0.11 	MAPE: 1.37 	DA: 0.50
TRUE 	RM

# Stacking Ensemble

In [32]:
# Meta-feature matrix for the stacking model: one column of validation predictions
# per base model (in the order dt, rf, ada, gb, xgb, knn, bay), followed by the
# selected validation features themselves.
stack = np.column_stack((
    dt.predict(sel_x_val),
    rf.predict(sel_x_val),
    ada.predict(sel_x_val),
    gb.predict(sel_x_val),
    xgb.predict(d_valid),   # the booster predicts from the DMatrix built earlier
    knn.predict(sel_x_val),
    bay.predict(sel_x_val),
    sel_x_val,
))

stack.shape

(2577, 47)

In [148]:
# Sweep the number of PCA components (1..15) applied to the stacking matrix and
# print the MAPE of a default HuberRegressor for each setting.
# Note: the regressor is fitted and scored on the same validation rows, so these
# figures are in-sample.
params = {}
for n_components in range(1, 16):
    sel_stack = sklearn.decomposition.PCA(n_components, random_state=100)
    sel_x_stack = sel_stack.fit_transform(stack)

    gb_stack = sklearn.linear_model.HuberRegressor(**params)
    _ = gb_stack.fit(sel_x_stack, np.squeeze(y_val.values))

    sweep_mape = mean_absolute_percentage_error(
        np.squeeze(y_val.values), gb_stack.predict(sel_x_stack)
    )
    print('MAPE:', n_components, round(sweep_mape, 4))

MAPE: 1 1.0617
MAPE: 2 1.0662
MAPE: 3 1.0539
MAPE: 4 1.0124
MAPE: 5 1.0144
MAPE: 6 0.9938
MAPE: 7 0.9907
MAPE: 8 0.9989
MAPE: 9 0.9983
MAPE: 10 1.0038
MAPE: 11 1.0033
MAPE: 12 1.0026
MAPE: 13 1.001
MAPE: 14 1.0053
MAPE: 15 0.9977


In [33]:
# Refit the stacking PCA with 7 components (the setting with the lowest MAPE in the
# sweep output above) and project the meta-feature matrix. sel_stack is kept so that
# test-time stacks can be projected with the same transform.
sel_stack = sklearn.decomposition.PCA(7, random_state=100)
sel_x_stack = sel_stack.fit_transform(stack)

In [140]:
# sel_x_stack = deepcopy(stack)
# sel_x_stack.shape

(2577, 7)

In [34]:
# Search space for the Huber meta-learner fitted on the PCA-projected stack.
# The tolerance grid steps by decades; the original list repeated 0.001 and skipped
# 0.01, so a quarter of the fits were duplicates.
grids = sklearn.grid_search.ParameterGrid({
    'fit_intercept': [True],
    'max_iter': [12, 25, 50, 100],
    'alpha': [0.1, 0.01, 0.001],
    'tol': [0.1, 0.01, 0.001, 0.0001]
})

# Keep the configuration with the lowest MAPE.
# Note: each candidate is fitted and scored on the same validation rows, so the
# selected MAPE is an in-sample figure, not a held-out estimate.
min_MAPE = 9999999
best_param = {}
gb_stack = None
for param in tqdm_notebook(grids):
    model = sklearn.linear_model.HuberRegressor(**param)
    _ = model.fit(sel_x_stack, np.squeeze(y_val.values))
    MAPE = mean_absolute_percentage_error(np.squeeze(y_val.values), model.predict(sel_x_stack))
    if MAPE < min_MAPE:
        min_MAPE = MAPE
        best_param = deepcopy(param)
        gb_stack = deepcopy(model)
min_MAPE
best_param




0.9888869809470686

{'alpha': 0.01, 'fit_intercept': True, 'max_iter': 100, 'tol': 0.1}




0.9887403202429025

{'alpha': 0.1, 'fit_intercept': True, 'max_iter': 100, 'tol': 0.1}

In [480]:
# Pair the first seven coefficients of the meta-learner with the base-model names
# and print them.
# NOTE(review): the pairing is only meaningful if gb_stack was fitted directly on
# the seven base-model prediction columns; if it was fitted on PCA components the
# coefficients belong to components, not models. Confirm before interpreting.
models = ['dt', 'rf', 'ada', 'gb', 'xgb', 'knn', 'bay']

for model, weight in zip(models, gb_stack.coef_):
    print(round(weight, 4), model)

0.2777 dt
-0.0028 rf
0.0533 ada
0.2537 gb
-0.1057 xgb
-0.2047 knn
-0.0663 bay


In [144]:
# Per-stock test metrics for the stacked ensemble, with the ward + pca transforms
# applied to the test features. The commented-out list below restores the full
# set of tickers if target_stocks has been narrowed elsewhere.
# target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
#                  'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']
ensemble_evaluator(
    x_test,
    ward=ward,
    pca=pca
)

BANPU 	RMSE: 0.22	 MAE: 0.15 	MAPE: 0.90 	DA: 0.59
IRPC 	RMSE: 0.10	 MAE: 0.08 	MAPE: 1.32 	DA: 0.43
PTT 	RMSE: 2.59	 MAE: 1.83 	MAPE: 0.47 	DA: 0.62
BBL 	RMSE: 1.26	 MAE: 1.01 	MAPE: 0.55 	DA: 0.63
KBANK 	RMSE: 2.07	 MAE: 1.62 	MAPE: 0.81 	DA: 0.66
SCB 	RMSE: 1.37	 MAE: 1.09 	MAPE: 0.72 	DA: 0.59
AOT 	RMSE: 0.84	 MAE: 0.64 	MAPE: 1.27 	DA: 0.54
THAI 	RMSE: 0.44	 MAE: 0.31 	MAPE: 1.65 	DA: 0.57
CPF 	RMSE: 0.39	 MAE: 0.30 	MAPE: 1.14 	DA: 0.50
MINT 	RMSE: 0.48	 MAE: 0.37 	MAPE: 0.92 	DA: 0.61
TU 	RMSE: 0.26	 MAE: 0.18 	MAPE: 0.96 	DA: 0.57
SCC 	RMSE: 4.08	 MAE: 3.42 	MAPE: 0.68 	DA: 0.54
CPN 	RMSE: 0.91	 MAE: 0.73 	MAPE: 1.03 	DA: 0.63
CK 	RMSE: 0.33	 MAE: 0.27 	MAPE: 0.96 	DA: 0.53
CPALL 	RMSE: 0.48	 MAE: 0.36 	MAPE: 0.57 	DA: 0.44
HMPRO 	RMSE: 0.19	 MAE: 0.13 	MAPE: 1.15 	DA: 0.54
BDMS 	RMSE: 0.23	 MAE: 0.18 	MAPE: 0.88 	DA: 0.50
BH 	RMSE: 2.67	 MAE: 1.80 	MAPE: 0.91 	DA: 0.54
ADVANC 	RMSE: 1.82	 MAE: 1.36 	MAPE: 0.75 	DA: 0.60
JAS 	RMSE: 0.11	 MAE: 0.08 	MAPE: 1.02 	DA: 0.50
TRUE 	RM

In [482]:
# Same ensemble evaluation but without ward/pca, so ensemble_evaluator passes the raw
# test columns to the base models.
# NOTE(review): confirm the base models in scope were fitted on untransformed
# features when this cell is run.
ensemble_evaluator(
    x_test
)

BANPU 	RMSE: 0.22	 MAE: 0.16 	MAPE: 0.97 	DA: 0.58
IRPC 	RMSE: 0.09	 MAE: 0.07 	MAPE: 1.27 	DA: 0.37
PTT 	RMSE: 2.62	 MAE: 1.91 	MAPE: 0.50 	DA: 0.59
BBL 	RMSE: 1.30	 MAE: 1.05 	MAPE: 0.57 	DA: 0.53
KBANK 	RMSE: 2.10	 MAE: 1.68 	MAPE: 0.84 	DA: 0.60
SCB 	RMSE: 1.40	 MAE: 1.12 	MAPE: 0.74 	DA: 0.57
AOT 	RMSE: 0.84	 MAE: 0.64 	MAPE: 1.28 	DA: 0.55
THAI 	RMSE: 0.42	 MAE: 0.30 	MAPE: 1.56 	DA: 0.61
CPF 	RMSE: 0.39	 MAE: 0.30 	MAPE: 1.17 	DA: 0.47
MINT 	RMSE: 0.47	 MAE: 0.37 	MAPE: 0.92 	DA: 0.53
TU 	RMSE: 0.24	 MAE: 0.16 	MAPE: 0.85 	DA: 0.60
SCC 	RMSE: 4.19	 MAE: 3.46 	MAPE: 0.69 	DA: 0.54
CPN 	RMSE: 0.95	 MAE: 0.76 	MAPE: 1.07 	DA: 0.61
CK 	RMSE: 0.33	 MAE: 0.27 	MAPE: 0.96 	DA: 0.58
CPALL 	RMSE: 0.46	 MAE: 0.33 	MAPE: 0.53 	DA: 0.49
HMPRO 	RMSE: 0.18	 MAE: 0.12 	MAPE: 1.07 	DA: 0.55
BDMS 	RMSE: 0.22	 MAE: 0.17 	MAPE: 0.86 	DA: 0.51
BH 	RMSE: 2.69	 MAE: 1.79 	MAPE: 0.90 	DA: 0.50
ADVANC 	RMSE: 1.81	 MAE: 1.35 	MAPE: 0.74 	DA: 0.64
JAS 	RMSE: 0.10	 MAE: 0.07 	MAPE: 0.89 	DA: 0.46
TRUE 	RM

In [40]:
# Narrow the global ticker list to a single stock. evaluator and ensemble_evaluator
# iterate over target_stocks, so any call made after this cell only reports THAI.
target_stocks = ['THAI']

In [41]:
# Score one fitted model per stock in target_stocks on the test set and leave
# x_tmp / y_tmp / y_pred of the last stock in scope for the plotting cell.
clf = xgb
isXGB = True

MAPEs = []
for stock in target_stocks:
    # Test rows of this stock (Ticker holds label-encoded ids), sorted by index.
    x_tmp = x_test.loc[x_test['Ticker'] == le.transform([stock])[0]].copy()
    x_tmp = x_tmp.sort_index()

    y_tmp = x_tmp[Horizon].values
    close_t = x_tmp['Close(t)'].values

    # True direction: 1 when the target is at or above Close(t), else 0.
    y_true_da = (y_tmp - close_t >= 0).astype(int)

    # Drop the target by the same column name it was read with, instead of a
    # hard-coded 'Close(t+1)', so the cell stays correct if Horizon changes.
    x_tmp = x_tmp.drop([Horizon], axis=1)

    # Same feature layout as the selected training features: ward columns then pca columns.
    sel_x_tmp = np.concatenate((ward.transform(x_tmp.values), pca.transform(x_tmp.values)), axis=1)

    if isXGB:
        y_pred = clf.predict(xgboost.DMatrix(sel_x_tmp))
    else:
        y_pred = clf.predict(sel_x_tmp)

    # Predicted direction, using the same >= 0 rule as the true direction.
    y_pred_da = (np.ravel(y_pred) - close_t >= 0).astype(int)

    MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
    DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
    MAPEs.append(MAPE)

    print(stock, "\tMAPE: %.2f \tDA: %.2f" % (MAPE, DA))

THAI 	MAPE: 1.57 	DA: 0.54


In [42]:
# Score the stacked ensemble per stock in target_stocks on the test set and leave
# x_tmp / y_tmp / y_pred of the last stock in scope for the plotting cell.
MAPEs = []
for stock in target_stocks:
    # Test rows of this stock (Ticker holds label-encoded ids), sorted by index.
    x_tmp = x_test.loc[x_test['Ticker'] == le.transform([stock])[0]].copy()
    x_tmp = x_tmp.sort_index()
    y_tmp = x_tmp[Horizon].values
    close_t = x_tmp['Close(t)'].values

    # True direction: 1 when the target is at or above Close(t), else 0.
    y_true_da = (y_tmp - close_t >= 0).astype(int)

    x_tmp = x_tmp.drop([Horizon], axis=1)

    # Base-model inputs: ward columns followed by pca columns.
    sel_x_tmp = np.concatenate((ward.transform(x_tmp.values), pca.transform(x_tmp.values)), axis=1)

    y_dt = dt.predict(sel_x_tmp)
    y_rf = rf.predict(sel_x_tmp)
    y_ada = ada.predict(sel_x_tmp)
    y_gb = gb.predict(sel_x_tmp)
    y_xgb = xgb.predict(xgboost.DMatrix(sel_x_tmp))
    y_knn = knn.predict(sel_x_tmp)
    y_bay = bay.predict(sel_x_tmp)

    # The plain average of the seven predictions that used to be computed here was
    # overwritten before any use, so it has been removed; the stacked prediction
    # below is the only y_pred this cell produces.

    # Meta-features in the same column order as the matrix the stacking PCA was
    # fitted on: seven base predictions, then the selected features.
    y_stack = np.column_stack((
        y_dt, y_rf, y_ada, y_gb, y_xgb, y_knn, y_bay,
        sel_x_tmp
    ))

    sel_y_stack = sel_stack.transform(y_stack)
    y_pred = gb_stack.predict(sel_y_stack)

    # Predicted direction, using the same >= 0 rule as the true direction.
    y_pred_da = (np.ravel(y_pred) - close_t >= 0).astype(int)

    MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
    DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
    MAPEs.append(MAPE)

    print(stock, "\tMAPE: %.2f \tDA: %.2f" % (MAPE, DA))

THAI 	MAPE: 1.56 	DA: 0.61


In [47]:
# y_concat = np.concatenate((y_tmp.reshape(-1,1), np.array(y_price).reshape(-1,1), y_pred.reshape(-1,1)), axis=1)
# df_concat = pd.DataFrame.from_records(y_concat, columns=['actual', 'price', 'predict'])
# df_concat

In [48]:
y_price = [184.88363473, 184.90108689, 186.87773514, 186.87773514,
       190.33859785, 191.37324646, 190.42152105, 189.92359746,
       189.91238063, 189.91238063, 190.40074789, 190.40074789,
       190.40681379, 190.40681379, 187.46373287, 189.88205114,
       188.91504237, 188.91504237, 190.3886161 , 189.4197774 ,
       189.4197774 , 187.94254381, 187.94254381, 187.91919518,
       188.89335425, 188.89335425, 189.8873715 , 191.37748242,
       191.89177378, 189.45085242, 191.87781205, 191.89526421,
       191.90814154, 193.38096973, 192.41728197, 192.9038193 ,
       192.9038193 , 193.88661973, 195.37415518, 195.37415518,
       197.85221746, 197.85221746, 198.87215881, 198.4062252 ,
       198.4062252 , 196.93248205, 196.93248205, 198.87547981,
       198.87547981, 198.87547981, 198.892017  , 198.892017  ,
       197.92043341, 195.94453069, 195.94453069, 195.94453069,
       198.86077255, 202.81440792, 202.8783879 , 202.8783879 ,
       201.91710616, 201.91710616, 201.91710616, 201.91710616,
       201.91710616, 201.91710616, 201.91710616, 201.91710616,
       199.94120344, 197.45707527, 198.39832937, 195.45192746,
       196.89351119, 196.89351119, 198.36908428, 197.41312291,
       198.39060298, 197.41569838, 197.41569838, 192.49929021,
       189.97876664, 189.92268249, 189.92268249, 188.43697697,
       188.43697697, 188.41877928, 188.41877928, 188.41877928,
       188.90199561, 193.32654608, 193.38371463, 193.38371463,
       193.89783656, 193.89783656, 193.41295972, 193.41295972,
       192.42758383, 190.94004838, 189.93481429, 190.89684156,
       190.89684156, 190.90382242, 194.34557247, 197.33369014,
       196.40622833, 197.39143479, 197.39143479, 199.8531293 ,
       202.83426611, 204.84730975, 204.84730975, 200.95948429,
       200.95948429, 203.86691536, 203.86691536, 203.88270702,
       201.93438889, 201.93438889, 200.92823984, 200.92823984,
       204.83334802, 207.82238066, 207.82238066, 212.78880708,
       210.90114794, 210.902639  , 217.76553537, 213.9229161 ,
       213.9229161 , 215.87421636, 215.87421636, 215.87787622,
       215.87787622, 212.9425217 , 209.96046993, 205.9810799 ,
       205.9810799 , 208.8627563 , 207.8966625 ]

In [49]:
y_price = [18.69449282, 18.69449282, 19.087247  , 20.07398518, 20.07398518,
       19.69542317, 19.69542317, 19.40115574, 19.69002147, 19.69002147,
       20.4776241 , 20.4776241 , 20.39068627, 20.39068627, 20.49178238,
       20.7870461 , 20.59482364, 21.47763948, 21.48752791, 21.59035229,
       21.29699983, 21.39206393, 21.39206393, 21.39206393, 20.11528073,
       19.80545885, 20.38425438, 19.89904545, 19.50331592, 19.50331592,
       19.39769919, 19.2962371 , 19.2962371 , 19.58907446, 19.58907446,
       19.88676455, 19.79376082, 19.79376082, 19.69590437, 19.98874174,
       20.09005473, 20.19100173, 20.19100173, 19.70178728, 19.69623647,
       19.8900381 , 19.69608736, 19.10555992, 19.10555992, 19.10555992,
       18.80268501, 18.89414346, 18.89381136, 18.79613791, 18.79613791,
       18.69673618, 19.08776209, 18.99391117, 19.289324  , 19.1942599 ,
       19.1942599 , 19.09640346, 18.99648664, 18.70019273, 18.79422664,
       18.79389454, 18.79389454, 17.81252417, 17.11307255, 17.11307255,
       16.70667518, 16.70667518, 16.69821681, 16.69821681, 16.69615644,
       16.79434498, 16.89374671, 16.89547498, 16.79780153, 16.79780153,
       17.18934253, 17.18934253, 17.39005725, 17.39005725, 17.29635544,
       18.17986936, 17.99389579, 17.99389579, 17.99610527, 17.99507508,
       17.50413235, 17.69444355, 17.69429444, 18.38264445, 18.29294816,
       18.49171773, 18.39544045, 18.39544045, 18.59163454, 18.59163454,
       18.59163454, 19.18267709, 18.99460926, 18.99527346, 19.09243182,
       18.79907936]

In [50]:
# TU
y_price = [19.70877885, 19.70999203, 19.51413003, 19.31532658, 19.31187004,
       19.60540549, 19.51085648, 19.70756567, 19.70756567, 19.80766549,
       20.10447449, 19.91225203, 19.91225203, 19.91137095, 19.91137095,
       19.91137095, 20.00852931, 20.00852931, 20.00974249, 19.81388049,
       19.81145413, 19.81042395, 19.90861249, 19.90861249, 19.81163712,
       19.81093904, 19.7122354 , 19.90739931, 19.90931058, 19.90931058,
       19.91034076, 19.91034076, 20.2049064 , 20.2049064 , 20.30673449,
       20.30949295, 20.30949295, 20.0154424 , 20.0154424 , 20.0154424 ,
       19.71723722, 19.81024095, 19.71172031, 19.71102222, 20.10326131,
       20.10326131, 20.00992548, 19.71620704, 19.41748677, 19.41748677,
       19.5104905 , 19.5104905 , 19.41196985, 19.41196985, 19.01851758,
       19.01314977, 19.10927794, 19.11049112, 19.11049112, 18.71825203,
       18.9097764 , 18.61557675, 18.31840176, 18.21502839, 18.21502839,
       18.40864703, 18.50874684, 18.41280166, 18.41280166, 18.51029212,
       18.51029212, 18.41280166, 18.41280166, 18.11753793, 18.01519476,
       18.0124363 , 18.0124363 , 18.99380667, 18.71137282, 18.61469567,
       18.51374866, 18.11926621, 18.11926621, 18.11926621, 18.11926621,
       18.1138984 ]

In [51]:
# BBL
y_price = [179.83995857, 179.83646813, 180.81577813, 180.33696719,
       180.33696719, 181.31793769, 179.85466583, 179.85466583,
       180.82350453, 180.82350453, 179.84602446, 178.85715814,
       178.85715814, 179.82176087, 179.82874174, 179.83389267,
       178.85200721, 179.82176087, 180.31968447, 180.82184403,
       179.84859993, 180.82092906, 180.82790992, 181.32400359,
       181.82101222, 181.82101222, 183.30248177, 183.30248177,
       183.81419766, 184.80987542, 184.80987542, 184.82458268,
       184.33879088, 184.33879088, 185.31461045, 185.31461045,
       185.32416678, 184.83837498, 184.83837498, 184.83837498,
       186.79608   , 187.30871086, 189.28884954, 188.33380314,
       188.33380314, 188.33380314, 188.33380314, 188.33380314,
       188.33197321, 188.33197321, 185.87210862, 185.87210862,
       184.36895094, 182.37410499, 182.37410499, 180.37834408,
       180.37834408, 179.85283589, 179.83646813, 180.3248354 ,
       181.31278676, 181.31278676, 179.3637231 , 179.34461044,
       179.8252513 , 179.8252513 , 181.79508812, 181.79508812,
       178.8762708 , 178.8762708 , 178.8762708 , 180.32300546,
       180.81669309, 180.83048539, 181.32400359, 180.83912675,
       180.83912675, 180.34469359, 178.3722813 , 178.83638498,
       179.32309174, 179.32309174, 180.8045613 , 183.28004811,
       183.318104  , 181.36721041, 181.34294681, 181.34294681,
       182.80547314, 185.76932722, 185.81344901, 184.35607361,
       185.81070411, 185.81070411, 185.81070411, 185.81070411,
       185.82117541, 183.37418815, 183.37418815, 183.83480139,
       183.83480139, 185.79176088, 186.30954267, 186.30954267,
       188.78062409, 187.83164358, 187.83238911, 189.7910091 ,
       190.30621543, 190.30621543, 190.30621543, 190.30621543,
       190.30621543, 191.30446865, 189.35540499, 187.37252141,
       186.84701322, 185.35781727, 185.35781727, 186.31892957,
       186.31892957, 185.83239224, 185.83239224, 185.34053454,
       184.35000771, 186.29907137, 186.29907137, 188.77289769]

In [58]:
# PTT
y_price = [391.79651198, 392.74566192, 391.77075732, 391.77075732,
       389.80000554, 389.80000554, 389.80000554, 388.78870555,
       389.74815735, 387.79136729, 387.79136729, 387.77225463,
       387.77225463, 388.74383823, 388.74383823, 388.74383823,
       388.75597003, 388.76112096, 389.74300642, 389.75513822,
       389.75513822, 388.77840369, 388.77840369, 386.80250097,
       386.80250097, 386.80250097, 385.79120098, 380.85934002,
       380.85934002, 385.70295741, 392.61105996, 388.79419535,
       391.72738107, 392.72505819, 392.72505819, 392.72505819,
       392.72505819, 392.75264278, 392.75779372, 394.72156464,
       393.76394277, 392.78022738, 392.78022738, 393.74483011,
       393.74483011, 391.78804005, 391.78804005, 392.75081285,
       389.8069864 , 390.7576274 , 390.7576274 , 382.89922271,
       384.7710902 , 384.7710902 , 384.75414634, 384.75414634,
       381.81879182, 382.76428189, 382.76428189, 380.79718997,
       384.70561915, 383.76195901, 379.8428891 , 378.80732552,
       379.75647546, 383.69099817, 376.87147806, 377.78904467,
       379.72889087, 379.72889087, 378.77641993, 379.75647546,
       379.75647546, 376.81779994, 376.81779994, 374.82278456,
       369.87364086, 369.87364086, 372.7483364 , 373.7408626 ,
       373.7408626 , 372.78656173, 372.78656173, 368.85203902,
       373.70778821, 373.70778821, 373.70778821, 373.70778821,
       372.765958  , 371.7976954 , 375.70795452, 373.78755985,
       373.78755985, 373.78755985, 373.78755985, 375.7476709 ,
       379.68917448, 379.74800353, 377.80483633, 377.78057274,
       377.77027088, 384.6434691 , 387.67404806, 387.74649997,
       387.74649997, 387.76195277, 387.76195277, 385.79818185,
       385.79818185, 380.86449095, 380.86449095, 379.81164464,
       379.81164464, 380.75564365, 380.75564365, 380.75564365,
       381.74450997, 381.7617927 , 383.73071456, 381.79120723,
       380.79536003, 380.79536003, 380.79536003, 383.71858276,
       383.74982722, 384.74716547, 388.68683911, 388.68683911,
       392.66805908, 391.75530453, 393.72754738, 392.76477458,
       393.74483011, 393.74483011, 393.74483011, 393.74483011,
       393.75181097, 399.64827467, 401.68483637, 401.68483637,
       398.79434917, 398.79434917, 399.75014111, 399.75014111,
       398.76493465, 399.73983924, 399.73983924, 397.78304919,
       397.78304919, 405.61902022, 405.70577272]

In [43]:
# Display the predictions currently held in y_pred, i.e. whatever the most recently
# executed evaluation cell left behind (the full array is echoed).
y_pred

array([18.65686081, 18.59490353, 18.99476844, 19.91732867, 19.89860269,
       19.49855347, 19.49716636, 19.20848431, 19.51467055, 19.5036313 ,
       20.32836417, 20.38392905, 20.2727162 , 20.25953937, 20.40871005,
       20.65961292, 20.53987603, 21.37565549, 21.26743029, 21.49266367,
       21.2012527 , 21.30908291, 21.35185617, 21.30492923, 20.07994631,
       19.83040921, 20.28330966, 19.72363869, 19.35484223, 19.30332741,
       19.20888462, 19.13798367, 19.13798367, 19.41597093, 19.40334324,
       19.74779092, 19.64709672, 19.6332697 , 19.46672466, 19.87120507,
       19.94755775, 20.03441437, 20.02876651, 19.52267065, 19.49839693,
       19.74512779, 19.44507747, 18.91885666, 18.92324108, 18.91465264,
       18.72524137, 18.79779986, 18.76431772, 18.68834958, 18.680613  ,
       18.60573295, 18.92006402, 18.82297524, 19.08170694, 18.95410351,
       19.0205283 , 18.97474323, 18.86238688, 18.62642265, 18.71360639,
       18.62388315, 18.65936432, 17.64675391, 17.08595563, 17.16

In [60]:
# Line chart of actual vs. ensemble vs. baseline prices over the test dates of the
# stock left in x_tmp by the previous evaluation cell.
# NOTE(review): y_price must be the baseline series for that same stock and date
# range, and the y-axis title below is hard-coded to PTT; confirm both match the
# stock being plotted.
trace0 = go.Scatter(
    x=x_tmp.index,
    y=y_tmp,
    mode='lines+markers',
    name='Actual',
    line={'color': 'rgb(205, 12, 24)', 'width': 2},
)
trace1 = go.Scatter(
    x=x_tmp.index,
    y=y_pred,
    mode='lines+markers',
    name='Ensemble',
    line={'color': 'rgb(22, 96, 167)', 'width': 2},
)
# The baseline shares the ensemble colour and is told apart by its dotted stroke.
trace2 = go.Scatter(
    x=x_tmp.index,
    y=y_price,
    mode='lines+markers',
    name='Baseline',
    line={'color': 'rgb(22, 96, 167)', 'width': 2, 'dash': 'dot'},
)

data = [trace0, trace1, trace2]

# Figure layout: axis titles, font size and canvas size.
layout = {
    'title': '',
    'xaxis': {'title': 'Date'},
    'yaxis': {'title': 'PTT: Stock Price (Baht)'},
    'font': {'size': 16},
    'height': 800,
    'width': 1600,
}
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [20]:
def mean_absolute_percentage_error(y_true, y_pred): 
    """Return the mean absolute percentage error between two series, in percent.

    Both inputs are converted to flat float arrays before the element-wise
    comparison. This keeps an (n,) target and an (n, 1) prediction from silently
    broadcasting into an n x n matrix, and lets plain Python lists be passed.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to zero lead to a division by zero
        (inf / nan in the result), as no special handling is applied.
    y_pred : array-like
        Predicted values with the same number of elements as ``y_true``.

    Returns
    -------
    numpy.float64
        ``mean(|y_true - y_pred| / |y_true|) * 100``.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def evaluator(clf, df_test, le, isXGB=False, isLSTM=False,  ward=None, pca=None):
    """Print per-stock and mean test metrics for one fitted regressor.

    For every ticker in the global ``target_stocks`` the matching rows of
    ``df_test`` are selected, the target column (global ``Horizon``) is split off,
    the remaining columns are optionally transformed, and RMSE, MAE, MAPE and
    directional accuracy (DA) are printed. Nothing is returned.

    Parameters
    ----------
    clf : fitted model
        Object exposing ``predict``. When ``isXGB`` is true it is called with an
        ``xgboost.DMatrix``; otherwise with the feature array.
    df_test : pandas.DataFrame
        Test frame containing a label-encoded ``'Ticker'`` column, ``'Close(t)'``
        and the ``Horizon`` target column.
    le : fitted label encoder
        Used to map each ticker name to its encoded id.
    isXGB : bool, default False
        Wrap features in a ``DMatrix`` before predicting.
    isLSTM : bool, default False
        Accepted for call compatibility; not used by this function.
    ward, pca : fitted transformers or None
        If both are given their outputs are concatenated (ward first); if only
        one is given it is used alone; if neither, the raw columns are used.

    Notes
    -----
    DA is the share of rows where the sign test ``value - Close(t) >= 0`` agrees
    between the true target and the prediction. Tickers with no rows in
    ``df_test`` are reported and skipped instead of raising inside the metrics.
    """
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        if x_tmp.empty:
            print(stock, "\tno test rows - skipped")
            continue

        y_tmp = x_tmp[Horizon].values
        close_t = x_tmp['Close(t)'].values

        # True direction: 1 when the target is at or above Close(t), else 0.
        y_true_da = (y_tmp - close_t >= 0).astype(int)

        # Drop the target by the same name it was read with rather than a
        # hard-coded 'Close(t+1)', so the two cannot drift apart.
        x_tmp = x_tmp.drop([Horizon], axis=1)
        features = x_tmp.values

        # Features transformation
        if ward is not None and pca is not None:
            sel_x_tmp = np.concatenate((ward.transform(features), pca.transform(features)), axis=1)
        elif ward is not None:
            sel_x_tmp = ward.transform(features)
        elif pca is not None:
            sel_x_tmp = pca.transform(features)
        else:
            sel_x_tmp = deepcopy(features)

        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(sel_x_tmp))
        else:
            y_pred = clf.predict(sel_x_tmp)
        # Flatten so (n, 1) outputs line up element-wise with the (n,) targets.
        y_pred = np.ravel(y_pred)

        # Predicted direction, using the same >= 0 rule as the true direction.
        y_pred_da = (y_pred - close_t >= 0).astype(int)

        RMSE = np.sqrt(sklearn.metrics.mean_squared_error(y_tmp, y_pred))
        MAE = sklearn.metrics.mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)

    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))
    
def ensemble_evaluator(df_test, ward=None, pca=None):
    """Evaluate the stacked ensemble stock by stock and print error metrics.

    For every ticker in the notebook-level ``target_stocks`` list, the rows of
    ``df_test`` belonging to that ticker are selected, optionally transformed
    with ``ward`` and/or ``pca``, passed through the seven base models, and the
    base predictions are fed to the ``gb_stack`` meta-model. RMSE, MAE, MAPE and
    directional accuracy (DA) are printed per stock, followed by their means.

    Parameters
    ----------
    df_test : pandas.DataFrame
        Test frame. Must contain the columns ``'Ticker'`` (compared against
        ``le.transform([stock])``), ``Horizon`` (the target), ``'Close(t)'``
        and ``'Close(t+1)'``.
    ward, pca : objects with a ``transform`` method, optional
        Feature transformers applied to the feature matrix. When both are
        given their outputs are concatenated column-wise (ward first); when
        neither is given the raw feature values are used.

    Returns
    -------
    None
        Results are only printed.

    Notes
    -----
    Relies on names defined elsewhere in the notebook: ``target_stocks``,
    ``le``, ``Horizon``, the base models ``dt``, ``rf``, ``ada``, ``gb``,
    ``xgb``, ``knn``, ``bay``, the meta-model ``gb_stack`` and the helper
    ``mean_absolute_percentage_error``. They must exist (and be fitted) before
    this function is called.
    """
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []

    for stock in target_stocks:
        stock_rows = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()

        # A ticker without test rows would make the transformers / metrics
        # raise on an empty array; report it and move on instead.
        if stock_rows.empty:
            print(stock, "\tno rows in df_test - skipped")
            continue

        y_tmp = stock_rows[Horizon].values
        close_t = stock_rows['Close(t)'].values

        # True direction: 1 when the target is at or above Close(t), else 0.
        y_true_da = (y_tmp - close_t >= 0).astype(int)

        # NOTE(review): the target is read through `Horizon` but removed from
        # the features via the literal 'Close(t+1)'. These presumably name the
        # same column - confirm, otherwise the target leaks into the features.
        x_tmp = stock_rows.drop(['Close(t+1)'], axis=1)
        raw_features = x_tmp.values

        # Features transformation (same branch order as the training side is
        # assumed to use - TODO confirm against the fitting code).
        if ward is not None and pca is not None:
            sel_x_tmp = np.concatenate(
                (ward.transform(raw_features), pca.transform(raw_features)), axis=1)
        elif ward is not None:
            sel_x_tmp = ward.transform(raw_features)
        elif pca is not None:
            sel_x_tmp = pca.transform(raw_features)
        else:
            sel_x_tmp = deepcopy(raw_features)

        # Base-model predictions. `xgb` is the only model that is handed an
        # xgboost.DMatrix instead of the plain array.
        base_predictions = [
            dt.predict(sel_x_tmp),
            rf.predict(sel_x_tmp),
            ada.predict(sel_x_tmp),
            gb.predict(sel_x_tmp),
            xgb.predict(xgboost.DMatrix(sel_x_tmp)),
            knn.predict(sel_x_tmp),
            bay.predict(sel_x_tmp),
        ]

        # One column per base model, in the order listed above; the raw
        # features are not appended to the stack.
        y_stack = np.concatenate(
            [pred.reshape(-1, 1) for pred in base_predictions], axis=1)

        # The meta-model's output is the ensemble prediction.
        y_pred = gb_stack.predict(y_stack)

        # Predicted direction, using the same >= 0 rule as the true direction.
        y_pred_da = (y_pred.reshape(-1) - close_t >= 0).astype(int)

        RMSE = np.sqrt(sklearn.metrics.mean_squared_error(y_tmp, y_pred))
        MAE = sklearn.metrics.mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = sklearn.metrics.accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)

    # Nothing to average when every ticker was skipped.
    if not RMSEs:
        print('\nno stocks evaluated')
        return

    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))