# Regression with Pantip (all)

In [36]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from talib.abstract import *
import pandas as pd
import os
import copy
import numpy as np

from pythainlp.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score
from sklearn import preprocessing

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from jupyterthemes import jtplot
jtplot.style()

import plotly.graph_objs as go
from datetime import datetime, timedelta
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
% matplotlib inline

# target_stocks = ['BANPU','IRPC','PTT','BBL','KBANK','SCB','AOT','THAI','CPF','MINT',
#                  'TU','SCC','CPN','CK','CPALL','HMPRO','BDMS','BH','ADVANC','JAS','TRUE']

target_stocks = ['BANPU']

df_price = pd.read_csv('merged_2013_2018.csv')
df_price['Date'] = pd.to_datetime(df_price['Date'], format='%Y-%m-%d')
df_price = df_price.loc[df_price['Ticker'].isin(target_stocks)]
df_price['Date'] = df_price['Date'].dt.date
df_price = df_price.set_index('Date')
df_price.tail(3)
len(df_price)

df_pantip = pd.read_csv('data/pantip_all.csv')
df_pantip['Date'] = pd.to_datetime(df_pantip['Date'], format='%Y-%m-%d')
df_pantip = df_pantip.set_index('Date')
df_pantip = df_pantip.sort_index()
df_pantip = df_pantip['2014-1-1':'2018-2-8']
df_pantip.index = df_pantip.index.date
df_pantip.head(3)
df_pantip.tail(3)

'Total:', len(df_pantip)

Unnamed: 0_level_0,Ticker,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-02-14,BANPU,21.2,21.3,20.8,20.8,53688000
2018-02-15,BANPU,20.9,21.8,20.9,21.8,96710000
2018-02-16,BANPU,21.9,22.0,21.5,21.7,77510900


1254

Unnamed: 0,Ticker,Text
2014-01-01,SCB,หุ้น ธนาคาร ซื้อ ลงทุน พรุ่งนี้ ตัว ไหน ดี ครั...
2014-01-01,BBL,หุ้น ธนาคาร ซื้อ ลงทุน พรุ่งนี้ ตัว ไหน ดี ครั...
2014-01-01,KBANK,หุ้น ธนาคาร ซื้อ ลงทุน พรุ่งนี้ ตัว ไหน ดี ครั...


Unnamed: 0,Ticker,Text
2018-02-08,SCB,วัน พฤหัสบดี ที่ ๘ เดือน กุมภาพันธ์ พุทธ ศักรา...
2018-02-08,PTT,มา วิเคราะห์ กัน ดีกว่า จะ แตก พา ร์ ไหม และ ก...
2018-02-08,KBANK,ทำไม วันนี้ หุ้น น แบงค์ ตัว อื่น ขึ้น แต่ ทำไ...


('Total:', 9873)

# Lag & Horizon Construction

In [37]:
N_lags = 3
N_horizon = 1

df_train = []
df_test = []
for stock in tqdm_notebook(target_stocks):
    news_stocks = []
    df_stock = df_pantip.loc[df_pantip['Ticker'] == stock]
    prev_date = None
    prev_text = None
    
    pbar = tqdm_notebook(total=len(df_stock))
    for date, row in df_stock.iterrows():
        if prev_date == None:
            prev_date = date
            prev_text = row['Text']
        elif prev_date != date:
            # horizon
            tmp_date = copy.deepcopy(prev_date)
            tmp_date += timedelta(days=1)
            prices = []
            count_lags = 0 
            while count_lags < N_horizon:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date += timedelta(days=1)
                if len(price) == 0: continue
                prices.append(price[0][4]) # Close price next day(s)
                count_lags+=1
            
            # lag
            tmp_date = copy.deepcopy(prev_date)
            count_lags = 0 
            while count_lags <= N_lags:
                price = df_price.loc[(df_price.index == tmp_date) & (df_price['Ticker'] == stock)].values
                tmp_date -= timedelta(days=1)
                if len(price) == 0: continue
                for val in price[0][:-1]: 
                    if type(val) != str: prices.append(val)
                count_lags+=1

            news_stocks.append([prev_date, stock, prev_text] + prices)
            
            prev_date = date
            prev_text = row['Text']
        elif prev_date == date:
            prev_text += ' '+row['Text']
        
        pbar.update(1)
    pbar.close()

    news_stocks = pd.DataFrame.from_records(news_stocks)
    news_stocks.columns = ['Date', 'Ticker', 'Text', 
                           'Close(t+1)', # 'Close(t+2)','Close(t+3)','Close(t+4)','Close(t+5)',
                           'Open(t)', 'High(t)', 'Low(t)', 'Close(t)',
                           'Open(t-1)', 'High(t-1)', 'Low(t-1)', 'Close(t-1)',
                           'Open(t-2)', 'High(t-2)', 'Low(t-2)', 'Close(t-2)',
                           'Open(t-3)', 'High(t-3)', 'Low(t-3)', 'Close(t-3)',
#                            'Open(t-4)', 'High(t-4)', 'Low(t-4)', 'Close(t-4)',
#                            'Open(t-5)', 'High(t-5)', 'Low(t-5)', 'Close(t-5)'
                          ]
    news_stocks = news_stocks.set_index('Date')
    
    train_size = int(len(news_stocks) * 0.80)
    test_size = len(news_stocks) - train_size
    train, test = news_stocks.iloc[:train_size], news_stocks.iloc[train_size:]
    print(stock, ':\t',len(train), len(test))    
    df_train.append(train)
    df_test.append(test)
    

df_train = pd.concat(df_train, axis=0)
df_test = pd.concat(df_test, axis=0)

len(df_train), len(df_test) 
df_train.head(1)
df_test.head(1)

BANPU :	 172 43



(172, 43)

Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2014-01-06,BANPU,ดอย เกิด ไร ขึ้น คะ ดอย บ้าน ปู ที่ ลง ทุกวัน ...,26.0,26.0,26.25,24.8,25.25,27.5,28.25,26.5,26.5,30.0,30.25,28.0,28.0,30.0,30.25,29.0,30.25


Unnamed: 0_level_0,Ticker,Text,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),Open(t-2),High(t-2),Low(t-2),Close(t-2),Open(t-3),High(t-3),Low(t-3),Close(t-3)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2017-04-26,BANPU,ราคา เท่ากัน ตัว ไหน น่าสนใจ กว่า ช่วงนี้ กับ ...,19.5,19.5,19.7,19.0,19.3,20.6,20.6,19.3,19.5,20.8,20.9,20.4,20.6,20.6,20.8,20.5,20.7


In [25]:
# df_train.to_csv('data/pantip_train_(t-3).csv', index=False)
# df_test.to_csv('data/pantip_test_(t-3).csv', index=False)

# TF-IDF Vetorization

In [38]:
stop_words = stopwords.words('thai')
stop_words.remove('ขึ้น')
stop_words.remove('ลง')

vertorizer = TfidfVectorizer(stop_words=stop_words, 
                             max_df=0.9, 
                             min_df=2, 
                             max_features=500)

tfidf_train = vertorizer.fit_transform(df_train['Text'])
tfidf_test = vertorizer.transform(df_test['Text'])

df_tfidf_train = pd.DataFrame.from_records(tfidf_train.toarray())
df_tfidf_test = pd.DataFrame.from_records(tfidf_test.toarray())

df_tfidf_train = df_tfidf_train.set_index(df_train.index)
df_tfidf_test = df_tfidf_test.set_index(df_test.index)

len(df_tfidf_train), len(df_tfidf_test)

# replace Text with TF-IDF vector
x_train = df_train.drop(['Text'], axis=1)
x_train = pd.concat([x_train, df_tfidf_train], axis=1)

x_test = df_test.drop(['Text'], axis=1)
x_test = pd.concat([x_test, df_tfidf_test], axis=1)

# Label Encoding
le = preprocessing.LabelEncoder()
x_train['Ticker'] = le.fit_transform(x_train['Ticker'])
x_test['Ticker'] = le.transform(x_test['Ticker'])
x_train.head(2)
x_test.head(2)
le.classes_

(172, 43)

Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,490,491,492,493,494,495,496,497,498,499
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2014-01-06,0,26.0,26.0,26.25,24.8,25.25,27.5,28.25,26.5,26.5,...,0.0,0.0,0.0,0.099438,0.0,0.0,0.062293,0.0,0.0,0.0
2014-01-07,0,26.5,25.0,26.5,25.0,26.0,26.0,26.25,24.8,25.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Ticker,Close(t+1),Open(t),High(t),Low(t),Close(t),Open(t-1),High(t-1),Low(t-1),Close(t-1),...,490,491,492,493,494,495,496,497,498,499
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-04-26,0,19.5,19.5,19.7,19.0,19.3,20.6,20.6,19.3,19.5,...,0.0,0.0,0.0,0.075259,0.0,0.0,0.0,0.0,0.0,0.0
2017-05-10,0,17.9,19.2,19.3,18.4,18.5,19.2,19.4,19.1,19.2,...,0.0,0.0,0.0,0.135617,0.0,0.038058,0.0,0.0,0.0,0.0


array(['BANPU'], dtype=object)

# Create x_train and y_train

In [39]:
Horizon = 'Close(t+1)'
y_train = x_train[[Horizon]]
x_train = x_train.drop(['Close(t+1)'], axis=1).copy()
x_train.shape, y_train.shape

((172, 517), (172, 1))

# Evaluate Each Stcok

In [28]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [29]:
def evaluator(clf, df_test, le, isXGB=False, isLSTM=False):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        if isXGB:
            y_pred = clf.predict(xgboost.DMatrix(x_tmp))
        elif isLSTM:
            x = x_tmp.values
            x = x.reshape((x.shape[0], x.shape[1], 1))
            y_pred = clf.predict(x)
        else:
            y_pred = clf.predict(x_tmp.as_matrix())
        
        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'],(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

In [30]:
def ensemble_evaluator(dt, rf, ada_dt, ada_rf, gb, xgb, df_test, le):
    RMSEs, MAEs, MAPEs, DAs = [], [], [], []
    results = []
    for stock in target_stocks:
        x_tmp = df_test.loc[df_test['Ticker'] == le.transform([stock])[0]].copy()
        
        y_tmp = x_tmp[Horizon].values
        
        # Directional Accuracy
        changes = x_tmp[Horizon] -  x_tmp['Close(t)']
        y_true_da = []
        for change in changes:
            y_true_da.append(1 if change >= 0 else 0)
                
        x_tmp = x_tmp.drop(['Close(t+1)'], axis=1)
        
        # Prediction
        
        y_dt = dt.predict(x_tmp.as_matrix())
        y_rf = rf.predict(x_tmp.as_matrix())
        y_ada_dt = ada_dt.predict(x_tmp.as_matrix())
        y_ada_rf = ada_rf.predict(x_tmp.as_matrix())
        y_gb = gb.predict(x_tmp.as_matrix())
        
        y_xgb = xgb.predict(xgboost.DMatrix(x_tmp))
        
#             x = x_tmp.values
#             x = x.reshape((x.shape[0], 1, x.shape[1]))
#             y_pred = clf.predict(x)
        y_pred = (y_dt+y_rf+y_ada_dt+y_ada_rf+y_gb+y_xgb)/6

        # Directional Accuracy Pred
        changes = np.reshape(y_pred, (-1,1)) -  np.reshape(x_tmp['Close(t)'],(-1,1))
        y_pred_da = []
        for change in changes:
            y_pred_da.append(1 if change >= 0 else 0)
        
        RMSE = np.sqrt(mean_squared_error(y_tmp, y_pred))
        MAE = mean_absolute_error(y_tmp, y_pred)
        MAPE = mean_absolute_percentage_error(y_tmp, y_pred)
        DA = accuracy_score(y_true_da, y_pred_da)
        print(stock, "\tRMSE: %.2f\t MAE: %.2f \tMAPE: %.2f \tDA: %.2f" % (RMSE, MAE, MAPE, DA))
        RMSEs.append(RMSE)
        MAEs.append(MAE)
        MAPEs.append(MAPE)
        DAs.append(DA)
    
    print('\nmean RMSE:', round(np.mean(RMSEs),2))
    print('mean MAE:', round(np.mean(MAEs),2))
    print('mean MAPE:', round(np.mean(MAPEs),2))
    print('mean DA:', round(np.mean(DAs),4))

# Ensemble

In [20]:
ensemble_evaluator(decis_tree_regr, 
                   rnd_forest_regr, 
                   adaboost_dt_regr, 
                   adaboost_rf_regr, 
                   gbr, 
                   xgb,
                   x_test, le)

BANPU 	RMSE: 0.32	 MAE: 0.21 	MAPE: 1.16 	DA: 0.62

mean RMSE: 0.32
mean MAE: 0.21
mean MAPE: 1.16
mean DA: 0.625



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# Linear Regression

In [40]:
from sklearn import linear_model

lineregr = linear_model.LinearRegression()
lineregr.fit(x_train, y_train)

evaluator(lineregr, x_test, le)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

BANPU 	RMSE: 0.79	 MAE: 0.61 	MAPE: 12.97 	DA: 0.53

mean RMSE: 0.79
mean MAE: 0.61
mean MAPE: 12.97
mean DA: 0.5349



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# Support Vector Regressor

In [41]:
from sklearn.svm import SVR
svr = SVR()
svr.fit(x_train, y_train)

evaluator(svr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

BANPU 	RMSE: 0.59	 MAE: 0.46 	MAPE: 2.41 	DA: 0.53

mean RMSE: 0.59
mean MAE: 0.46
mean MAPE: 2.41
mean DA: 0.5349



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# Decistion Tree Regressor

In [42]:
from sklearn import tree

decis_tree_regr = tree.DecisionTreeRegressor()
decis_tree_regr.fit(x_train, y_train)
evaluator(decis_tree_regr, x_test, le)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

BANPU 	RMSE: 1.10	 MAE: 0.73 	MAPE: 3.62 	DA: 0.47

mean RMSE: 1.1
mean MAE: 0.73
mean MAPE: 3.62
mean DA: 0.4651



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# Random Forest Regrssor

In [43]:
from sklearn import ensemble

rnd_forest_regr = ensemble.RandomForestRegressor(n_jobs=-1)
rnd_forest_regr.fit(x_train, y_train)

evaluator(rnd_forest_regr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

BANPU 	RMSE: 0.73	 MAE: 0.55 	MAPE: 2.84 	DA: 0.53

mean RMSE: 0.73
mean MAE: 0.55
mean MAPE: 2.84
mean DA: 0.5349



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# AdaBoost Regressor

In [44]:
adaboost_dt_regr = ensemble.AdaBoostRegressor(base_estimator=tree.DecisionTreeRegressor(),
                                           learning_rate=1, 
                                           n_estimators=50, 
                                           loss='linear')
adaboost_dt_regr.fit(x_train, y_train)

evaluator(adaboost_dt_regr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best'),
         learning_rate=1, loss='linear', n_estimators=50,
         random_state=None)

BANPU 	RMSE: 0.78	 MAE: 0.52 	MAPE: 2.63 	DA: 0.58

mean RMSE: 0.78
mean MAE: 0.52
mean MAPE: 2.63
mean DA: 0.5814



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



In [45]:
adaboost_rf_regr = ensemble.AdaBoostRegressor(base_estimator=ensemble.RandomForestRegressor(n_jobs=-1),
                                           learning_rate=1, 
                                           n_estimators=50, 
                                           loss='linear')
adaboost_rf_regr.fit(x_train, y_train)

evaluator(adaboost_rf_regr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



AdaBoostRegressor(base_estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
         learning_rate=1, loss='linear', n_estimators=50,
         random_state=None)

BANPU 	RMSE: 0.83	 MAE: 0.59 	MAPE: 3.00 	DA: 0.40

mean RMSE: 0.83
mean MAE: 0.59
mean MAPE: 3.0
mean DA: 0.3953



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# Gradient Boosting Regressor

In [46]:
gbr = ensemble.GradientBoostingRegressor(n_estimators=500, 
                                         learning_rate=0.1,
                                         max_depth=4,
                                         min_samples_split=2,
                                         loss='ls',
                                        )
gbr.fit(x_train, y_train)

evaluator(gbr, x_test, le)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=500, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

BANPU 	RMSE: 0.80	 MAE: 0.60 	MAPE: 3.08 	DA: 0.35

mean RMSE: 0.8
mean MAE: 0.6
mean MAPE: 3.08
mean DA: 0.3488



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



# XGBoost Regressor

In [9]:
import xgboost
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.15, random_state=10)
len(x_train), len(x_valid)

d_train = xgboost.DMatrix(x_train, label=y_train)
d_valid = xgboost.DMatrix(x_valid, label=y_valid)

(816, 144)

In [18]:
params = {
    'booster':'dart',
    'max_depth': 8,
    'learning_rate': 0.01,
    'n_estimators':300,
    'subsample': 0.9,
    'objective': 'reg:tweedie',
    'eval_metric': 'mae',
    'reg_lambda': 0.8,
    'reg_alpha': 0.2,
    'silent': 1,
}

xgb = xgboost.train(params, d_train, 
                    num_boost_round=5000, 
                    evals=[(d_train, 'train'), (d_valid, 'valid')], 
                    early_stopping_rounds=50,
                    verbose_eval=100
                   )

[0]	train-mae:17.8789	valid-mae:18.6906
Multiple eval metrics have been passed: 'valid-mae' will be used for early stopping.

Will train until valid-mae hasn't improved in 50 rounds.
[100]	train-mae:13.048	valid-mae:13.7789
[200]	train-mae:6.7639	valid-mae:7.22708
[300]	train-mae:2.8495	valid-mae:3.07734
[400]	train-mae:1.11178	valid-mae:1.2272
[500]	train-mae:0.44533	valid-mae:0.564752
[600]	train-mae:0.218248	valid-mae:0.378043
[700]	train-mae:0.150215	valid-mae:0.347583
[800]	train-mae:0.120847	valid-mae:0.344008
Stopping. Best iteration:
[797]	train-mae:0.121591	valid-mae:0.343887



In [19]:
evaluator(xgb, x_test, le, isXGB=True)

BANPU 	RMSE: 0.38	 MAE: 0.24 	MAPE: 1.29 	DA: 0.57

mean RMSE: 0.38
mean MAE: 0.24
mean MAPE: 1.29
mean DA: 0.5708



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



In [30]:
# xgboost.plot_importance(xgb, height=0.3)

# LSTM

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM

In [12]:
train_X = x_train.values
val_X = x_valid.values
# test_X = x_test.values

train_y = y_train.values
val_y = y_valid.values
# test_y = y_test.values

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
val_X = val_X.reshape(val_X.shape[0], val_X.shape[1], 1)
# test_X = test_X.reshape((test_X.shape[0], 1, test_X.shape[1]))

print(train_X.shape, train_y.shape)
print(val_X.shape, val_y.shape)
# print(test_X.shape, test_y.shape)

(816, 517, 1) (816, 1)
(144, 517, 1) (144, 1)


In [14]:
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(train_X.shape[1], 1)))
model.add(Dropout(0.3))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.3))
# model.add(LSTM(256, return_sequences=True))
# model.add(Dropout(0.4))
model.add(LSTM(128))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead



In [15]:
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping

checkpoint = ModelCheckpoint(filepath="model/LSTM.h5", 
                             monitor='val_loss', 
                             verbose=1, 
                             save_best_only=True, 
                             save_weights_only=False, 
                             mode='auto', 
                             period=1
                            )

earlystopping = EarlyStopping(monitor='val_loss', 
                      min_delta=0, 
                      patience=100,
                      verbose=1, 
                      mode='auto')

# access via $ tensorboard --logdir=./logs
tensorboard = TensorBoard(log_dir='./logs')

In [18]:
model.fit(x=train_X, 
          y=train_y,
          epochs=1000,
          batch_size=32,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

Train on 816 samples, validate on 144 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000

KeyboardInterrupt: 

In [21]:
evaluator(model, x_test, le, isLSTM=True)

BANPU 	RMSE: 2.10	 MAE: 1.98 	MAPE: 11.19 	DA: 0.60

mean RMSE: 2.1
mean MAE: 1.98
mean MAPE: 11.19
mean DA: 0.5958



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



In [27]:
from keras.layers import Bidirectional

In [33]:
model = Sequential()
model.add(Bidirectional(LSTM(128, return_sequences=True), input_shape=(2, train_X.shape[2])))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.4))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(1))
model.compile(loss='mean_absolute_error', optimizer='adam')


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec() is deprecated, use inspect.signature() instead


inspect.getargspec(

In [34]:
model.fit(x=train_X, 
          y=train_y,
          epochs=10000,
          batch_size=1024,
          validation_data=(val_X, val_y),
          verbose=1,
          shuffle=False,
          callbacks=[checkpoint, earlystopping, tensorboard]
         )

ValueError: Error when checking input: expected bidirectional_11_input to have shape (None, 2, 517) but got array with shape (816, 1, 517)

In [8]:
'hey'

'hey'

In [30]:
evaluator(model, x_test, le, isLSTM=True)

BANPU 	RMSE: 2.11	 MAE: 1.99 	MAPE: 11.29 	DA: 0.60

mean RMSE: 2.11
mean MAE: 1.99
mean MAPE: 11.29
mean DA: 0.5958



reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead



In [37]:
data_dim = 16
timesteps = 8
num_classes = 10
np.random.random((10, timesteps, data_dim)).shape

(10, 8, 16)

In [43]:
train_X = x_train.values
val_X = x_valid.values
train_X.shape

train_X = train_X.reshape(train_X.shape[0], train_X.shape[1], 1)
train_X.shape

(816, 517)

(816, 517, 1)