In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook, using the high-DPI 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [2]:
import numpy as np
import pandas as pd
import time
from datetime import datetime
from pymongo import MongoClient

from src.config_tickets import ticket_lst
from src.scraping import WebScraping
from src.settings import HOST
from src.query_data import QueryData
from src.utilities import run_model_with_parameters, run_model_without_parameters

import sys

# Suppress every warning for the rest of the session, unless warning options
# were supplied explicitly when the interpreter was started (sys.warnoptions).
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

Using TensorFlow backend.


In [3]:
# Display and Plotting
import matplotlib.pylab as plt
import seaborn as sns

from ipywidgets import interactive, widgets, RadioButtons, ToggleButton, Select, FloatSlider, FloatRangeSlider, IntSlider, fixed

pd.set_option('display.float_format', lambda x: '%.5f' % x) # pandas: show floats with 5 decimal places
np.set_printoptions(precision=5, suppress=True) # numpy: 5 digits of precision, fixed-point instead of scientific notation

# Let pandas display up to 100 columns and 100 rows before truncating.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# seaborn plotting style: 'ticks' axes style with the 'poster' scaling context
sns.set(style='ticks', context='poster')

In [4]:
def connect_2_dbServer():
    """Create and return a MongoClient for the server address stored in HOST."""
    return MongoClient(HOST)

In [None]:
# Open the MongoDB connection and wrap it in the project's QueryData helper;
# `query` is what the following cells use to read tickets and price history.
client = connect_2_dbServer()
query = QueryData(client)

In [None]:
# Fetch the tickets that belong to the 'VN 30 (VNI30)' index, then show the
# list followed by its length on separate lines.
lst_ticket = query.get_list_ticket(index='VN 30 (VNI30)')
print(lst_ticket, len(lst_ticket), sep='\n')

In [None]:
start = datetime(2015,1,1)
# NOTE(review): `end` is defined but never passed to get_historical_data below,
# so the query is bounded only by `start`. Confirm whether an upper bound was
# intended and whether the helper accepts one.
end = datetime(2019,1,1)

print(lst_ticket[:5])

# One frame holding the history of every ticket; rows are selected per ticket
# through the `name` column and prices are read from the `close` column.
df = query.get_historical_data(lst_ticket, start=start)

# Close-price series for a sample of tickets (positions 3..9 of lst_ticket).
closed_price = {
    ticket: df[df.name == ticket]['close']
    for ticket in lst_ticket[3:10]
}

# Quick visual sanity check of the loaded data for a single ticket. Title and
# y-label follow the same convention as the model-evaluation figures.
sample_ticket = 'Mobile World Investment Corp'
series = df[df.name == sample_ticket]['close']
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(series)
ax.set_ylabel('Price (VND)')
ax.set_title('Close price of ' + sample_ticket)
plt.show()

## Plot ACF and PACF

In [None]:
from statsmodels.tsa.stattools import pacf, acf

def calculate_acf(time_series, lag=20, alpha=0.01):
    """Autocorrelation of a series with its confidence band centred on zero.

    Parameters
    ----------
    time_series : object exposing ``.values`` (e.g. a pandas Series)
        Observations handed to ``acf``.
    lag : int
        Forwarded to ``acf`` as ``nlags``.
    alpha : float
        Forwarded to ``acf`` as the confidence-interval level.

    Returns
    -------
    tuple
        ``(acf_value, confint_upper, confint_lower)`` -- note that the upper
        offset comes first. Both offsets are the interval bounds returned by
        ``acf`` minus the estimate itself.
    """
    acf_value, confint = acf(time_series.values, nlags=lag, alpha=alpha)
    lower_bound, upper_bound = confint[:, 0], confint[:, 1]
    return acf_value, upper_bound - acf_value, lower_bound - acf_value


def calculate_pacf(time_series, lag=20, alpha=0.01):
    """Partial autocorrelation of a series with its confidence band centred on zero.

    Parameters
    ----------
    time_series : object exposing ``.values`` (e.g. a pandas Series)
        Observations handed to ``pacf``.
    lag : int
        Forwarded to ``pacf`` as ``nlags``.
    alpha : float
        Forwarded to ``pacf`` as the confidence-interval level.

    Returns
    -------
    tuple
        ``(pacf_value, confint_upper, confint_lower)`` -- note that the upper
        offset comes first. Both offsets are the interval bounds returned by
        ``pacf`` minus the estimate itself.
    """
    pacf_value, confint = pacf(time_series.values, nlags=lag, alpha=alpha)
    lower_bound, upper_bound = confint[:, 0], confint[:, 1]
    return pacf_value, upper_bound - pacf_value, lower_bound - pacf_value

## Run model

### ARIMA model

In [None]:
# Fractions of each ticket's series (after sorting by index) that bound the
# training window: the model-fitting cells below train on the slice between
# int(start_train * size) and int(end_train * size) and hold out everything
# after it. These two globals must be defined before those cells are run.
start_train = 0.8
end_train = 0.95
# for ticket in lst_ticket[5:6]:
#     time_series = df[df.name == ticket]['close']
#     # split data
#     time_series = time_series.sort_index()
#     size = len(time_series)
#     train_start = int(start_train * size)
#     train_end = int(end_train * size)
#     train, test = time_series[train_start:train_end], time_series[train_end:]
    
#     result = run_model_with_parameters(train, test, model_selection='ARIMA', order=(1, 1, 2))
#     if result['status']:
#         validation = result['model'].validate(test)
#         time.sleep(1)
#         print(ticket)
#         print('Training result:', result['train_evaluation'])
#         print('Testing result:', result['test_evaluation'])
#         fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
#         ax1.plot(train, label='in-sample', color='r', linestyle='-')
#         ax1.plot(test, label='held-out', color='g', linestyle='--')
#         ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
#     #     ax1.plot(df_ann_validation['yhat'], label='validation held-out', color='purple', linestyle='--')
#         plt.show()
#     else:
#         print('None')

### ANN model

In [None]:
# for ticket in lst_ticket[3:4]:
#     time_series = df[df.name == ticket]['close']
    
#     # split data
#     time_series = time_series.sort_index()
#     size = len(time_series)
#     train_start = int(start_train * size)
#     train_end = int(end_train * size)
#     train, test = time_series[train_start:train_end], time_series[train_end:]
    
#     result = run_model_with_parameters(train, test, model_selection='ANN', lag=1, hidden_layers=(3,1))
#     if result['status']:
#         validation = result['model'].validate(test)
#         time.sleep(1)
#         print(ticket)
#         print('Training result:', result['train_evaluation'])
#         print('Testing result:', result['test_evaluation'])
#         fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
#         ax1.plot(train, label='in-sample', color='r', linestyle='-')
#         ax1.plot(test, label='held-out', color='g', linestyle='--')
#         ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
#     #     ax1.plot(df_ann_validation['yhat'], label='validation held-out', color='purple', linestyle='--')
#         plt.show()
#     else:
#         print('None')

### Hybrid model

In [None]:
# for ticket in lst_ticket[3:4]:
#     time_series = df[df.name == ticket]['close']
    
#     # split data
#     time_series = time_series.sort_index()
#     size = len(time_series)
#     train_start = int(start_train * size)
#     train_end = int(end_train * size)
#     train, test = time_series[train_start:train_end], time_series[train_end:]
    
#     result = run_model_with_parameters(train, test, model_selection='Hybrid', order=(1, 1, 0), lag=2, hidden_layers=(6, 3))
#     if result['status']:
#         validation = result['model'].validate(test)
#         time.sleep(1)
#         print(ticket)
#         print('Training result:', result['train_evaluation'])
#         print('Testing result:', result['test_evaluation'])
#         fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
#         ax1.plot(train, label='in-sample', color='r', linestyle='-')
#         ax1.plot(test, label='held-out', color='g', linestyle='--')
#         ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
#     #     ax1.plot(df_ann_validation['yhat'], label='validation held-out', color='purple', linestyle='--')
#         plt.show()
#     else:
#         print('None')

### Run model without parameters

In [None]:
# Hybrid model: for every ticket call run_model_without_parameters, record the
# order / lag / hidden-layer settings it reports together with the held-out
# metrics, and plot the predictions against the actual prices.
lst_hybrid = []
for ticket in lst_ticket:
    # Close prices of this ticket, ordered by index.
    time_series = df[df.name == ticket]['close'].sort_index()

    # Training window [train_start, train_end); everything after it is held out.
    size = len(time_series)
    train_start, train_end = int(start_train * size), int(end_train * size)
    train = time_series[train_start:train_end]
    test = time_series[train_end:]

    result, lst_result = run_model_without_parameters(
        train, test, model_selection='Hybrid', q=range(0, 4)
    )
    print(ticket, ':', result['lag'], result['hidden_layers'], result['order'])
    print('Training result:', result['train_evaluation'])
    print('Testing result:', result['test_evaluation'])
    lst_hybrid.append({
        'name': ticket,
        'order': result['order'],
        'lags': result['lag'],
        'hidden_layers': result['hidden_layers'],
        'result': result['test_evaluation'],
    })

    # Overlay the model's output for the held-out window on the price curves.
    validation = result['model'].validate(test)
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(train, label='in-sample', color='r', linestyle='-')
    ax1.plot(test, label='held-out', color='g', linestyle='--')
    ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
    plt.xticks(rotation=70)
    ax1.set_ylabel('Price (VND)')
    ax1.legend(['in-sample', 'held-out', 'prediction'])
    ax1.set_title('Model evaluation of ' + ticket)
    plt.show()

In [None]:
# ARIMA model: for every ticket call run_model_without_parameters, record the
# order it reports together with the held-out metrics, and plot the
# predictions against the actual prices.
lst_arima = []
for ticket in lst_ticket:
    # Close prices of this ticket, ordered by index.
    time_series = df[df.name == ticket]['close'].sort_index()

    # Training window [train_start, train_end); everything after it is held out.
    size = len(time_series)
    train_start, train_end = int(start_train * size), int(end_train * size)
    train = time_series[train_start:train_end]
    test = time_series[train_end:]

    result, lst_result = run_model_without_parameters(
        train, test, model_selection='ARIMA', q=range(0, 4)
    )
    print(ticket, ':', result['order'])
    print('Training result:', result['train_evaluation'])
    print('Testing result:', result['test_evaluation'])
    # Only the order is stored here; lag and hidden-layer fields are not recorded.
    lst_arima.append({
        'name': ticket,
        'order': result['order'],
        'result': result['test_evaluation'],
    })

    # Overlay the model's output for the held-out window on the price curves.
    validation = result['model'].validate(test)
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(train, label='in-sample', color='r', linestyle='-')
    ax1.plot(test, label='held-out', color='g', linestyle='--')
    ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
    plt.xticks(rotation=70)
    ax1.set_ylabel('Price (VND)')
    ax1.legend(['in-sample', 'held-out', 'prediction'])
    ax1.set_title('Model evaluation of ' + ticket)
    plt.show()

In [None]:
# ANN model: for every ticket call run_model_without_parameters, record the
# lag / hidden-layer settings it reports together with the held-out metrics,
# and plot the predictions against the actual prices.
lst_ann = []
for ticket in lst_ticket:
    # Close prices of this ticket, ordered by index.
    time_series = df[df.name == ticket]['close'].sort_index()

    # Training window [train_start, train_end); everything after it is held out.
    size = len(time_series)
    train_start, train_end = int(start_train * size), int(end_train * size)
    train = time_series[train_start:train_end]
    test = time_series[train_end:]

    result, lst_result = run_model_without_parameters(
        train, test, model_selection='ANN', q=range(0, 4)
    )
    print(ticket, ':', result['lag'], result['hidden_layers'])
    print('Training result:', result['train_evaluation'])
    print('Testing result:', result['test_evaluation'])
    # Only lag and hidden layers are stored here; no order field is recorded.
    lst_ann.append({
        'name': ticket,
        'lags': result['lag'],
        'hidden_layers': result['hidden_layers'],
        'result': result['test_evaluation'],
    })

    # Overlay the model's output for the held-out window on the price curves.
    validation = result['model'].validate(test)
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(train, label='in-sample', color='r', linestyle='-')
    ax1.plot(test, label='held-out', color='g', linestyle='--')
    ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
    plt.xticks(rotation=70)
    ax1.set_ylabel('Price (VND)')
    ax1.legend(['in-sample', 'held-out', 'prediction'])
    ax1.set_title('Model evaluation of ' + ticket)
    plt.show()

In [None]:
# LSTM_GBM model with a fixed configuration (window_size=10, lag=10): fit it per
# ticket, keep the held-out metrics of the runs that report a truthy status,
# and plot the predictions against the actual prices.
lst_lstm_gbm = []
for ticket in lst_ticket:
    # Close prices of this ticket, ordered by index.
    time_series = df[df.name == ticket]['close'].sort_index()

    # Training window [train_start, train_end); everything after it is held out.
    size = len(time_series)
    train_start, train_end = int(start_train * size), int(end_train * size)
    train = time_series[train_start:train_end]
    test = time_series[train_end:]

    print(ticket)
    result = run_model_with_parameters(
        train, test, model_selection='LSTM_GBM', window_size=10, lag=10
    )
    if not result['status']:
        # Runs with a falsy status are reported and add no entry to lst_lstm_gbm.
        print('None')
        continue

    validation = result['model'].validate(test)
    insample_prediction = result['model'].get_insample_prediction()
    time.sleep(1)

    print('Training result:', result['train_evaluation'])
    print('Testing result:', result['test_evaluation'])

    lst_lstm_gbm.append({
        'name': ticket,
        'result': result['test_evaluation'],
    })

    # Overlay the model's output for the held-out window on the price curves.
    fig, ax1 = plt.subplots(figsize=(15, 8))
    ax1.plot(train, label='in-sample', color='r', linestyle='-')
    ax1.plot(test, label='held-out', color='g', linestyle='--')
    ax1.plot(validation, label='predicted held-out', color='y', linestyle='--')
    plt.xticks(rotation=70)
    ax1.set_ylabel('Price (VND)')
    ax1.legend(['in-sample', 'held-out', 'prediction'])
    ax1.set_title('Model evaluation of ' + ticket)
    plt.show()

In [None]:
# print(result['test_evaluation'])
# lst_dct_result = list()
# for result_ in lst_result:
#     lst_dct_result.append({
#         'order': result_['order'],
#         'mse': result_['test_evaluation']['mse'],
#         'rmse': result_['test_evaluation']['rmse'],
#         'mae': result_['test_evaluation']['mae'],
#         'mape': result_['test_evaluation']['mape'],
#     })
# df_result = pd.DataFrame(data=lst_dct_result)
# df_result.head(20)

In [None]:
# Per-ticket MAE summary: the settings selected for each model family next to
# the MAE measured on the held-out data. One row per entry of lst_ticket.
df_mae = pd.DataFrame(index=lst_ticket)

# lst_hybrid / lst_arima / lst_ann receive exactly one entry per ticket, in
# lst_ticket order, so they can be assigned positionally.
df_mae['hybrid_order'] = [item['order'] for item in lst_hybrid]
df_mae['hybrid_lags'] = [item['lags'] for item in lst_hybrid]
df_mae['hybrid_hl'] = [item['hidden_layers'] for item in lst_hybrid]
df_mae['hybrid'] = [item['result']['mae'] for item in lst_hybrid]

df_mae['arima_order'] = [item['order'] for item in lst_arima]
df_mae['arima'] = [item['result']['mae'] for item in lst_arima]

df_mae['ann_lags'] = [item['lags'] for item in lst_ann]
df_mae['ann_hl'] = [item['hidden_layers'] for item in lst_ann]
df_mae['ann'] = [item['result']['mae'] for item in lst_ann]

# lst_lstm_gbm only contains the tickets whose run succeeded, so it may be
# shorter than lst_ticket and a positional assignment would raise a
# length-mismatch ValueError. Align on the ticket name instead; tickets
# without a result are left as NaN.
# NOTE(review): the column name 'lsmt_gbm' looks like a typo for 'lstm_gbm';
# it is kept unchanged so existing readers of the CSV keep working.
df_mae['lsmt_gbm'] = pd.Series(
    {item['name']: item['result']['mae'] for item in lst_lstm_gbm},
    dtype=float,
)
df_mae.to_csv('mae.csv')
df_mae

In [None]:
# Per-ticket MSE summary: the settings selected for each model family next to
# the MSE measured on the held-out data. One row per entry of lst_ticket.
df_mse = pd.DataFrame(index=lst_ticket)

# lst_hybrid / lst_arima / lst_ann receive exactly one entry per ticket, in
# lst_ticket order, so they can be assigned positionally.
df_mse['hybrid_order'] = [item['order'] for item in lst_hybrid]
df_mse['hybrid_lags'] = [item['lags'] for item in lst_hybrid]
df_mse['hybrid_hl'] = [item['hidden_layers'] for item in lst_hybrid]
df_mse['hybrid'] = [item['result']['mse'] for item in lst_hybrid]

df_mse['arima_order'] = [item['order'] for item in lst_arima]
df_mse['arima'] = [item['result']['mse'] for item in lst_arima]

df_mse['ann_lags'] = [item['lags'] for item in lst_ann]
df_mse['ann_hl'] = [item['hidden_layers'] for item in lst_ann]
df_mse['ann'] = [item['result']['mse'] for item in lst_ann]

# lst_lstm_gbm only contains the tickets whose run succeeded, so it may be
# shorter than lst_ticket and a positional assignment would raise a
# length-mismatch ValueError. Align on the ticket name instead; tickets
# without a result are left as NaN.
# NOTE(review): the column name 'lsmt_gbm' looks like a typo for 'lstm_gbm';
# it is kept unchanged so existing readers of the CSV keep working.
df_mse['lsmt_gbm'] = pd.Series(
    {item['name']: item['result']['mse'] for item in lst_lstm_gbm},
    dtype=float,
)

df_mse.to_csv('mse.csv')

In [None]:
# Per-ticket RMSE summary: the settings selected for each model family next to
# the RMSE measured on the held-out data. One row per entry of lst_ticket.
df_rmse = pd.DataFrame(index=lst_ticket)

# lst_hybrid / lst_arima / lst_ann receive exactly one entry per ticket, in
# lst_ticket order, so they can be assigned positionally.
df_rmse['hybrid_order'] = [item['order'] for item in lst_hybrid]
df_rmse['hybrid_lags'] = [item['lags'] for item in lst_hybrid]
df_rmse['hybrid_hl'] = [item['hidden_layers'] for item in lst_hybrid]
df_rmse['hybrid'] = [item['result']['rmse'] for item in lst_hybrid]

df_rmse['arima_order'] = [item['order'] for item in lst_arima]
df_rmse['arima'] = [item['result']['rmse'] for item in lst_arima]

df_rmse['ann_lags'] = [item['lags'] for item in lst_ann]
df_rmse['ann_hl'] = [item['hidden_layers'] for item in lst_ann]
df_rmse['ann'] = [item['result']['rmse'] for item in lst_ann]

# lst_lstm_gbm only contains the tickets whose run succeeded, so it may be
# shorter than lst_ticket and a positional assignment would raise a
# length-mismatch ValueError. Align on the ticket name instead; tickets
# without a result are left as NaN.
# NOTE(review): the column name 'lsmt_gbm' looks like a typo for 'lstm_gbm';
# it is kept unchanged so existing readers of the CSV keep working.
df_rmse['lsmt_gbm'] = pd.Series(
    {item['name']: item['result']['rmse'] for item in lst_lstm_gbm},
    dtype=float,
)

df_rmse.to_csv('rmse.csv')

In [None]:
# Per-ticket MAPE summary: the settings selected for each model family next to
# the MAPE measured on the held-out data. One row per entry of lst_ticket.
df_mape = pd.DataFrame(index=lst_ticket)

# lst_hybrid / lst_arima / lst_ann receive exactly one entry per ticket, in
# lst_ticket order, so they can be assigned positionally.
df_mape['hybrid_order'] = [item['order'] for item in lst_hybrid]
df_mape['hybrid_lags'] = [item['lags'] for item in lst_hybrid]
df_mape['hybrid_hl'] = [item['hidden_layers'] for item in lst_hybrid]
df_mape['hybrid'] = [item['result']['mape'] for item in lst_hybrid]

df_mape['arima_order'] = [item['order'] for item in lst_arima]
df_mape['arima'] = [item['result']['mape'] for item in lst_arima]

df_mape['ann_lags'] = [item['lags'] for item in lst_ann]
df_mape['ann_hl'] = [item['hidden_layers'] for item in lst_ann]
df_mape['ann'] = [item['result']['mape'] for item in lst_ann]

# lst_lstm_gbm only contains the tickets whose run succeeded, so it may be
# shorter than lst_ticket and a positional assignment would raise a
# length-mismatch ValueError. Align on the ticket name instead; tickets
# without a result are left as NaN.
# NOTE(review): the column name 'lsmt_gbm' looks like a typo for 'lstm_gbm';
# it is kept unchanged so existing readers of the CSV keep working.
df_mape['lsmt_gbm'] = pd.Series(
    {item['name']: item['result']['mape'] for item in lst_lstm_gbm},
    dtype=float,
)

# The MAPE table gets its own file: writing it to 'mse.csv' would overwrite
# the MSE table with MAPE values.
df_mape.to_csv('mape.csv')