In [1]:
import tensorflow as tf
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import mplfinance as mpf
import talib as talib
import numpy as np
import data as ds
import common as common
import os as os
import math as math
import datetime as datetime
import scipy as sp
import itertools  as itertools
import multiprocessing as mp
from os import listdir, walk
from itertools import repeat
from mplfinance.original_flavor import candlestick_ohlc
from datetime import datetime, timedelta
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import SGD
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, pairwise, mean_squared_error, mean_absolute_error
from scipy import stats
from pprint import pprint

In [2]:
# 0.1 环境设定
pd.options.mode.chained_assignment = None

# 0.2 不让程序占满 GPU 内存
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [3]:
# 1.0 重構數據集
def reshape_dataframe(df0):
    df0.fillna(0, inplace=True)
    df0.replace([np.inf, -np.inf], 0, inplace=True)
    df0.columns = ['udate', 'High', 'Low', 'Open', 'Close', 'Volume']
    types2 = {'udate': 'object', 'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Volume': 'int64'}
    df0.astype(types2).dtypes
    df0_1 = df0.copy(deep=True)
    error_row = []
    for k, v in df0.iterrows():
        if not pd.isnull(df0['udate'].iloc[k]) and df0['udate'].iloc[k] > 0:
            stime = str(int(df0['udate'].iloc[k]))
            df0_1['udate'].iloc[k] = datetime(year=2020, month=int(stime[-8:-6]), day=int(stime[-6:-4]), hour=int(stime[-4:-2]), minute=int(stime[-2:]), second=0)
        else:
            error_row.append(k)
    df0_1.drop(df0_1.index[error_row], inplace=True)
    df0_1.udate = pd.to_datetime(df0_1.udate)
    df0_1.index = pd.to_datetime(df0_1.udate)
    
    # 1.0.1 數據有效性檢查
    for k, v in types2.items():
        if (df0_1[k].isin([np.nan]).any().any()):
            print(k+' obtains nan')
        if (df0_1[k].isin([0]).any().any()):
            print(k+' obtains 0')
    is_contain_null = df0_1.isnull().sum()

    return df0_1

# 1.1 数据源
files = []
file_1 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201105.csv'))
df1_1 = reshape_dataframe(pd.read_csv(file_1))
files.append(df1_1)

file_2 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201105_Aug_Sep.csv'))
df1_2 = reshape_dataframe(pd.read_csv(file_2))
files.append(df1_2)

file_3 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201110.csv'))
df1_3 = reshape_dataframe(pd.read_csv(file_3))
files.append(df1_3)

file_4 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201111.csv'))
df1_4 = reshape_dataframe(pd.read_csv(file_4))
files.append(df1_4)

for i in range(2, 10):
    month1 = str(i).zfill(2)
    file_5 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20-'+month1+'.csv'))
    df1_5 = reshape_dataframe(pd.read_csv(file_5))
    files.append(df1_5)

df2 = pd.concat(files, ignore_index=False)

# 1.2 刪除重覆index
df2 = df2.groupby(df2.index).first()

# 1.3 排序
df2.sort_index(axis=0, ascending=True, inplace=True)
df2

Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0


Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-16 17:01:00,2020-02-16 17:01:00,9650.00,9657.00,9649.00,9654.00,459.0
2020-02-16 17:02:00,2020-02-16 17:02:00,9654.00,9656.75,9653.00,9653.75,468.0
2020-02-16 17:03:00,2020-02-16 17:03:00,9653.75,9654.00,9653.00,9653.50,168.0
2020-02-16 17:04:00,2020-02-16 17:04:00,9653.50,9657.50,9653.25,9656.50,356.0
2020-02-16 17:05:00,2020-02-16 17:05:00,9656.50,9658.25,9655.50,9657.25,260.0
...,...,...,...,...,...,...
2020-11-10 16:55:00,2020-11-10 16:55:00,11624.00,11624.00,11624.00,11624.00,0.0
2020-11-10 16:56:00,2020-11-10 16:56:00,11624.00,11624.00,11624.00,11624.00,0.0
2020-11-10 16:57:00,2020-11-10 16:57:00,11624.00,11624.00,11624.00,11624.00,0.0
2020-11-10 16:58:00,2020-11-10 16:58:00,11624.00,11624.00,11624.00,11624.00,0.0


In [4]:
# 3.0 分包 (日子)
def separate_daily(df4, return_type):
    data1 = {}
    # 交易日
    days1 = list(dict.fromkeys([v.date() for v in df4['udate']]))
    for day in days1:
        # 交易時間
        day_start = datetime(day.year, day.month, day.day, 17, 0, 0)
        day2 = day + timedelta(days=1)
        day_end = datetime(day2.year, day2.month, day2.day, 16, 0, 0)
        mask = ((df4['udate'] >= day_start) & (df4['udate'] <= day_end))
        df4_1 = df4.loc[mask]
        if (df4_1.shape[0] > 1):
            data1[day] = df4_1
    # 3.1 合併
    _df10 = pd.DataFrame()
    for k, _df11 in data1.items():
        _df10 = pd.concat([_df10, _df11], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
    # 3.2 返回類型
    if return_type=='df':
        return _df10
    elif return_type=='dict':
        return data1

df3 = df2.copy(deep=True)
df3 = separate_daily(df3, 'df')
df3

Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-16 17:01:00,2020-02-16 17:01:00,9650.00,9657.00,9649.00,9654.00,459.0
2020-02-16 17:02:00,2020-02-16 17:02:00,9654.00,9656.75,9653.00,9653.75,468.0
2020-02-16 17:03:00,2020-02-16 17:03:00,9653.75,9654.00,9653.00,9653.50,168.0
2020-02-16 17:04:00,2020-02-16 17:04:00,9653.50,9657.50,9653.25,9656.50,356.0
2020-02-16 17:05:00,2020-02-16 17:05:00,9656.50,9658.25,9655.50,9657.25,260.0
...,...,...,...,...,...,...
2020-11-10 15:56:00,2020-11-10 15:56:00,11619.25,11619.75,11617.50,11619.50,46.0
2020-11-10 15:57:00,2020-11-10 15:57:00,11619.50,11619.50,11613.75,11615.00,116.0
2020-11-10 15:58:00,2020-11-10 15:58:00,11615.00,11618.25,11614.00,11617.75,26.0
2020-11-10 15:59:00,2020-11-10 15:59:00,11617.75,11625.00,11617.75,11624.00,134.0


In [5]:
# 2.0 技術指標
highs = np.array(df3['High'], dtype='float')
lows = np.array(df3['Low'], dtype='float')
opens = np.array(df3['Open'], dtype='float')
closes = np.array(df3['Close'], dtype='float')
vols = np.array(df3['Volume'], dtype='float')
# 2.1 SMA 均線
for v in [5, 10, 20, 50, 100]:
    df3['sma-'+str(v)]= talib.SMA(closes, timeperiod=v)
# 2.2 Bollinger 保力加
df3['upper-band'], df3['middle-band'], df3['lower-band'] = talib.BBANDS(closes, timeperiod=20*1.5, nbdevup=2, nbdevdn=2, matype=0)
# 2.3 %B %保力加
df3['%b'] = (df3['Close']-df3['lower-band'])/(df3['upper-band']-df3['lower-band'])*100
df3['%b-high']  = common.percentB_belowzero(df3['%b'], df3['Close']) 
df3['%b-low'] = common.percentB_aboveone(df3['%b'], df3['Close'])
# 2.4 VOL EMA
df3['vol-ema5'] = talib.EMA(vols, timeperiod=5*4)
# 2.5 P-SAR 抛物线
df3['p-sar'] = talib.SAR(highs, lows, acceleration=0.02, maximum=0.2)
# 2.6 VWAP 成交量加權平均價格
period = [5, 10, 20, 50, 100]
for v in period:
    df3['typical-price'] = (df3['High'] + df3['Low'] + df3['Close']) / 3
    df3['turnover'] = df3['typical-price'] * df3['Volume']
    df3['cum-turnover-'+str(v)] = df3['turnover'].rolling(window=v).sum()
    df3['cum-volume-'+str(v)] = df3['Volume'].rolling(window=v).sum()
    df3['vwap-'+str(v)] = df3['cum-turnover-'+str(v)] / df3['cum-volume-'+str(v)]
    df3['vwap-'+str(v)] = df3['vwap-'+str(v)].replace([np.inf, -np.inf], 0)
    df3['vwap-'+str(v)].fillna(0, inplace=True)
    drop_list_1 = ['turnover', 'typical-price', 'cum-turnover-'+str(v), 'cum-volume-'+str(v)]
    df3.drop(drop_list_1, axis=1, inplace=True)
# 2.7 MACD
df3['macd'], df3['macdsignal'], df3['macdhist'] = talib.MACD(closes, fastperiod=12, slowperiod=26, signalperiod=9*40)
# 2.8 KDJ
df3['k-kdj'], df3['d-kdj'], df3['j-kdj'] = common.kdj(highs, lows, closes, window_size=20)
df3['diff-kdj'] = df3['k-kdj']-df3['d-kdj']
df3['j-kdj'].loc[((df3['j-kdj'] > 20) & (df3['j-kdj'] < 100))] = 0

In [6]:
# 4.0 draw chart
drop_list_2 = []
is_render_chart = False

def draw_worker(df5, day):
    try:
        # 4.1 style
        style = mpf.make_mpf_style(base_mpf_style='charles', rc={'font.size':6})
        # 4.2 addplot
        apds = [mpf.make_addplot(df5['lower-band'],panel=0,color='orange',linestyle='solid'),
                mpf.make_addplot(df5['upper-band'],panel=0,color='bisque',linestyle='solid'),
                mpf.make_addplot(df5['vwap-50'].replace(0, np.nan),panel=0,color='aqua',linestyle='solid'),
                mpf.make_addplot(df5['%b-low'],type='scatter',markersize=20,marker='v',panel=0),
                mpf.make_addplot(df5['%b-high'],type='scatter',markersize=20,marker='^',panel=0),
                mpf.make_addplot(df5['p-sar'],scatter=True,markersize=1,marker='*',panel=0,color='blueviolet'),
                #
                mpf.make_addplot(df5['vol-ema5'],panel=1,color='orange'),
                #
                mpf.make_addplot(df5['macd'],panel=2,color='orange'),
                mpf.make_addplot(df5['macdsignal'],panel=2,color='violet'),
                mpf.make_addplot(df5['macdhist'],panel=2,type='bar',color='dimgray'),
                #
                mpf.make_addplot(df5['k-kdj'],panel=3,color='orange'),
                mpf.make_addplot(df5['d-kdj'],panel=3,color='violet'),
                mpf.make_addplot(df5['j-kdj'],panel=3,color='aqua'),
                mpf.make_addplot(df5['diff-kdj'],panel=3,type='bar',color='dimgray')]
        # 4.3 draw
        mpf.plot(df5, type='candle', addplot=apds, style=style, ylabel='', ylabel_lower='', volume=True, figscale=0.5, xrotation=0, datetime_format="%H:%M", show_nontrading=False, tight_layout=True, savefig='./data/img-nq/features/'+day.strftime('%m-%d-%Y'))
        print('finish draw:', df5.shape, df5['udate'].iloc[0], df5['udate'].iloc[-1])
    except:
        print('do not draw:', k, '\n')

# 4.5 multi processing
pool = mp.Pool(processes=4, maxtasksperchild=4)
# 4.6 draw
df6 = df3.copy(deep=True)
data1 = separate_daily(df6, 'dict')
for k, _df6 in data1.items():
    df6 = _df6.copy(deep=True)
    # 4.7 drop error day
    if df6['Volume'].shape[0] == df6['Volume'].isin([0]).sum() or df6['Volume'].shape[0] <= 100:
        drop_list_2.append(k)
    # 4.8 draw chart
    elif (is_render_chart):
        pool.apply_async(draw_worker, args=(df6, k))
# 4.9 kill multi processing
pool.close()
pool.join()

for k in drop_list_2:
    data1.pop(k, None)
    print('excpet & delete: ', k)

excpet & delete:  2020-02-16
excpet & delete:  2020-02-18
excpet & delete:  2020-02-19
excpet & delete:  2020-02-20
excpet & delete:  2020-09-17
excpet & delete:  2020-10-29


In [7]:
# 5.0 合併
df7 = pd.DataFrame()
for k, df6 in data1.items():
    df7 = pd.concat([df7, df6], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

# 5.1 清洗
df8 = df7.copy(deep=True)
drop_list_3 = ['High', 'Low', 'Open', 'Volume', 'macd', 'macdsignal', 'middle-band', 'k-kdj', 'd-kdj', 'vol-ema5', 'sma-10', 'vwap-10']
df8.drop(drop_list_3, axis=1, inplace=True)
df8.fillna(0, inplace=True)
df8 = df8.round(2)

# 5.2 檢查
is_contain_null = df8.isnull().sum()
is_contain_nan = df8.isna().sum()
is_contain_inf = df8.isin([np.nan]).sum()
print('数据集: Total dataset has {} samples, and {} features.'.format(df8.shape[0], df8.shape[1])) # df8.info()

# 5.3 儲存
path_data = os.path.abspath(os.path.join('data', 'nq', 'clean-data', 'nq-clean-data-with-features.csv'))
if os.path.exists(path_data):
    os.remove(path_data)
df8.to_csv(path_data)

# 5.4.1 分包 (日子)
data2 = separate_daily(df8, 'dict')
# 5.3.2 正則化
normalization_days = 10
_days2 = np.array(list(data2.keys()))
len_days2 = (_days2.shape[0]//normalization_days)+1
data3 = []
scalers = {}
for i in range(len_days2):
    try:
        start1, end1 = i*normalization_days, (i+1)*normalization_days
        if end1 >= _days2.shape[0]:
            end1 = _days2.shape[0]-1
        # 5.3.3 每X日集合
        day_start, day_end = _days2[start1], _days2[end1]
        mask = ((df8['udate'].dt.date >= day_start) & (df8['udate'].dt.date <= day_end))
        _df8 = df8[mask]
        # 5.4.4 正則代集合
        min_max_scaler = MinMaxScaler(feature_range=(-0.99, 0.99))
        df10 = _df8.drop(['udate'], axis=1)
        min_max_scaler.fit(df10)
        scalers[i] = {'scaler': min_max_scaler, 'day_start': day_start, 'day_end': day_end}
        df11 = min_max_scaler.transform(df10)
        df11 = pd.DataFrame(df11, columns=df10.columns, index=df10.index)
        df11['udate'] = _df8['udate']
        # 5.4.5 合併
        data3.append(df11)
    except:
        print('except: ', start1, end1, len(_days2))

# 5.4.6 合併
df12 = pd.DataFrame()
for df13 in data3:
    df12 = pd.concat([df12, df13], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

数据集: Total dataset has 251077 samples, and 19 features.
0 2020-02-23 2020-03-08
1 2020-03-08 2020-03-22
2 2020-03-22 2020-04-05
3 2020-04-05 2020-04-20
4 2020-04-20 2020-05-04
5 2020-05-04 2020-05-18
6 2020-05-18 2020-06-01
7 2020-06-01 2020-06-15
8 2020-06-15 2020-06-29
9 2020-06-29 2020-07-13
10 2020-07-13 2020-07-27
11 2020-07-27 2020-08-10
12 2020-08-10 2020-08-24
13 2020-08-24 2020-09-07
14 2020-09-07 2020-09-22
15 2020-09-22 2020-10-06
16 2020-10-06 2020-10-20
17 2020-10-20 2020-11-04
18 2020-11-04 2020-11-09


In [8]:
# 5.5.1 訓練集, 測試集, 數據集
data3 = separate_daily(df12, 'dict')
days3 = np.array(list(data3.keys()))
train_set, test_set, valid_set = days3[0:-55], days3[-55:-15], days3[-15:-1]
x_train, y_train, date_train = np.array([]), np.array([]), []
x_test, y_test, date_test = np.array([]), np.array([]), []
x_valid, y_valid, date_valid = np.array([]), np.array([]), []
# 5.5.2 窗口步長
t_pus_no = 5
window_size = 100

for day, _df8 in data3.items():
    _df8.drop(['udate'], axis=1, inplace=True)
    # 5.5.3 窗口
    no_max = _df8.shape[0]-t_pus_no
    x_data, y_data = [], []
    for i in range(window_size, no_max):
        start, end = i-window_size, i
        # y label
        temp_0 = _df8.iloc[end: end+t_pus_no]
        y_data.append(temp_0['Close'])
        # x matrix
        temp_1 = _df8.iloc[start: end]
        x_data.append(temp_1)
        # date
        date1 = temp_0.index.tolist()
        if day in train_set:
            date_train.append(date1)
        elif day in test_set:
            date_test.append(date1)
        elif day in valid_set:
            date_valid.append(date1)
    x_data, y_data = np.array(x_data), np.array(y_data)
    # print(day, x_data.shape, y_data.shape)
    # 5.5.4 分集1
    if day in train_set and x_train.any() and y_train.any():
        x_train = np.concatenate((x_train, x_data), axis=0)
        y_train = np.concatenate((y_train, y_data), axis=0)
    elif day in test_set and x_test.any() and y_test.any():
        x_test = np.concatenate((x_test, x_data), axis=0)
        y_test = np.concatenate((y_test, y_data), axis=0)
    elif day in valid_set and x_valid.any() and y_valid.any():
        x_valid = np.concatenate((x_valid, x_data), axis=0)
        y_valid = np.concatenate((y_valid, y_data), axis=0)
    
    # 5.5.5 分集2
    if day in train_set and not x_train.any() and not y_train.any():
        x_train, y_train = x_data, y_data
    elif day in test_set and not x_test.any() and not y_test.any():
        x_test, y_test = x_data, y_data
    elif day in valid_set and not x_valid.any() and not y_valid.any():
        x_valid, y_valid = x_data, y_data

# no_batches, timesteps, no_features
print('訓練集: X_train Data: {}, Y_train Data: {}, Date_Train: {}'.format(x_train.shape, y_train.shape, len(date_train)))
print('測試集: X_Test Data: {}, Y_Test Data: {}, Date_Test: {}'.format(x_test.shape, y_test.shape, len(date_test)))
print('验证集: X_Valid Data: {}, Y_Valid Data: {}, Date_Valid: {}'.format(x_valid.shape, y_valid.shape, len(date_valid)))

訓練集: X_train Data: (175966, 100, 18), Y_train Data: (175966, 5), Date_Train: 175966
測試集: X_Test Data: (55896, 100, 18), Y_Test Data: (55896, 5), Date_Test: 55896
验证集: X_Valid Data: (20384, 100, 18), Y_Valid Data: (20384, 5), Date_Valid: 20384


In [10]:
# 6.1.1 模型参数
batch_size = 1024
epochs = 2
units = 128
verbose = 1
no_batches = x_train.shape[0]
timesteps = x_train.shape[1]
no_features = x_train.shape[2]
batch_input_shape = (timesteps, no_features)

# 6.1.2 日志参数
prefix = 'nq-lstm'
cur_time = datetime.now().strftime("%Y%m%d-%H%M%S")

# 6.2 模型 
# activation=softsign/tanh
model = tf.keras.Sequential()
model.add(LSTM(units=units, recurrent_activation='sigmoid', activation='tanh', unroll=False, use_bias=True, 
                               recurrent_dropout=0, return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.1))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.1))

model.add(LSTM(units=units, activation='tanh',return_sequences=False, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(Dense(units=t_pus_no))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.summary()

# 6.3.1 check point
checkpoint_dir = './training_checkpoints/'+ prefix +'-' + cur_time
os.mkdir(checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

# 6.3.2 tensor board
log_dir = os.path.join('./logs/fit/'+ prefix +'-') + cur_time
os.mkdir(log_dir)
tensor_board_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# 6.4 fit model
history_model = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), shuffle=True,
                          verbose=verbose, callbacks=[checkpoint_callback, tensor_board_callback])

# 6.5 save model
model_path = "./saved_model/"+prefix+"-"+cur_time+".h5"
model.save(model_path)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_5 (LSTM)                (None, 100, 128)          75264     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 100, 128)          131584    
_________________________________________________________________
dropout_6 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 100, 128)          131584    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_8 (LSTM)                (None, 100, 128)         

In [11]:
# 7.1 visualize loss
keys = list(history_model.history.keys())
training_loss = history_model.history['loss']
test_loss = history_model.history['val_loss']
epoch_count = range(1, len(training_loss) + 1)
plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Train', 'Test'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('model:'+prefix+'   batch_size:'+str(batch_size)+'   epochs:'+str(epochs)+'   units:'+str(units),loc ='left')
plt.tight_layout()
plt.grid()
plt.savefig('./data/img-nq/results/'+cur_time+'-loss')
plt.clf()

# 7.2 visualize accuracy
training_accuracy = history_model.history['accuracy']
test_accuracy = history_model.history['val_accuracy']
epoch_count = range(1, len(training_accuracy) + 1)
plt.plot(epoch_count, training_accuracy, 'r--')
plt.plot(epoch_count, test_accuracy, 'b-')
plt.legend(['Train', 'Test'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('model:'+prefix+'   batch_size:'+str(batch_size)+'   epochs:'+str(epochs)+'   units:'+str(units),loc ='left')
plt.tight_layout()
plt.grid()
plt.savefig('./data/img-nq/results/'+cur_time+'-accuracy')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [12]:
# 8.0 model evaluate
verbose = 1
train_score = model.evaluate(x_train, y_train, verbose=verbose)
test_score = model.evaluate(x_test, y_test, verbose=verbose)
valid_score = model.evaluate(x_valid, y_valid, verbose=verbose)

print('Train Score: %.4f MSE (%.4f RMSE)' % (train_score[0], math.sqrt(train_score[0])))
print('Test Score: %.4f MSE (%.4f RMSE)' % (test_score[0], math.sqrt(test_score[0])))
print('Validate Score: %.4f MSE (%.4f RMSE)' % (valid_score[0], math.sqrt(valid_score[0])))

Train Score: 0.0024 MSE (0.0485 RMSE)
Test Score: 0.0024 MSE (0.0491 RMSE)
Validate Score: 0.0027 MSE (0.0522 RMSE)


In [104]:
def scale_transform(data3):
    len_shape_y = x_valid.shape[2]-1
    fill_list = list(repeat(0, len_shape_y))
    data4 = []
    for list3 in data3:
        list4 = []
        for val3 in list3:
            list4.append([val3] + fill_list)
        data5 = min_max_scaler.inverse_transform(list4)
        data6 = [v[0] for v in data5]
        data4.append(data6)
    return data4

# 9.0 inverse
predict1 = model.predict(x_valid)
predict2 = scale_transform(predict1)
t1 = [v[0] for v in predict2]
t2 = [v[1] for v in predict2]
t3 = [v[2] for v in predict2]
t4 = [v[3] for v in predict2]
t5 = [v[4] for v in predict2]
date8 = [v[0] for v in date_valid]
df9 = pd.DataFrame({'udate': pd.to_datetime(date8), 't1': t1, 't2': t2, 't3': t3, 't4': t4, 't5': t5})
df9.index = df9['udate']

In [105]:
# 10.0 combine
df13 = df7.drop(list(df7.columns)[6:], axis=1)
df13 = df13.loc[df9.index]
df14 = pd.concat([df13, df9], axis=1)

"""
# 10.1
start2, end2 =  date8[0], date8[-2:-1][0]
start3 = start2 - timedelta(minutes=t_pus_no+window_size)
mask3 = ((df14.index >= start3) & (df14.index <= start2)) # 窗口步長
mask4 = ((df14.index >= start2) & (df14.index <= end2)  & (df14['t1'] > 0)) # 預測
df14 = df14.loc[(mask3 | mask4)]
"""

# 10.2 save
path_1 = os.path.abspath(os.path.join('data', 'nq', 'prediction', 'nq-prediction.csv'))
if os.path.exists(path_1):
    os.remove(path_1)
df14 = df14.drop(['udate'], axis=1)
df14.to_csv(path_1)

df14

Unnamed: 0_level_0,High,Low,Open,Close,Volume,t1,t2,t3,t4,t5
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-10-19 18:40:00,11715.75,11722.25,11715.75,11719.00,224.0,11852.385833,11853.004088,11852.284521,11852.038346,11852.804546
2020-10-19 18:41:00,11719.00,11721.75,11718.75,11720.50,63.0,11855.318874,11855.918852,11855.211566,11854.922688,11855.733203
2020-10-19 18:42:00,11720.50,11720.50,11717.50,11719.50,50.0,11858.145896,11858.707310,11858.041729,11857.711985,11858.558376
2020-10-19 18:43:00,11719.50,11720.00,11717.50,11719.50,46.0,11860.143048,11860.637495,11860.053585,11859.693169,11860.560085
2020-10-19 18:44:00,11719.50,11720.25,11718.00,11719.50,36.0,11860.783622,11861.186120,11860.716981,11860.345440,11861.211735
...,...,...,...,...,...,...,...,...,...,...
2020-11-09 15:51:00,11841.75,11844.00,11837.25,11837.50,106.0,11831.644511,11832.197803,11831.646874,11831.241271,11831.442612
2020-11-09 15:52:00,11837.50,11841.50,11837.25,11840.25,52.0,11834.198248,11834.774372,11834.224724,11833.747126,11834.072487
2020-11-09 15:53:00,11840.25,11841.00,11836.50,11839.50,34.0,11836.539757,11837.092140,11836.606330,11836.059816,11836.493046
2020-11-09 15:54:00,11839.50,11843.75,11835.25,11835.25,89.0,11838.229521,11838.718890,11838.343018,11837.741571,11838.255063
