In [10]:
import tensorflow as tf
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import mplfinance as mpf
import talib as talib
import numpy as np
import data as ds
import common as common
import os as os
import math as math
import datetime as datetime
import scipy as sp
import itertools  as itertools
import multiprocessing as mp
from os import listdir, walk
from itertools import repeat
from mplfinance.original_flavor import candlestick_ohlc
from datetime import datetime, timedelta, date
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import SGD
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, pairwise, mean_squared_error, mean_absolute_error
from scipy import stats
from pprint import pprint

In [2]:
# 0.1 环境设定
pd.options.mode.chained_assignment = None

# 0.2 不让程序占满 GPU 内存
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [6]:
# 1.0 重構數據集
def reshape_dataframe(df0):
    df0.fillna(0, inplace=True)
    df0.replace([np.inf, -np.inf], 0, inplace=True)
    df0.columns = ['udate', 'High', 'Low', 'Open', 'Close', 'Volume']
    types2 = {'udate': 'object', 'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Volume': 'int64'}
    df0.astype(types2).dtypes
    df0_1 = df0.copy(deep=True)
    error_row = []
    for k, v in df0.iterrows():
        if not pd.isnull(df0['udate'].iloc[k]) and df0['udate'].iloc[k] > 0:
            stime = str(int(df0['udate'].iloc[k]))
            df0_1['udate'].iloc[k] = datetime(year=2020, month=int(stime[-8:-6]), day=int(stime[-6:-4]), hour=int(stime[-4:-2]), minute=int(stime[-2:]), second=0)
        else:
            error_row.append(k)
    df0_1.drop(df0_1.index[error_row], inplace=True)
    df0_1.udate = pd.to_datetime(df0_1.udate)
    df0_1.index = pd.to_datetime(df0_1.udate)
    
    # 1.0.1 數據有效性檢查
    for k, v in types2.items():
        if (df0_1[k].isin([np.nan]).any().any()):
            print(k+' obtains nan')
        if (df0_1[k].isin([0]).any().any()):
            print(k+' obtains 0')
    is_contain_null = df0_1.isnull().sum()

    return df0_1

# 1.1 数据源
files = []
file_1 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201105.csv'))
df1_1 = reshape_dataframe(pd.read_csv(file_1))
files.append(df1_1)

file_2 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201105_Aug_Sep.csv'))
df1_2 = reshape_dataframe(pd.read_csv(file_2))
files.append(df1_2)

file_3 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201110.csv'))
df1_3 = reshape_dataframe(pd.read_csv(file_3))
files.append(df1_3)

file_4 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20201111.csv'))
df1_4 = reshape_dataframe(pd.read_csv(file_4))
files.append(df1_4)

for i in range(2, 12):
    try:
        month1 = str(i).zfill(2)
        file_5 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-20-'+month1+'.csv'))
        df1_5 = reshape_dataframe(pd.read_csv(file_5))
        files.append(df1_5)
    except:
        print('no month', i)

df2 = pd.concat(files, ignore_index=False)

# 1.2 刪除重覆index
df2 = df2.groupby(df2.index).first()

# 1.3 排序
df2.sort_index(axis=0, ascending=True, inplace=True)
df2

Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
Volume obtains 0
no month 10
Volume obtains 0


Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-16 17:01:00,2020-02-16 17:01:00,9650.00,9657.00,9649.00,9654.00,459.0
2020-02-16 17:02:00,2020-02-16 17:02:00,9654.00,9656.75,9653.00,9653.75,468.0
2020-02-16 17:03:00,2020-02-16 17:03:00,9653.75,9654.00,9653.00,9653.50,168.0
2020-02-16 17:04:00,2020-02-16 17:04:00,9653.50,9657.50,9653.25,9656.50,356.0
2020-02-16 17:05:00,2020-02-16 17:05:00,9656.50,9658.25,9655.50,9657.25,260.0
...,...,...,...,...,...,...
2020-11-19 02:25:00,2020-11-19 02:25:00,11864.25,11871.50,11864.25,11869.25,155.0
2020-11-19 02:26:00,2020-11-19 02:26:00,11869.25,11871.25,11868.00,11868.50,114.0
2020-11-19 02:27:00,2020-11-19 02:27:00,11868.50,11872.25,11868.25,11870.50,85.0
2020-11-19 02:28:00,2020-11-19 02:28:00,11870.50,11873.25,11869.25,11873.00,80.0


In [7]:
# 2.0 分包 (日子)
def separate_daily(df2_1, return_type):
    data1 = {}
    # 2.1交易日
    days1 = list(dict.fromkeys([v.date() for v in df2_1['udate']]))
    for day in days1:
        # 2.2 交易時間
        day_start = datetime(day.year, day.month, day.day, 17, 0, 0)
        day2 = day + timedelta(days=1)
        day_end = datetime(day2.year, day2.month, day2.day, 16, 0, 0)
        mask = ((df2_1['udate'] >= day_start) & (df2_1['udate'] <= day_end))
        df2_2 = df2_1.loc[mask]
        if (df2_2.shape[0] > 1):
            data1[day] = df2_2
    # 2.3 合併
    df2_3 = pd.DataFrame()
    for k, df2_4 in data1.items():
        df2_3 = pd.concat([df2_3, df2_4], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
    # 2.4 返回類型
    if return_type=='df':
        return df2_3
    elif return_type=='dict':
        return data1

df2_5 = df2.copy(deep=True)
df3 = separate_daily(df2_5, 'df')
df3

Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-16 17:01:00,2020-02-16 17:01:00,9650.00,9657.00,9649.00,9654.00,459.0
2020-02-16 17:02:00,2020-02-16 17:02:00,9654.00,9656.75,9653.00,9653.75,468.0
2020-02-16 17:03:00,2020-02-16 17:03:00,9653.75,9654.00,9653.00,9653.50,168.0
2020-02-16 17:04:00,2020-02-16 17:04:00,9653.50,9657.50,9653.25,9656.50,356.0
2020-02-16 17:05:00,2020-02-16 17:05:00,9656.50,9658.25,9655.50,9657.25,260.0
...,...,...,...,...,...,...
2020-11-19 02:25:00,2020-11-19 02:25:00,11864.25,11871.50,11864.25,11869.25,155.0
2020-11-19 02:26:00,2020-11-19 02:26:00,11869.25,11871.25,11868.00,11868.50,114.0
2020-11-19 02:27:00,2020-11-19 02:27:00,11868.50,11872.25,11868.25,11870.50,85.0
2020-11-19 02:28:00,2020-11-19 02:28:00,11870.50,11873.25,11869.25,11873.00,80.0


In [8]:
# 3.0 技術指標
highs = np.array(df3['High'], dtype='float')
lows = np.array(df3['Low'], dtype='float')
opens = np.array(df3['Open'], dtype='float')
closes = np.array(df3['Close'], dtype='float')
vols = np.array(df3['Volume'], dtype='float')
# 3.1 SMA 均線
for v in [5, 10, 20, 50, 100]:
    df3['sma-'+str(v)]= talib.SMA(closes, timeperiod=v)
# 3.2 Bollinger 保力加
df3['upper-band'], df3['middle-band'], df3['lower-band'] = talib.BBANDS(closes, timeperiod=20*1.5, nbdevup=2, nbdevdn=2, matype=0)
# 3.3 %B %保力加
df3['%b'] = (df3['Close']-df3['lower-band'])/(df3['upper-band']-df3['lower-band'])*100
df3['%b-high']  = common.percentB_belowzero(df3['%b'], df3['Close']) 
df3['%b-low'] = common.percentB_aboveone(df3['%b'], df3['Close'])
# 3.4 VOL EMA
df3['vol-ema5'] = talib.EMA(vols, timeperiod=5*4)
# 3.5 P-SAR 抛物线
df3['p-sar'] = talib.SAR(highs, lows, acceleration=0.02, maximum=0.2)
# 3.6 VWAP 成交量加權平均價格
period = [5, 10, 20, 50, 100]
for v in period:
    df3['typical-price'] = (df3['High'] + df3['Low'] + df3['Close']) / 3
    df3['turnover'] = df3['typical-price'] * df3['Volume']
    df3['cum-turnover-'+str(v)] = df3['turnover'].rolling(window=v).sum()
    df3['cum-volume-'+str(v)] = df3['Volume'].rolling(window=v).sum()
    df3['vwap-'+str(v)] = df3['cum-turnover-'+str(v)] / df3['cum-volume-'+str(v)]
    df3['vwap-'+str(v)] = df3['vwap-'+str(v)].replace([np.inf, -np.inf], 0)
    df3['vwap-'+str(v)].fillna(0, inplace=True)
    drop_list_1 = ['turnover', 'typical-price', 'cum-turnover-'+str(v), 'cum-volume-'+str(v)]
    df3.drop(drop_list_1, axis=1, inplace=True)
# 2.7 MACD
df3['macd'], df3['macdsignal'], df3['macdhist'] = talib.MACD(closes, fastperiod=12, slowperiod=26, signalperiod=9*40)
# 2.8 KDJ
df3['k-kdj'], df3['d-kdj'], df3['j-kdj'] = common.kdj(highs, lows, closes, window_size=20)
df3['diff-kdj'] = df3['k-kdj']-df3['d-kdj']
df3['j-kdj'].loc[((df3['j-kdj'] > 20) & (df3['j-kdj'] < 100))] = 0
df3

Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume,sma-5,sma-10,sma-20,sma-50,...,vwap-20,vwap-50,vwap-100,macd,macdsignal,macdhist,k-kdj,d-kdj,j-kdj,diff-kdj
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-16 17:01:00,2020-02-16 17:01:00,9650.00,9657.00,9649.00,9654.00,459.0,,,,,...,0.000000,0.000000,0.000000,,,,,,,
2020-02-16 17:02:00,2020-02-16 17:02:00,9654.00,9656.75,9653.00,9653.75,468.0,,,,,...,0.000000,0.000000,0.000000,,,,,,,
2020-02-16 17:03:00,2020-02-16 17:03:00,9653.75,9654.00,9653.00,9653.50,168.0,,,,,...,0.000000,0.000000,0.000000,,,,,,,
2020-02-16 17:04:00,2020-02-16 17:04:00,9653.50,9657.50,9653.25,9656.50,356.0,,,,,...,0.000000,0.000000,0.000000,,,,,,,
2020-02-16 17:05:00,2020-02-16 17:05:00,9656.50,9658.25,9655.50,9657.25,260.0,9655.00,,,,...,0.000000,0.000000,0.000000,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-19 02:25:00,2020-11-19 02:25:00,11864.25,11871.50,11864.25,11869.25,155.0,11864.95,11860.750,11863.1625,11871.445,...,11863.640685,11870.920632,11879.508421,-2.454541,-0.377310,-2.077232,4.454301,43.183389,-73.003874,-38.729088
2020-11-19 02:26:00,2020-11-19 02:26:00,11869.25,11871.25,11868.00,11868.50,114.0,11866.05,11862.175,11863.0250,11871.220,...,11863.350783,11870.841748,11879.356284,-1.868293,-0.385570,-1.482724,4.322332,41.802709,-70.638423,-37.480377
2020-11-19 02:27:00,2020-11-19 02:27:00,11868.50,11872.25,11868.25,11870.50,85.0,11867.35,11863.800,11862.9125,11871.160,...,11863.338818,11870.616013,11879.225244,-1.228147,-0.390238,-0.837909,4.484216,40.446200,-67.439752,-35.961984
2020-11-19 02:28:00,2020-11-19 02:28:00,11870.50,11873.25,11869.25,11873.00,80.0,11869.10,11865.350,11862.9625,11871.110,...,11863.203787,11870.508475,11879.148764,-0.513182,-0.390919,-0.122263,4.866898,39.118052,-63.635409,-34.251154


In [12]:
# 4.0 draw chart
drop_list_2 = []
is_render_chart = False

def draw_worker(df4_2, day):
    try:
        # 4.1 style
        style = mpf.make_mpf_style(base_mpf_style='charles', rc={'font.size':6})
        # 4.2 addplot
        apds = [mpf.make_addplot(df4_2['lower-band'],panel=0,color='orange',linestyle='solid'),
                mpf.make_addplot(df4_2['upper-band'],panel=0,color='bisque',linestyle='solid'),
                mpf.make_addplot(df4_2['vwap-50'].replace(0, np.nan),panel=0,color='aqua',linestyle='solid'),
                mpf.make_addplot(df4_2['%b-low'],type='scatter',markersize=20,marker='v',panel=0),
                mpf.make_addplot(df4_2['%b-high'],type='scatter',markersize=20,marker='^',panel=0),
                mpf.make_addplot(df4_2['p-sar'],scatter=True,markersize=1,marker='*',panel=0,color='blueviolet'),
                #
                mpf.make_addplot(df4_2['vol-ema5'],panel=1,color='orange'),
                #
                mpf.make_addplot(df4_2['macd'],panel=2,color='orange'),
                mpf.make_addplot(df4_2['macdsignal'],panel=2,color='violet'),
                mpf.make_addplot(df4_2['macdhist'],panel=2,type='bar',color='dimgray'),
                #
                mpf.make_addplot(df4_2['k-kdj'],panel=3,color='orange'),
                mpf.make_addplot(df4_2['d-kdj'],panel=3,color='violet'),
                mpf.make_addplot(df4_2['j-kdj'],panel=3,color='aqua'),
                mpf.make_addplot(df4_2['diff-kdj'],panel=3,type='bar',color='dimgray')]
        # 4.3 draw
        mpf.plot(df4_2, type='candle', addplot=apds, style=style, ylabel='', ylabel_lower='', volume=True, figscale=0.5, xrotation=0, datetime_format="%H:%M", show_nontrading=False, tight_layout=True, savefig='./data/img-nq/features/'+day.strftime('%m-%d-%Y'))
        print('finish draw:', df4_2.shape, df4_2['udate'].iloc[0], df4_2['udate'].iloc[-1])
    except:
        print('do not draw:', k, '\n')

# 4.5 multi processing
pool = mp.Pool(processes=4, maxtasksperchild=4)
# 4.6 draw
df4 = df3.copy(deep=True)
data2 = separate_daily(df4, 'dict')
for k, df4_1 in data2.items():
    # 4.7 drop error day
    if df4_1['Volume'].shape[0] == df4_1['Volume'].isin([0]).sum() or df4_1['Volume'].shape[0] <= 100:
        drop_list_2.append(k)
    # 4.8 draw chart
    elif (is_render_chart):
        pool.apply_async(draw_worker, args=(df4_1, k))
# 4.9 kill multi processing
pool.close()
pool.join()

drop_list_2.append(date(2020, 3, 8))
drop_list_2.append(date(2020, 3, 15))
drop_list_2.append(date(2020, 3, 17))
drop_list_2.append(date(2020, 5, 24))
drop_list_2.append(date(2020, 9, 6))
for k in drop_list_2:
    data2.pop(k, None)
    print('excpet & delete: ', k)

excpet & delete:  2020-02-16
excpet & delete:  2020-02-18
excpet & delete:  2020-02-19
excpet & delete:  2020-02-20
excpet & delete:  2020-09-17
excpet & delete:  2020-10-29
excpet & delete:  2020-03-08
excpet & delete:  2020-03-15
excpet & delete:  2020-03-17
excpet & delete:  2020-05-24
excpet & delete:  2020-09-06


In [13]:
# 5.0 合併
df5 = pd.DataFrame()
for k, df5_1 in data2.items():
    df5 = pd.concat([df5, df5_1], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

# 5.1 清洗
df5_3 = df5.copy(deep=True)
drop_list_3 = ['High', 'Low', 'Open', 'Volume', 'macd', 'macdsignal', 'middle-band', 'k-kdj', 'd-kdj', 'vol-ema5', 'sma-10', 'vwap-10']
df5_3.drop(drop_list_3, axis=1, inplace=True)
df5_3.fillna(0, inplace=True)
df5_3 = df5_3.round(2)

# 5.2 檢查
is_contain_null = df5_3.isnull().sum()
is_contain_nan = df5_3.isna().sum()
is_contain_inf = df5_3.isin([np.nan]).sum()
print('数据集: Total dataset has {} samples, and {} features.'.format(df5_3.shape[0], df5_3.shape[1])) # df5_3.info()

# 5.3 儲存
path_data = os.path.abspath(os.path.join('data', 'nq', 'clean-data', 'nq-clean-data-with-features.csv'))
if os.path.exists(path_data):
    os.remove(path_data)
df5_3.to_csv(path_data)

# 5.4.1 分包 (日子)
data3 = separate_daily(df5_3, 'dict')
# 5.3.2 正則化
normalization_days = 10
days2 = np.array(list(data3.keys()))
len_days2 = (days2.shape[0]//normalization_days)+1
data3_1 = []
scalers = {}
for i in range(len_days2):
    try:
        start1, end1 = i*normalization_days, (i+1)*normalization_days
        if end1 >= days2.shape[0]:
            end1 = days2.shape[0]-1
        # 5.3.3 每X日集合
        day_start, day_end = days2[start1], days2[end1]
        day_start2 = datetime(day_start.year, day_start.month, day_start.day, 17, 0, 0)
        day_end2 = datetime(day_end.year, day_end.month, day_end.day, 16, 0, 0)
        mask = ((df5_3['udate'] >= day_start2) & (df5_3['udate'] <= day_end2))
        df5_4 = df5_3[mask]
        # 5.4.4 正則代集合
        min_max_scaler = MinMaxScaler(feature_range=(-0.99, 0.99))
        df5_5 = df5_4.drop(['udate'], axis=1)
        min_max_scaler.fit(df5_5)
        scalers[i] = {'scaler': min_max_scaler, 'day_start': day_start2, 'day_end': day_end2}
        df5_6 = min_max_scaler.transform(df5_5)
        df5_6 = pd.DataFrame(df5_6, columns=df5_5.columns, index=df5_5.index)
        df5_6['udate'] = df5_4['udate']
        # 5.4.5 合併
        data3_1.append(df5_6)
    except:
        print('except: ', start1, end1, len(days2))

# 5.4.6 合併
df5_7 = pd.DataFrame()
for df5_8 in data3_1:
    df5_7 = pd.concat([df5_7, df5_8], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
df5_7

数据集: Total dataset has 253011 samples, and 19 features.


Unnamed: 0_level_0,Close,sma-5,sma-20,sma-50,sma-100,upper-band,lower-band,%b,%b-high,%b-low,p-sar,vwap-5,vwap-20,vwap-50,vwap-100,macdhist,j-kdj,diff-kdj,udate
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-02-23 17:00:00,0.970192,0.990000,0.990000,0.990000,0.990000,0.906962,0.990000,-0.977270,0.99,-0.99,0.990000,0.990000,0.990000,0.990000,0.990000,0.114264,-0.267126,0.044846,2020-02-23 17:00:00
2020-02-23 17:01:00,0.979085,0.921577,0.971608,0.979392,0.985862,0.939174,0.930054,-0.641074,-0.99,-0.99,0.979085,0.952584,0.956905,0.959034,0.967751,0.114264,-0.267126,0.044846,2020-02-23 17:01:00
2020-02-23 17:02:00,0.983532,0.853936,0.953509,0.968866,0.981765,0.959125,0.883882,-0.488656,-0.99,-0.99,0.914989,0.936584,0.936244,0.940199,0.952357,0.114264,-0.267126,0.044846,2020-02-23 17:02:00
2020-02-23 17:03:00,0.990000,0.788074,0.935828,0.958407,0.977667,0.971551,0.845919,-0.391878,-0.99,-0.99,0.863439,0.921215,0.923321,0.928370,0.942080,0.114264,-0.267126,0.044846,2020-02-23 17:03:00
2020-02-23 17:04:00,0.969788,0.717802,0.917310,0.947623,0.973410,0.981126,0.809544,-0.353352,-0.99,-0.99,0.822202,0.900003,0.907189,0.910841,0.926422,0.114264,-0.267126,0.044846,2020-02-23 17:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-18 15:56:00,-0.373852,-0.380054,-0.423430,-0.373524,-0.024744,-0.476709,-0.341173,-0.201462,-0.99,-0.99,-0.377818,-0.385021,-0.430444,-0.285787,0.144159,0.037432,-0.889240,-0.691577,2020-11-18 15:56:00
2020-11-18 15:57:00,-0.375517,-0.387631,-0.425840,-0.378466,-0.032726,-0.477566,-0.342473,-0.205590,-0.99,-0.99,-0.378507,-0.390321,-0.432391,-0.291870,0.141731,0.036399,-0.868218,-0.663111,2020-11-18 15:57:00
2020-11-18 15:58:00,-0.363860,-0.392108,-0.427812,-0.382359,-0.040788,-0.484347,-0.340215,-0.101505,-0.99,-0.99,-0.380091,-0.392317,-0.436283,-0.295654,0.140086,0.044660,-0.847599,-0.635112,2020-11-18 15:58:00
2020-11-18 15:59:00,-0.377183,-0.397963,-0.429784,-0.386776,-0.048849,-0.489914,-0.339326,-0.206225,-0.99,-0.99,-0.381606,-0.399407,-0.438230,-0.299437,0.138363,0.044143,-0.830363,-0.610613,2020-11-18 15:59:00


In [14]:
# 5.5.1 訓練集, 測試集, 數據集
data4 = separate_daily(df5_7, 'dict')
days3 = np.array(list(data4.keys()))
train_set, test_set, valid_set = days3[0:-55], days3[-55:-15], days3[-15:-1]
x_train, y_train, date_train = np.array([]), np.array([]), []
x_test, y_test, date_test = np.array([]), np.array([]), []
x_valid, y_valid, date_valid = np.array([]), np.array([]), []
# 5.5.2 窗口步長
t_pus_no = 5
window_size = 100

for day, df6 in data4.items():
    df6.drop(['udate'], axis=1, inplace=True)
    # 5.5.3 窗口
    no_max = df6.shape[0]-t_pus_no
    x_data, y_data = [], []
    for i in range(window_size, no_max):
        start, end = i-window_size, i
        # y label
        temp_0 = df6.iloc[end: end+t_pus_no]
        y_data.append(temp_0['Close'])
        # x matrix
        temp_1 = df6.iloc[start: end]
        x_data.append(temp_1)
        # date
        days4 = temp_0.index.tolist()
        if day in train_set:
            date_train.append(days4)
        elif day in test_set:
            date_test.append(days4)
        elif day in valid_set:
            date_valid.append(days4)
    x_data, y_data = np.array(x_data), np.array(y_data)
    # print(day, x_data.shape, y_data.shape)
    # 5.5.4 分集1
    if day in train_set and x_train.any() and y_train.any():
        x_train = np.concatenate((x_train, x_data), axis=0)
        y_train = np.concatenate((y_train, y_data), axis=0)
    elif day in test_set and x_test.any() and y_test.any():
        x_test = np.concatenate((x_test, x_data), axis=0)
        y_test = np.concatenate((y_test, y_data), axis=0)
    elif day in valid_set and x_valid.any() and y_valid.any():
        x_valid = np.concatenate((x_valid, x_data), axis=0)
        y_valid = np.concatenate((y_valid, y_data), axis=0)
    # 5.5.5 分集2
    if day in train_set and not x_train.any() and not y_train.any():
        x_train, y_train = x_data, y_data
    elif day in test_set and not x_test.any() and not y_test.any():
        x_test, y_test = x_data, y_data
    elif day in valid_set and not x_valid.any() and not y_valid.any():
        x_valid, y_valid = x_data, y_data

# no_batches, timesteps, no_features
print('訓練集: X_train Data: {}, Y_train Data: {}, Date_Train: {}'.format(x_train.shape, y_train.shape, len(date_train)))
print('測試集: X_Test Data: {}, Y_Test Data: {}, Date_Test: {}'.format(x_test.shape, y_test.shape, len(date_test)))
print('验证集: X_Valid Data: {}, Y_Valid Data: {}, Date_Valid: {}'.format(x_valid.shape, y_valid.shape, len(date_valid)))

訓練集: X_train Data: (163672, 100, 18), Y_train Data: (163672, 5), Date_Train: 163672
測試集: X_Test Data: (50432, 100, 18), Y_Test Data: (50432, 5), Date_Test: 50432
验证集: X_Valid Data: (17651, 100, 18), Y_Valid Data: (17651, 5), Date_Valid: 17651


In [15]:
# 6.1.1 模型参数
batch_size = 1024
epochs = 2
units = 128
verbose = 1
no_batches = x_train.shape[0]
timesteps = x_train.shape[1]
no_features = x_train.shape[2]
batch_input_shape = (timesteps, no_features)

# 6.1.2 日志参数
prefix = 'nq-lstm'
cur_time = datetime.now().strftime("%Y%m%d-%H%M%S")

# 6.2 模型 
# activation=softsign/tanh
model = tf.keras.Sequential()
model.add(LSTM(units=units, recurrent_activation='sigmoid', activation='tanh', unroll=False, use_bias=True, 
                               recurrent_dropout=0, return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.1))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(LSTM(units=units, activation='tanh',return_sequences=True, input_shape=batch_input_shape))
model.add(Dropout(rate=0.1))

model.add(LSTM(units=units, activation='tanh',return_sequences=False, input_shape=batch_input_shape))
model.add(Dropout(rate=0.2))

model.add(Dense(units=t_pus_no))
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])
model.summary()

# 6.3.1 check point
checkpoint_dir = './training_checkpoints/'+ prefix +'-' + cur_time
os.mkdir(checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

# 6.3.2 tensor board
log_dir = os.path.join('./logs/fit/'+ prefix +'-') + cur_time
os.mkdir(log_dir)
tensor_board_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# 6.4 fit model
history_model = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), shuffle=True,
                          verbose=verbose, callbacks=[checkpoint_callback, tensor_board_callback])

# 6.5 save model
model_path = "./saved_model/"+prefix+"-"+cur_time+".h5"
model.save(model_path)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 128)          75264     
_________________________________________________________________
dropout (Dropout)            (None, 100, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 128)          131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100, 128)          131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 100, 128)          1

In [16]:
# 7.1 visualize loss
keys = list(history_model.history.keys())
training_loss = history_model.history['loss']
test_loss = history_model.history['val_loss']
epoch_count = range(1, len(training_loss) + 1)
plt.plot(epoch_count, training_loss, 'r--')
plt.plot(epoch_count, test_loss, 'b-')
plt.legend(['Train', 'Test'])
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('model:'+prefix+'   batch_size:'+str(batch_size)+'   epochs:'+str(epochs)+'   units:'+str(units),loc ='left')
plt.tight_layout()
plt.grid()
plt.savefig('./data/img-nq/results/'+cur_time+'-loss')
plt.clf()

# 7.2 visualize accuracy
training_accuracy = history_model.history['accuracy']
test_accuracy = history_model.history['val_accuracy']
epoch_count = range(1, len(training_accuracy) + 1)
plt.plot(epoch_count, training_accuracy, 'r--')
plt.plot(epoch_count, test_accuracy, 'b-')
plt.legend(['Train', 'Test'])
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('model:'+prefix+'   batch_size:'+str(batch_size)+'   epochs:'+str(epochs)+'   units:'+str(units),loc ='left')
plt.tight_layout()
plt.grid()
plt.savefig('./data/img-nq/results/'+cur_time+'-accuracy')
plt.clf()

<Figure size 432x288 with 0 Axes>

In [17]:
# 8.0 model evaluate
verbose = 1
train_score = model.evaluate(x_train, y_train, verbose=verbose)
test_score = model.evaluate(x_test, y_test, verbose=verbose)
valid_score = model.evaluate(x_valid, y_valid, verbose=verbose)

print('Train Score: %.4f MSE (%.4f RMSE)' % (train_score[0], math.sqrt(train_score[0])))
print('Test Score: %.4f MSE (%.4f RMSE)' % (test_score[0], math.sqrt(test_score[0])))
print('Validate Score: %.4f MSE (%.4f RMSE)' % (valid_score[0], math.sqrt(valid_score[0])))

Train Score: 0.0017 MSE (0.0407 RMSE)
Test Score: 0.0013 MSE (0.0358 RMSE)
Validate Score: 0.0029 MSE (0.0538 RMSE)


In [18]:
# 9.0 預測
predict1 = model.predict(x_valid)
df7 = pd.DataFrame(predict1, columns=['t1', 't2', 't3', 't4', 't5'])
date_valid2 = [v[0] for v in date_valid]
df7['udate'] = date_valid2
df7.index = df7['udate']
df7

Unnamed: 0_level_0,t1,t2,t3,t4,t5,udate
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-27 18:40:00,-0.189017,-0.188799,-0.186604,-0.185304,-0.186657,2020-10-27 18:40:00
2020-10-27 18:41:00,-0.187717,-0.187540,-0.185282,-0.183918,-0.185305,2020-10-27 18:41:00
2020-10-27 18:42:00,-0.186354,-0.186208,-0.183891,-0.182464,-0.183891,2020-10-27 18:42:00
2020-10-27 18:43:00,-0.185230,-0.185098,-0.182719,-0.181247,-0.182728,2020-10-27 18:43:00
2020-10-27 18:44:00,-0.184622,-0.184487,-0.182032,-0.180547,-0.182101,2020-10-27 18:44:00
...,...,...,...,...,...,...
2020-11-17 15:51:00,0.333005,0.329742,0.330193,0.328495,0.329720,2020-11-17 15:51:00
2020-11-17 15:52:00,0.327428,0.324070,0.324606,0.322782,0.323974,2020-11-17 15:52:00
2020-11-17 15:53:00,0.321307,0.317849,0.318448,0.316504,0.317686,2020-11-17 15:53:00
2020-11-17 15:54:00,0.314598,0.311093,0.311733,0.309649,0.310827,2020-11-17 15:54:00


In [19]:
# 10.0 
len_shape_y = x_valid.shape[2]-1
fill_list = list(repeat(0, len_shape_y))
df8 = pd.DataFrame()

# 10.1 逆向
for k, v in scalers.items():
    mask = ((df7['udate'] >= v['day_start']) & (df7['udate'] <= v['day_end']))
    df8_1 = df7.loc[mask]
    df8_2 = df8_1.drop(['udate'], axis=1)
    # 10.2
    data4 = []
    for k1, v1 in df8_2.iterrows():
        data4_1 = []
        for v2 in v1:
            data4_1.append([v2] + fill_list)
        data4_2 = v['scaler'].inverse_transform(data4_1)
        data4_3 = [v[0] for v in data4_2]
        data4.append(data4_3)
    # 10.3
    df8_3 = pd.DataFrame(data4, columns=['t1', 't2', 't3', 't4', 't5'])
    df8_3.index = df8_1['udate']
    # 10.4 合併
    if df8_3.shape[0] > 0:
        df8 = pd.concat([df8, df8_3], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)

df8['udate'] = df8.index.values
df8 = df8[['udate', 't1', 't2', 't3', 't4', 't5']]
# df8[df8.index.duplicated()]
df8

Unnamed: 0_level_0,udate,t1,t2,t3,t4,t5
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-27 18:40:00,2020-10-27 18:40:00,11534.933014,11535.094050,11536.714506,11537.674609,11536.675380
2020-10-27 18:41:00,2020-10-27 18:41:00,11535.892489,11536.023862,11537.691102,11538.697977,11537.673519
2020-10-27 18:42:00,2020-10-27 18:42:00,11536.899364,11537.007202,11538.717694,11539.771221,11538.717826
2020-10-27 18:43:00,2020-10-27 18:43:00,11537.729435,11537.826678,11539.583381,11540.669982,11539.576537
2020-10-27 18:44:00,2020-10-27 18:44:00,11538.178382,11538.278122,11540.090620,11541.186739,11540.039699
...,...,...,...,...,...,...
2020-11-17 15:51:00,2020-11-17 15:51:00,11980.367762,11979.877878,11979.945567,11979.690793,11979.874701
2020-11-17 15:52:00,2020-11-17 15:52:00,11979.530615,11979.026482,11979.106940,11978.833115,11979.011972
2020-11-17 15:53:00,2020-11-17 15:53:00,11978.611646,11978.092408,11978.182467,11977.890527,11978.067979
2020-11-17 15:54:00,2020-11-17 15:54:00,11977.604461,11977.078279,11977.174365,11976.861441,11977.038263


In [20]:
# 11.1
df9 = df3.drop(list(df3.columns)[6:], axis=1)
df9_1 = pd.concat([df9, df8], axis=1)

# 11.2 
start2, end2 =  date_valid2[0], date_valid2[-2:-1][0]
start3 = start2 - timedelta(minutes=t_pus_no+window_size)
mask3 = ((df9_1.index >= start3) & (df9_1.index <= start2)) # 窗口步長
mask4 = ((df9_1.index >= start2) & (df9_1.index <= end2)  & (df9_1['t1'] > 0)) # 預測
df9_2 = df9_1.loc[(mask3 | mask4)]

# 11.3 save
path_1 = os.path.abspath(os.path.join('data', 'nq', 'prediction', 'nq-prediction.csv'))
if os.path.exists(path_1):
    os.remove(path_1)
df9_3 = df9_2.drop(['udate'], axis=1)
df9_3.to_csv(path_1)

df9_3

Unnamed: 0_level_0,High,Low,Open,Close,Volume,t1,t2,t3,t4,t5
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-10-27 17:00:00,11558.50,11559.50,11546.50,11553.50,420.0,,,,,
2020-10-27 17:01:00,11553.50,11557.75,11547.00,11548.75,237.0,,,,,
2020-10-27 17:02:00,11548.75,11551.75,11547.00,11551.75,128.0,,,,,
2020-10-27 17:03:00,11551.75,11553.00,11547.50,11549.75,85.0,,,,,
2020-10-27 17:04:00,11549.75,11551.25,11543.25,11545.50,178.0,,,,,
...,...,...,...,...,...,...,...,...,...,...
2020-11-17 15:50:00,11978.50,11978.75,11977.25,11978.00,61.0,11981.105176,11980.628625,11980.680739,11980.445424,11980.637542
2020-11-17 15:51:00,11978.00,11978.50,11977.50,11977.75,19.0,11980.367762,11979.877878,11979.945567,11979.690793,11979.874701
2020-11-17 15:52:00,11977.75,11977.75,11975.00,11976.25,87.0,11979.530615,11979.026482,11979.106940,11978.833115,11979.011972
2020-11-17 15:53:00,11976.25,11976.25,11971.75,11973.25,105.0,11978.611646,11978.092408,11978.182467,11977.890527,11978.067979
