In [1]:
import tensorflow as tf
import pandas as pd
import sklearn as sk
import matplotlib
import matplotlib.pyplot as plt
import mplfinance as mpf
import talib as talib
import numpy as np
import data as ds
import common as common
import os as os
import math as math
import datetime as dt
import scipy as sp
import itertools  as itertools
import multiprocessing as mp
import joblib
from os import listdir, walk
from pathlib import Path
from itertools import repeat
from mplfinance.original_flavor import candlestick_ohlc
from datetime import datetime, timedelta, date
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.optimizers import SGD
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, pairwise, mean_squared_error, mean_absolute_error
from scipy import stats
from pprint import pprint
import sqlite3 as sqlite3

In [2]:
# 0.1 Environment: turn off pandas' chained-assignment warning for the whole notebook.
pd.set_option('mode.chained_assignment', None)

# 0.4 Data source switch: True loads bars from the SQLite file, False parses the monthly CSVs.
is_use_sqlite = True

# 0.2 Run identifiers; later cells use them to name scalers, checkpoints, logs and models.
prefix = 'nq-lstm'
cur_time = datetime.strftime(datetime.now(), "%Y%m%d-%H%M%S")

# 0.3 Let TensorFlow grow GPU memory on demand instead of reserving all of it up front.
gpus = tf.config.list_physical_devices('GPU')
print(gpus)
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# 1.0 Rebuild the raw dataset
def reshape_dataframe(df0, year0):
    """Turn one raw six-column CSV frame into a datetime-indexed bar frame.

    The six input columns are renamed, by position, to
    ``udate, High, Low, Open, Close, Volume``. ``udate`` is read as an integer
    whose last eight digits are ``MMDDHHMM``; the year is supplied by the caller.

    NOTE(review): the rows displayed later in this notebook show ``High`` below
    ``Low`` (e.g. 6353.00 vs 6364.75), which suggests the source order is really
    Open, High, Low, Close. Confirm the CSV layout before trusting this labelling.

    Parameters
    ----------
    df0 : pandas.DataFrame
        Raw frame with exactly six columns. It is NOT modified.
    year0 : int
        Calendar year to attach to every timestamp.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with NaN/inf replaced by 0, rows without a positive
        ``udate`` removed, numeric columns cast (prices to float64, Volume to
        int64), ``udate`` converted to datetimes and also used as the index.

    Raises
    ------
    ValueError
        If a positive ``udate`` does not decode to a valid month/day/hour/minute.
    """
    column_names = ['udate', 'High', 'Low', 'Open', 'Close', 'Volume']
    numeric_types = {'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Volume': 'int64'}

    # Work on a copy so the caller's frame is never renamed or filled in place.
    df0_1 = df0.copy(deep=True)
    df0_1.columns = column_names
    df0_1 = df0_1.fillna(0).replace([np.inf, -np.inf], 0)

    # Rows whose timestamp is missing, non-numeric or non-positive are discarded.
    stamps = pd.to_numeric(df0_1['udate'], errors='coerce').fillna(0)
    is_valid = stamps > 0
    df0_1 = df0_1.loc[is_valid].astype(numeric_types)
    stamps = stamps.loc[is_valid].astype('int64')

    # Decode the trailing MMDDHHMM digits arithmetically for the whole column at once
    # (replaces the per-row string slicing and chained ``.iloc`` assignment).
    parts = pd.DataFrame({
        'year': year0,
        'month': stamps // 1000000 % 100,
        'day': stamps // 10000 % 100,
        'hour': stamps // 100 % 100,
        'minute': stamps % 100,
    })
    df0_1['udate'] = pd.to_datetime(parts)
    df0_1.index = pd.to_datetime(df0_1['udate'])

    # 1.0.1 Data validity report (informational only; nothing is dropped here).
    for column in column_names:
        if df0_1[column].isna().any():
            print(column+' obtains nan')
        if df0_1[column].isin([0]).any():
            print(column+' obtains 0')

    return df0_1
# 1.2.1 Monthly CSV paths: path_files_1 lists the 2020 files, path_files_2 the 2019 files.
csv_dir = os.path.abspath(os.path.join('data', 'nq', 'data'))
path_files_1, path_files_2 = [], []
for j in range(1, 13):
    month1 = '%02d' % j  # zero-padded month number, 01..12
    file1, file2 = 'nq-20-'+month1+'.csv', 'nq-19-'+month1+'.csv'
    path_files_1.append(os.path.join(csv_dir, file1))
    path_files_2.append(os.path.join(csv_dir, file2))

    
# 1.3 Worker: load one monthly CSV and append the reshaped frame to a shared list.
def data_worker1(m_list1_2, path1_2, year_2):
    """Read ``path1_2``, reshape it for ``year_2`` and append it to ``m_list1_2``.

    The worker is best-effort: a failure is printed and the file is skipped so
    the remaining months still load. A missing file and any other failure are
    reported separately, so parsing or reshaping errors are no longer
    mislabelled as "No file".

    Parameters
    ----------
    m_list1_2 : list-like
        Shared container (anything with ``append``) collecting the frames.
    path1_2 : str
        Path of the CSV file to read.
    year_2 : int
        Year passed through to ``reshape_dataframe``.
    """
    try:
        raw_2 = pd.read_csv(path1_2)
        m_list1_2.append(reshape_dataframe(raw_2, year_2))
    except FileNotFoundError:
        print('No file', path1_2)
    except Exception as err:
        # Only ordinary errors are swallowed; KeyboardInterrupt/SystemExit propagate.
        print('Failed to load', path1_2, repr(err))

# 1.4 Parse every monthly CSV in a process pool (skipped when the SQLite source is used).
if not is_use_sqlite:
    manager1 = mp.Manager()
    m_list1 = manager1.list()  # proxy list shared with the worker processes
    pool1 = mp.Pool(processes=8, maxtasksperchild=8)
    # 2020 files are queued first, then 2019, each tagged with its year.
    csv_jobs = [(path1, 2020) for path1 in path_files_1] + [(path2, 2019) for path2 in path_files_2]
    for csv_path, csv_year in csv_jobs:
        pool1.apply_async(func=data_worker1, args=(m_list1, csv_path, csv_year,))
    pool1.close()
    pool1.join()

In [4]:
if not is_use_sqlite:
    # 1.5 Stack the monthly frames, keeping their datetime index.
    df2 = pd.concat(m_list1, ignore_index=False)

    # 1.6 Collapse duplicated timestamps (first non-null value per column),
    # 1.7 then order the bars chronologically.
    df2 = df2.groupby(df2.index).first().sort_index(axis=0, ascending=True)
    # Optional filter, currently disabled:
    # df2 = df2.loc[(df2.udate >= datetime(2019, 3, 1, 16, 0, 0))]

In [5]:
if is_use_sqlite:
    # 1.8 Load the bars from SQLite.
    path_db1 = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-1m.db'))
    # Alternative 15-second database; not used by the query below.
    path_db2 = os.path.abspath(os.path.join('data', 'nq', 'data-15s', 'nq-15s.db'))
    # stmt1 keeps rows stamped on a whole minute; stmt2 (unused here) keeps the
    # :00/:15/:30/:45 second rows from 2019-10-17 on, capped at 1,300,000 rows.
    stmt1 = "select * from nq where strftime('%S', udate) == '00' order by udate"
    stmt2 = "select * from nq where (strftime('%S', udate) == '00' or strftime('%S', udate) == '15' or strftime('%S', udate) == '30' or strftime('%S', udate) == '45') and udate>='2019-10-17' order by udate limit 1300000"
    db = sqlite3.connect(path_db1)
    try:
        # Read-only query: no cursor or commit is needed.
        df2 = pd.read_sql_query(stmt1, db)
    finally:
        # Close the connection even if the query fails.
        db.close()

    df2 = df2.rename(columns={'high': 'High', 'low': 'Low', 'open': 'Open', 'close': 'Close', 'vol': 'Volume'})
    # NOTE(review): the rows displayed later in this notebook show 'High' below 'Low'
    # (e.g. 6353.00 vs 6364.75), so the stored columns look shifted
    # (Open, High, Low, Close) - confirm against the table before relying on them.
    types3 = {'udate': 'object', 'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Volume': 'int64'}
    # Keep the cast result; previously it was computed and thrown away.
    df2 = df2.astype(types3)
    df2['udate'] = pd.to_datetime(df2['udate'])
    df2.index = pd.to_datetime(df2.udate)

In [6]:
# 2.0 Split the bars into trading sessions (one per calendar day)
def separate_daily(df2_1, return_type):
    """Group bars into sessions running from 17:00 to 16:00 the next day.

    For every calendar date that occurs in ``df2_1['udate']`` a session window
    ``[date 17:00, date+1 16:00]`` (both ends inclusive) is cut out. Sessions
    with fewer than two rows are discarded, so rows that fall outside every
    kept session (for example between 16:00 and 17:00) are dropped.

    Parameters
    ----------
    df2_1 : pandas.DataFrame
        Frame with a datetime-like ``udate`` column.
    return_type : str
        ``'dict'`` returns ``{datetime.date: session frame}`` in order of first
        appearance; ``'df'`` returns the sessions concatenated in that order.

    Returns
    -------
    dict or pandas.DataFrame
        As selected by ``return_type``. ``'df'`` yields an empty DataFrame when
        no session qualifies.

    Raises
    ------
    ValueError
        If ``return_type`` is neither ``'df'`` nor ``'dict'`` (the previous
        behaviour was to return ``None`` silently).
    """
    if return_type not in ('df', 'dict'):
        raise ValueError("return_type must be 'df' or 'dict', got {!r}".format(return_type))

    data1 = {}
    stamps = df2_1['udate']
    # 2.1 Trading dates, de-duplicated while keeping first-appearance order.
    days1 = list(dict.fromkeys(pd.to_datetime(stamps).dropna().dt.date))
    for day in days1:
        # 2.2 Session window for this date.
        day_start = datetime(day.year, day.month, day.day, 17, 0, 0)
        day2 = day + timedelta(days=1)
        day_end = datetime(day2.year, day2.month, day2.day, 16, 0, 0)
        mask = ((stamps >= day_start) & (stamps <= day_end))
        df2_2 = df2_1.loc[mask]
        if df2_2.shape[0] > 1:
            data1[day] = df2_2

    # 2.4 Return type
    if return_type == 'dict':
        return data1
    # 2.3 Merge with one concat call instead of growing a frame inside a loop.
    if not data1:
        return pd.DataFrame()
    return pd.concat(list(data1.values()), axis=0, join='outer', ignore_index=False)

# Work on a deep copy so df2 (the loaded bars) stays untouched, then keep only
# the rows that fall inside a 17:00 -> 16:00 session. The result is the cell's display value.
df2_5 = df2.copy(deep=True)
df3 = separate_daily(df2_5, 'df')
df3

Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume,table_constraints
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2019-01-01 17:01:00,2019-01-01 17:01:00,6353.00,6364.75,6351.50,6361.25,296,
2019-01-01 17:02:00,2019-01-01 17:02:00,6361.25,6371.50,6359.25,6368.50,339,
2019-01-01 17:03:00,2019-01-01 17:03:00,6368.50,6371.75,6366.00,6366.25,195,
2019-01-01 17:04:00,2019-01-01 17:04:00,6366.25,6367.50,6364.50,6366.25,154,
2019-01-01 17:05:00,2019-01-01 17:05:00,6366.25,6367.00,6365.75,6366.50,40,
...,...,...,...,...,...,...,...
2020-12-30 21:37:00,2020-12-30 21:37:00,12848.00,12848.00,12848.00,12848.00,2,
2020-12-30 21:38:00,2020-12-30 21:38:00,12848.00,12849.00,12848.00,12848.50,15,
2020-12-30 21:39:00,2020-12-30 21:39:00,12848.50,12850.25,12848.50,12849.25,10,
2020-12-30 21:40:00,2020-12-30 21:40:00,12849.25,12850.00,12848.25,12848.25,10,


In [7]:
# 3.0 Technical indicators (added as new columns on df3)
# NOTE(review): the displayed bars show 'High' below 'Low', so `highs`/`lows`
# may really hold Open/High. Confirm the column labelling, because SAR and KDJ
# below depend on true highs and lows.
highs = np.array(df3['High'], dtype='float')
lows = np.array(df3['Low'], dtype='float')
opens = np.array(df3['Open'], dtype='float')
closes = np.array(df3['Close'], dtype='float')
vols = np.array(df3['Volume'], dtype='float')
# 3.1 Bollinger bands
df3['upper-band'], df3['middle-band'], df3['lower-band'] = talib.BBANDS(closes, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
# 3.2 %B: position of the close inside the band, in percent
df3['%b'] = (df3['Close']-df3['lower-band'])/(df3['upper-band']-df3['lower-band'])*100
df3['%b-high'] = common.percentB_belowzero(df3['%b'], df3['Close'])
df3['%b-low'] = common.percentB_aboveone(df3['%b'], df3['Close'])
# 3.3 MACD
weight = 1
df3['macd'], df3['macdsignal'], df3['macdhist'] = talib.MACD(closes, fastperiod=12*weight, slowperiod=26*weight, signalperiod=9*weight)
# 3.4 RSI: 'rsi-2' keeps the raw value, 'rsi' is zeroed strictly between 25 and 85.
# A single .loc[mask, column] assignment replaces the chained df3[col].loc[mask] = ...,
# which pandas does not guarantee to write back to df3.
df3['rsi-2'] = talib.RSI(closes, timeperiod=14*weight)
df3['rsi'] = df3['rsi-2']
df3.loc[(df3['rsi'] < 85) & (df3['rsi'] > 25), 'rsi'] = 0
# 3.5 KDJ: J is zeroed strictly between 20 and 100
df3['k-kdj'], df3['d-kdj'], df3['j-kdj'] = common.kdj(highs, lows, closes, window_size=5)
df3['diff-kdj'] = df3['k-kdj']-df3['d-kdj']
df3.loc[(df3['j-kdj'] > 20) & (df3['j-kdj'] < 100), 'j-kdj'] = 0
# 3.6 VWAP (volume-weighted average price); disabled while `period` is empty
period = []
if period:
    # Typical price and turnover do not depend on the window, so compute them once.
    typical_price = (df3['High'] + df3['Low'] + df3['Close']) / 3
    turnover = typical_price * df3['Volume']
    for v in period:
        vwap = turnover.rolling(window=v).sum() / df3['Volume'].rolling(window=v).sum()
        df3['vwap-'+str(v)] = vwap.replace([np.inf, -np.inf], 0).fillna(0)
# 3.7 EMA of the close (exponential moving averages)
for v in [10,20,50,100]:
    df3['ema-'+str(v)]= talib.EMA(closes, timeperiod=v)
# 3.8 EMA of the volume
df3['vol-ema5'] = talib.EMA(vols, timeperiod=5)
# 3.9 HMA (disabled)
# df3['hma-16']= common.HMA(closes, period=16)
# df3['hma-64']= common.HMA(closes, period=64)
# df3['hma-100']= common.HMA(closes, period=100)
# df3['hma-256']= common.HMA(closes, period=256)
# 3.10 Parabolic SAR as a direction flag: 1 when SAR is below the close,
# -1 when above, 0 when equal, NaN where SAR is undefined. Taking the sign of
# the difference avoids the earlier step-by-step overwrite, which only worked
# while every price was greater than 1.
df3['p-sar'] = np.sign(df3['Close'] - talib.SAR(highs, lows, acceleration=0.02, maximum=0.2))
df3

Unnamed: 0_level_0,udate,High,Low,Open,Close,Volume,table_constraints,upper-band,middle-band,lower-band,...,k-kdj,d-kdj,j-kdj,diff-kdj,ema-10,ema-20,ema-50,ema-100,vol-ema5,p-sar
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01-01 17:01:00,2019-01-01 17:01:00,6353.00,6364.75,6351.50,6361.25,296,,,,,...,,,,,,,,,,
2019-01-01 17:02:00,2019-01-01 17:02:00,6361.25,6371.50,6359.25,6368.50,339,,,,,...,,,,,,,,,,1.0
2019-01-01 17:03:00,2019-01-01 17:03:00,6368.50,6371.75,6366.00,6366.25,195,,,,,...,,,,,,,,,,1.0
2019-01-01 17:04:00,2019-01-01 17:04:00,6366.25,6367.50,6364.50,6366.25,154,,,,,...,,,,,,,,,,1.0
2019-01-01 17:05:00,2019-01-01 17:05:00,6366.25,6367.00,6365.75,6366.50,40,,,,,...,,,,,,,,,204.800000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-30 21:37:00,2020-12-30 21:37:00,12848.00,12848.00,12848.00,12848.00,2,,12850.075683,12846.9875,12843.899317,...,65.526037,60.560165,0.0,4.965873,12847.598957,12847.086559,12847.337199,12848.421631,24.725653,1.0
2020-12-30 21:38:00,2020-12-30 21:38:00,12848.00,12849.00,12848.00,12848.50,15,,12850.232897,12847.1125,12843.992103,...,64.403230,61.949336,0.0,2.453894,12847.762783,12847.221173,12847.382799,12848.423183,21.483769,1.0
2020-12-30 21:39:00,2020-12-30 21:39:00,12848.50,12850.25,12848.50,12849.25,10,,12850.435095,12847.1875,12843.939905,...,64.999722,62.986355,0.0,2.013367,12848.033186,12847.414394,12847.456022,12848.439555,17.655846,1.0
2020-12-30 21:40:00,2020-12-30 21:40:00,12849.25,12850.00,12848.25,12848.25,10,,12850.535400,12847.2625,12843.989600,...,63.859371,63.539942,0.0,0.319428,12848.072607,12847.493976,12847.487159,12848.435802,15.103897,1.0


In [8]:
# 4.0 draw chart
drop_list_2 = []         # session dates excluded from the dataset (filled by the loop below)
is_render_chart = False  # set True to write one PNG per session

def draw_worker(df4_2, day):
    """Render one session as a candlestick chart with indicator panels.

    Panel 0 holds the candles with both Bollinger bands and the %B markers,
    panel 1 the volume EMA, panel 2 MACD, panel 3 KDJ and panel 4 RSI. The
    figure is saved under ``./data/img-nq/features/`` using the session date
    as file name.

    Failures are reported and swallowed so one bad session does not stop the
    others. The handler names ``day``; it previously printed the global ``k``,
    which is not a parameter of this function.

    Parameters
    ----------
    df4_2 : pandas.DataFrame
        Session frame containing the price and indicator columns used below.
    day : datetime.date
        Session date, used for the output file name.
    """
    try:
        # 4.1 style
        style = mpf.make_mpf_style(base_mpf_style='charles', rc={'font.size':6})
        # 4.2 addplot
        apds = [mpf.make_addplot(df4_2['lower-band'],panel=0,color='orange',linestyle='solid'),
                mpf.make_addplot(df4_2['upper-band'],panel=0,color='bisque',linestyle='solid'),
                # mpf.make_addplot(df4_2['vwap-50'].replace(0, np.nan),panel=0,color='aqua',linestyle='solid'),
                mpf.make_addplot(df4_2['%b-low'],type='scatter',markersize=20,marker='v',panel=0),
                mpf.make_addplot(df4_2['%b-high'],type='scatter',markersize=20,marker='^',panel=0),
                # mpf.make_addplot(df4_2['p-sar'],scatter=True,markersize=1,marker='*',panel=0,color='blueviolet'),
                # volume panel
                mpf.make_addplot(df4_2['vol-ema5'],panel=1,color='orange'),
                # MACD panel
                mpf.make_addplot(df4_2['macd'],panel=2,color='orange'),
                mpf.make_addplot(df4_2['macdsignal'],panel=2,color='violet'),
                mpf.make_addplot(df4_2['macdhist'],panel=2,type='bar',color='dimgray'),
                # KDJ panel
                mpf.make_addplot(df4_2['k-kdj'],panel=3,color='orange'),
                mpf.make_addplot(df4_2['d-kdj'],panel=3,color='violet'),
                mpf.make_addplot(df4_2['j-kdj'],panel=3,color='aqua'),
                mpf.make_addplot(df4_2['diff-kdj'],panel=3,type='bar',color='dimgray'),
                # RSI panel
                mpf.make_addplot(df4_2['rsi-2'],panel=4,color='orange'),
                mpf.make_addplot(df4_2['rsi'],panel=4,color='violet')]
        # 4.3 draw
        mpf.plot(df4_2, type='candle', addplot=apds, style=style, ylabel='', ylabel_lower='', volume=True, figscale=0.85, xrotation=0, datetime_format="%H:%M",
                 show_nontrading=False, tight_layout=True, savefig='./data/img-nq/features/'+day.strftime('%Y-%m-%d'))
        print('finish draw:', df4_2.shape, df4_2['udate'].iloc[0], df4_2['udate'].iloc[-1])
    except Exception as err:
        # Report which session failed and why; KeyboardInterrupt/SystemExit still propagate.
        print('do not draw:', day, repr(err), '\n')

# 4.5 Worker pool: only created when charts are rendered, so no idle processes are spawned.
pool = mp.Pool(processes=8, maxtasksperchild=8) if is_render_chart else None
# 4.6 Split a copy of the indicator frame into sessions
df4 = df3.copy(deep=True)
data2 = separate_daily(df4, 'dict')
try:
    for k, df4_1 in data2.items():
        session_volume = df4_1['Volume']
        # 4.7 Drop sessions whose volume is entirely zero or that have at most 100 bars
        if session_volume.shape[0] == session_volume.isin([0]).sum() or session_volume.shape[0] <= 100:
            drop_list_2.append(k)
        # 4.8 Draw the chart
        elif (is_render_chart):
            pool.apply_async(draw_worker, args=(df4_1, k))
        # Sessions keyed on a Sunday are always excluded (weekday() == 6).
        if k.weekday() == 6:
            drop_list_2.append(k)
finally:
    # 4.9 Always shut the pool down, even if the loop above fails.
    if pool is not None:
        pool.close()
        pool.join()

# Manually excluded session dates.
drop_list_2.extend([
    date(2019, 1, 20),
    date(2019, 5, 26),
    date(2019, 7, 3),
    date(2019, 7, 28),
    date(2019, 8, 4),
    date(2019, 8, 25),
    date(2019, 11, 27),
    date(2020, 3, 8),
    date(2020, 3, 15),
    date(2020, 3, 17),
    date(2020, 5, 24),
    date(2020, 9, 6),
    date(2020, 11, 25),
])

# Remove every excluded session; dates that are not present are ignored.
for k in drop_list_2:
    data2.pop(k, None)
    # print('excpet & delete: ', k)

In [9]:
# SQLite export: turn the raw bar columns into tuples ready for `executemany`.
export_columns = ['udate', 'High', 'Low', 'Open', 'Close', 'Volume']
# Select by name, keeping the frame's own column order. This replaces
# drop(..., 1, inplace=True), whose positional axis argument newer pandas rejects.
df4_3 = df4[[column for column in df4.columns if column in export_columns]].copy(deep=True)
df4_3['udate'] = df4_3['udate'].dt.strftime('%Y-%m-%d %H:%M:%S')
data4_1 = [tuple(x) for x in df4_3.values]

# Storage target. Alternative 15-second database:
# os.path.abspath(os.path.join('data', 'nq', 'data-15s', 'nq-15s.db'))
path_db = os.path.abspath(os.path.join('data', 'nq', 'data', 'nq-1m.db'))
db = sqlite3.connect(path_db)
try:
    cursor = db.cursor()
    # The write is disabled; enable the next line to upsert the bars.
    #cursor.executemany('REPLACE INTO nq (udate, high, low, open, close, vol) VALUES (?, ?, ?, ?, ?, ?)', data4_1)
    db.commit()
    cursor.close()
finally:
    # Release the database handle even if the statements above fail.
    db.close()

len(data4_1)

592470

In [10]:
# 5.0 Merge the kept sessions with one concat call (no frame growing inside a loop)
df5 = pd.concat(list(data2.values()), axis=0, join='outer', ignore_index=False) if data2 else pd.DataFrame()

# 5.1 Clean: drop the helper columns, replace NaN by 0, round to two decimals
drop_list_3 = ['High', 'Low', 'Open', 'Volume', 'macd', 'macdsignal', 'upper-band', 'lower-band', 'middle-band', 'k-kdj', 'd-kdj', 'vol-ema5', 'rsi-2']
df5_3 = df5.drop(drop_list_3, axis=1).fillna(0).round(2)

# 5.2 Checks (kept as notebook variables for inspection)
is_contain_null = df5_3.isnull().sum()
is_contain_nan = df5_3.isna().sum()
# Count real infinities in the numeric columns; the previous check tested NaN again.
is_contain_inf = np.isinf(df5_3.select_dtypes(include=[np.number])).sum()
print('数据集: Total dataset has {} samples, and {} features.'.format(df5_3.shape[0], df5_3.shape[1])) # df5_3.info()

# 5.3 Save the cleaned frame (to_csv overwrites an existing file)
path_data = os.path.abspath(os.path.join('data', 'nq', 'clean-data', 'nq-clean-data-with-features.csv'))
os.makedirs(os.path.dirname(path_data), exist_ok=True)
df5_3.to_csv(path_data)

# 5.4.1 Split into sessions
data3 = separate_daily(df5_3, 'dict')
# 5.4.2 Min-max normalisation, fitted separately on blocks of `normalization_days` sessions
# NOTE(review): every scaler is fitted on the very rows it transforms, across all
# dates, so the later test/validation windows are scaled with their own min/max -
# confirm this look-ahead is acceptable.
normalization_days = 10
days2 = np.array(list(data3.keys()))
len_days2 = (days2.shape[0]//normalization_days)+1
data3_1 = []
scalers = {}
for i in range(len_days2, 0, -1):
    # Blocks are counted back from the last session date: the first iteration ends on
    # index len(days2)-1 and every further iteration moves back `normalization_days` dates.
    end1 = (i*normalization_days)-(normalization_days-(days2.shape[0]%normalization_days))-1
    start1 = end1-normalization_days
    if start1 <= 0:
        break # fewer than a full block of days remains: stop
    try:
        # 5.4.3 Block boundaries: 17:00 on the first date up to 16:00 on the last date
        day_start, day_end = days2[start1], days2[end1]
        day_start2 = datetime(day_start.year, day_start.month, day_start.day, 17, 0, 0)
        day_end2 = datetime(day_end.year, day_end.month, day_end.day, 16, 0, 0)
        mask = ((df5_3['udate'] >= day_start2) & (df5_3['udate'] <= day_end2))
        df5_4 = df5_3[mask]
        print('{} ~ {}  开始：{}   终结：{}'.
              format(start1, end1, df5_4.iloc[0]['udate'].strftime('%Y-%m-%d %H:%M:%S'), df5_4.iloc[-1]['udate'].strftime('%Y-%m-%d %H:%M:%S')))
        # 5.4.4 Fit and apply the scaler for this block
        min_max_scaler = MinMaxScaler(feature_range=(0, 1))
        df5_5 = df5_4.drop(['udate'], axis=1)
        min_max_scaler.fit(df5_5)
        scalers[i] = {'scaler': min_max_scaler, 'day_start': day_start2, 'day_end': day_end2}
        df5_6 = min_max_scaler.transform(df5_5)
        df5_6 = pd.DataFrame(df5_6, columns=df5_5.columns, index=df5_5.index)
        df5_6['udate'] = df5_4['udate']
        # 5.4.5 Collect
        data3_1.append(df5_6)
    except Exception as err:
        # Best effort: report the failing block and its cause, then continue.
        print('except: ', start1, end1, len(days2), repr(err))

# 5.4.6 Save the min-max scalers (joblib.dump overwrites an existing file)
path_name_12 = os.path.abspath(os.path.join('min_max_scaler', prefix+'-'+cur_time+'.pkl'))
os.makedirs(os.path.dirname(path_name_12), exist_ok=True)
joblib.dump(scalers, path_name_12)

# 5.4.7 Merge the scaled blocks and restore chronological order
df5_7 = pd.concat(data3_1, axis=0, join='outer', ignore_index=False) if data3_1 else pd.DataFrame()
df5_7 = df5_7.sort_index()
df5_7

数据集: Total dataset has 465730 samples, and 14 features.
337 ~ 347  开始：2020-12-10 17:00:00   终结：2020-12-30 16:00:00
327 ~ 337  开始：2020-11-23 17:00:00   终结：2020-12-10 16:00:00
317 ~ 327  开始：2020-11-04 17:00:00   终结：2020-11-20 15:59:00
307 ~ 317  开始：2020-10-19 17:00:00   终结：2020-11-04 16:00:00
297 ~ 307  开始：2020-09-30 17:01:00   终结：2020-10-16 15:59:00
287 ~ 297  开始：2020-09-10 17:00:00   终结：2020-09-30 16:00:00
277 ~ 287  开始：2020-08-25 17:00:00   终结：2020-09-10 16:00:00
267 ~ 277  开始：2020-08-06 17:00:00   终结：2020-08-25 16:00:00
257 ~ 267  开始：2020-07-21 17:00:00   终结：2020-08-06 16:00:00
247 ~ 257  开始：2020-07-02 17:00:00   终结：2020-07-21 16:00:00
237 ~ 247  开始：2020-06-16 17:00:00   终结：2020-07-02 16:00:00
227 ~ 237  开始：2020-05-28 17:00:00   终结：2020-06-16 16:00:00
217 ~ 227  开始：2020-05-12 17:00:00   终结：2020-05-28 16:00:00
207 ~ 217  开始：2020-04-23 17:00:00   终结：2020-05-12 16:00:00
197 ~ 207  开始：2020-04-06 17:00:00   终结：2020-04-23 16:00:00
187 ~ 197  开始：2020-03-18 17:00:00   终结：2020-04-03 15:59:00


Unnamed: 0_level_0,Close,%b,%b-high,%b-low,macdhist,rsi,j-kdj,diff-kdj,ema-10,ema-20,ema-50,ema-100,p-sar,udate
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2019-01-17 17:00:00,0.296528,0.166674,0.956982,0.000000,0.410256,0.0,0.420826,0.285570,0.300165,0.293540,0.270027,0.241267,0.0,2019-01-17 17:00:00
2019-01-17 17:01:00,0.291406,0.153746,0.000000,0.000000,0.366778,0.0,0.420826,0.237530,0.296705,0.291728,0.269539,0.241241,0.0,2019-01-17 17:01:00
2019-01-17 17:02:00,0.287991,0.196932,0.000000,0.000000,0.335563,0.0,0.418199,0.197379,0.293222,0.289752,0.268906,0.241115,0.0,2019-01-17 17:02:00
2019-01-17 17:03:00,0.286283,0.237431,0.000000,0.000000,0.318841,0.0,0.383280,0.168021,0.290040,0.287799,0.268248,0.240963,0.0,2019-01-17 17:03:00
2019-01-17 17:04:00,0.287422,0.287290,0.000000,0.956618,0.321070,0.0,0.358950,0.150347,0.287672,0.286151,0.267638,0.240861,0.0,2019-01-17 17:04:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-30 15:56:00,0.889216,0.409900,0.000000,0.000000,0.430208,0.0,0.391362,0.617999,0.902353,0.905430,0.911219,0.915492,0.0,2020-12-30 15:56:00
2020-12-30 15:57:00,0.888848,0.402247,0.000000,0.000000,0.423765,0.0,0.391362,0.568458,0.901780,0.905171,0.911141,0.915412,0.0,2020-12-30 15:57:00
2020-12-30 15:58:00,0.889952,0.440073,0.000000,0.000000,0.424481,0.0,0.391362,0.502927,0.901523,0.905034,0.911110,0.915348,0.0,2020-12-30 15:58:00
2020-12-30 15:59:00,0.891056,0.477849,0.000000,0.000000,0.429492,0.0,0.391362,0.439056,0.901508,0.905019,0.911126,0.915332,0.0,2020-12-30 15:59:00


In [11]:
# Check - time filter: cut out a fixed two-hour window (both ends inclusive) for inspection.
probe_start, probe_end = datetime(2020, 11, 4, 5, 0, 0), datetime(2020, 11, 4, 7, 0, 0)
df5_71 = df5_7.loc[df5_7['udate'].between(probe_start, probe_end)]
for k, v in df5_71.iterrows():
    pass  # placeholder: nothing is done with the rows

In [12]:
# 5.5.1 Training set, test set, validation set
df5_8 = df5_7.copy(deep=True)

# When enabled, only the close and the timestamp are kept as model input.
is_raw_data = False
if is_raw_data:
    drop_list_5 = list(df5_8.columns)
    for kept_column in ('Close', 'udate'):
        drop_list_5.remove(kept_column)
    df5_8.drop(drop_list_5, axis=1, inplace=True)

data4 = separate_daily(df5_8, 'dict')
days3 = np.array(list(data4.keys()))

# Session dates per split: everything but the last 70 dates trains, the next 40 test,
# the final 30 validate. Earlier experiments, kept for reference:
# train_set, test_set, valid_set = days3[0:-36], days3[-36:-11], days3[-11:-1] # for 15s
#train_set, test_set, valid_set = days3[0:-76], days3[-76:-33], days3[-33:-7] # 0921~1113
# train_set, test_set, valid_set = days3[0:-63], days3[-63:-21], days3[-21:-2] # 1012-1113
#train_set, test_set, valid_set = days3[0:-110], days3[-110:-74], days3[-74:-32] # 0608~0921
#train_set, test_set, valid_set = days3[0:-44], days3[-44:-1], days3[-1:-1] # 0921~1113
train_set = days3[0:-70]
test_set = days3[-70:-30]
valid_set = days3[-30:] # 0921~1113

# Empty containers that the windowing cell fills.
x_train, x_test, x_valid = np.array([]), np.array([]), np.array([])
y_train, y_test, y_valid = np.array([]), np.array([]), np.array([])
date_train, date_test, date_valid = [], [], []
len(days3), len(train_set), len(test_set), len(valid_set)

(340, 264, 43, 26)

In [13]:
# 5.5.2 Window length and prediction horizon
t_pus_no = 5
window_size = 50
is_y_label_m_to_n = True  # True: predict t1..t5; False: predict only t5

# Session dates per split, checked in train -> test -> valid order.
split_days = [('train', set(train_set)), ('test', set(test_set)), ('valid', set(valid_set))]
x_parts = {name: [] for name, _ in split_days}
y_parts = {name: [] for name, _ in split_days}
# Rebuilt from scratch on every run so re-executing the cell does not append duplicates.
date_parts = {name: [] for name, _ in split_days}

for day, df6 in data4.items():
    split_name = next((name for name, days in split_days if day in days), None)
    if split_name is None:
        continue  # session belongs to no split
    # Drop the timestamp column on a copy: data4 stays intact, so the cell can be re-run.
    features = df6.drop(columns=['udate'], errors='ignore')
    feature_values = features.to_numpy()
    close_values = features['Close'].to_numpy()
    stamps = features.index
    # 5.5.3 Sliding windows: rows [end-window_size, end) are the input
    no_max = features.shape[0]-t_pus_no
    for end in range(window_size, no_max):
        if is_y_label_m_to_n:
            label_rows = slice(end, end+t_pus_no)      # t1 ~ t5
            date_rows = slice(end-1, end+t_pus_no-1)   # starts at the current time
        else:
            # Single-step label: only t5, stamped with the current time. The previous
            # slices had equal start and stop, so they were always empty.
            label_rows = slice(end+t_pus_no-1, end+t_pus_no)
            date_rows = slice(end-1, end)
        x_parts[split_name].append(feature_values[end-window_size:end])
        y_parts[split_name].append(close_values[label_rows])
        date_parts[split_name].append(stamps[date_rows].tolist())

def _stack_windows(parts):
    """Stack per-window arrays once; an empty split stays an empty array."""
    return np.stack(parts) if parts else np.array([])

# 5.5.4 Assemble each split once instead of re-concatenating after every session
x_train, y_train, date_train = _stack_windows(x_parts['train']), _stack_windows(y_parts['train']), date_parts['train']
x_test, y_test, date_test = _stack_windows(x_parts['test']), _stack_windows(y_parts['test']), date_parts['test']
x_valid, y_valid, date_valid = _stack_windows(x_parts['valid']), _stack_windows(y_parts['valid']), date_parts['valid']

# no_batches, timesteps, no_features
print('訓練集: X_train Data: {}, Y_train Data: {}, Date_Train: {}'.format(x_train.shape, y_train.shape, np.array(date_train).shape))
print('測試集: X_Test Data: {}, Y_Test Data: {}, Date_Test: {}'.format(x_test.shape, y_test.shape, np.array(date_test).shape))
print('验证集: X_Valid Data: {}, Y_Valid Data: {}, Date_Valid: {}'.format(x_valid.shape, y_valid.shape, np.array(date_valid).shape))

訓練集: X_train Data: (338144, 50, 13), Y_train Data: (338144, 5), Date_Train: (338144, 5)
測試集: X_Test Data: (56361, 50, 13), Y_Test Data: (56361, 5), Date_Test: (56361, 5)
验证集: X_Valid Data: (33868, 50, 13), Y_Valid Data: (33868, 5), Date_Valid: (33868, 5)


In [14]:
# 6.1.1 Model hyper-parameters
batch_size = 256
epochs = 48
units = 512
verbose = 1
learning_rate = 0.0005 # 0.001
no_batches = x_train.shape[0]   # number of training samples (name kept for compatibility)
timesteps = x_train.shape[1]
no_features = x_train.shape[2]
batch_input_shape = (timesteps, no_features)

# 6.2 Model: five stacked LSTM layers, each followed by dropout.
# activation=softsign/tanh
# Only the first layer needs the input shape; every layer but the last returns
# the full sequence so the next LSTM receives 3-D input.
dropout_rates = [0.2, 0.1, 0.2, 0.1, 0.2]
model = tf.keras.Sequential()
for depth, rate in enumerate(dropout_rates):
    is_last_lstm = depth == len(dropout_rates)-1
    first_layer_kwargs = {'input_shape': batch_input_shape} if depth == 0 else {}
    model.add(LSTM(units=units, activation='tanh', recurrent_activation='sigmoid',
                   return_sequences=not is_last_lstm, **first_layer_kwargs))
    model.add(Dropout(rate=rate))

# One output per predicted step.
if is_y_label_m_to_n:
    model.add(Dense(units=t_pus_no))
else:
    model.add(Dense(units=1))

opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)
# NOTE(review): 'accuracy' is a classification metric and the loss here is MSE;
# it is kept only because the plotting cell reads history['accuracy'].
model.compile(optimizer=opt, loss='mean_squared_error', metrics=['accuracy'])
model.summary()

# 6.3.1 check point (callback is built but not passed to fit below)
checkpoint_dir = './training_checkpoints/'+ prefix +'-' + cur_time
os.makedirs(checkpoint_dir, exist_ok=True)  # also creates missing parent folders
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

# 6.3.2 tensor board (callback is built but not passed to fit below)
log_dir = './logs/fit/'+ prefix +'-' + cur_time
os.makedirs(log_dir, exist_ok=True)
tensor_board_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# 6.5 Prepare the model path before training, so a missing folder cannot
# make the save fail after the whole fit has run.
model_path = "./saved_model/"+prefix+"-"+cur_time+".h5"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# 6.4 fit model
history_model = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(x_test, y_test), shuffle=True, verbose=verbose) # callbacks=[checkpoint_callback, tensor_board_callback]

# 6.5 save model
model.save(model_path)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 50, 512)           1077248   
_________________________________________________________________
dropout (Dropout)            (None, 50, 512)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 512)           2099200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 512)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 512)           2099200   
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 512)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 512)           2

In [15]:
def _save_history_plot(train_values, test_values, y_label, file_suffix):
    """Plot train vs. test curves per epoch, save the figure, then clear it.

    Returns the epoch range used on the x axis.
    """
    epoch_axis = range(1, len(train_values) + 1)
    plt.plot(epoch_axis, train_values, 'r--')
    plt.plot(epoch_axis, test_values, 'b-')
    plt.legend(['Train', 'Test'])
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.title('model:'+prefix+'   lr:'+str(learning_rate)+'   epochs:'+str(epochs)+'   units:'+str(units)+'   bs:'+str(batch_size), loc='left')
    plt.tight_layout()
    plt.grid()
    plt.savefig('./data/img-nq/results/'+cur_time+file_suffix)
    plt.clf()
    return epoch_axis

# 7.1 visualize loss
keys = list(history_model.history.keys())
training_loss = history_model.history['loss']
test_loss = history_model.history['val_loss']
epoch_count = _save_history_plot(training_loss, test_loss, 'Loss', '-loss')

# 7.2 visualize accuracy
training_accuracy = history_model.history['accuracy']
test_accuracy = history_model.history['val_accuracy']
epoch_count = _save_history_plot(training_accuracy, test_accuracy, 'Accuracy', '-accuracy')

<Figure size 432x288 with 0 Axes>

In [16]:
# 8.0 model evaluate: score the three splits in train, test, validation order
verbose = 1
train_score, test_score, valid_score = [
    model.evaluate(split_x, split_y, verbose=verbose)
    for split_x, split_y in ((x_train, y_train), (x_test, y_test), (x_valid, y_valid))
]

# Element 0 of each score is the loss (MSE); its square root is reported as RMSE.
train_mse, test_mse, valid_mse = train_score[0], test_score[0], valid_score[0]
print('Train Score: %.4f MSE (%.4f RMSE)' % (train_mse, math.sqrt(train_mse)))
print('Test Score: %.4f MSE (%.4f RMSE)' % (test_mse, math.sqrt(test_mse)))
print('Validate Score: %.4f MSE (%.4f RMSE)' % (valid_mse, math.sqrt(valid_mse)))

Train Score: 0.0001 MSE (0.0101 RMSE)
Test Score: 0.0001 MSE (0.0095 RMSE)
Validate Score: 0.0001 MSE (0.0119 RMSE)


In [17]:
# 9.0 Predict on the validation windows
predict1 = model.predict(x_valid)
# One column per predicted step, or a single column in single-step mode.
columns1 = ['t1', 't2', 't3', 't4', 't5'] if is_y_label_m_to_n else ['predict_close']
# The first timestamp stored for each window labels its prediction row.
date_valid2 = [stamps_of_window[0] for stamps_of_window in date_valid]
df7 = (
    pd.DataFrame(predict1, columns=columns1)
    .assign(udate=date_valid2)
    .set_index('udate', drop=False)
)
df7

Unnamed: 0_level_0,t1,t2,t3,t4,t5,udate
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-10-29 17:49:00,0.277749,0.277021,0.277120,0.276765,0.277090,2020-10-29 17:49:00
2020-10-29 17:50:00,0.278296,0.277582,0.277685,0.277335,0.277662,2020-10-29 17:50:00
2020-10-29 17:51:00,0.278376,0.277668,0.277775,0.277431,0.277755,2020-10-29 17:51:00
2020-10-29 17:52:00,0.278467,0.277759,0.277869,0.277528,0.277847,2020-10-29 17:52:00
2020-10-29 17:53:00,0.278176,0.277469,0.277581,0.277244,0.277559,2020-10-29 17:53:00
...,...,...,...,...,...,...
2020-12-16 15:50:00,0.653272,0.653718,0.653667,0.654136,0.654077,2020-12-16 15:50:00
2020-12-16 15:51:00,0.651202,0.651646,0.651599,0.652069,0.652007,2020-12-16 15:51:00
2020-12-16 15:52:00,0.650583,0.651020,0.650972,0.651442,0.651377,2020-12-16 15:52:00
2020-12-16 15:53:00,0.654655,0.655080,0.655016,0.655487,0.655421,2020-12-16 15:53:00


In [18]:
# 10.0 Undo the min-max scaling of the predictions
# Number of non-target feature columns: every column except 'Close' and 'udate'.
# NOTE(review): this assumes 'Close' was the first column the scalers were fitted on.
len_shape_y = df5_7.shape[1]-2
restored_parts = []

# 10.1 Inverse transform, one scaler block at a time
for k, v in scalers.items():
    mask = ((df7['udate'] >= v['day_start']) & (df7['udate'] <= v['day_end']))
    df8_1 = df7.loc[mask]
    if df8_1.shape[0] == 0:
        continue  # no prediction falls inside this scaler's date range
    # 10.2 Put every predicted value into column 0 of a zero-padded feature matrix and
    # invert them with a single call instead of one inverse_transform per row.
    # Fresh local names are used so the notebook variables data4 and data4_1 are not overwritten.
    scaled_values = df8_1.drop(['udate'], axis=1).to_numpy()
    padded = np.zeros((scaled_values.size, len_shape_y+1))
    padded[:, 0] = scaled_values.ravel()
    restored = v['scaler'].inverse_transform(padded)[:, 0].reshape(scaled_values.shape)
    # 10.3 Back to a frame indexed by the prediction timestamp
    df8_3 = pd.DataFrame(restored, columns=columns1)
    df8_3.index = df8_1['udate']
    restored_parts.append(df8_3)

# 10.4 Merge all blocks at once; row order follows the scalers' iteration order.
df8 = pd.concat(restored_parts, axis=0, join='outer', ignore_index=False) if restored_parts else pd.DataFrame(columns=columns1)

df8['udate'] = df8.index.values
df8 = df8[['udate'] + columns1]
# df8[df8.index.duplicated()]
df8

Unnamed: 0_level_0,udate,t1,t2,t3,t4,t5
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-12-10 17:49:00,2020-12-10 17:49:00,12402.083050,12401.519215,12401.604135,12401.338787,12401.555592
2020-12-10 17:50:00,2020-12-10 17:50:00,12398.951848,12398.404430,12398.502893,12398.239387,12398.463055
2020-12-10 17:51:00,2020-12-10 17:51:00,12400.781916,12400.209944,12400.295694,12400.032876,12400.244742
2020-12-10 17:52:00,2020-12-10 17:52:00,12398.196208,12397.639621,12397.737436,12397.474699,12397.695654
2020-12-10 17:53:00,2020-12-10 17:53:00,12399.216973,12398.643968,12398.733443,12398.470160,12398.684090
...,...,...,...,...,...,...
2020-11-04 15:50:00,2020-11-04 15:50:00,11835.715531,11836.686279,11836.341518,11837.033913,11837.026038
2020-11-04 15:51:00,2020-11-04 15:51:00,11835.104284,11836.080193,11835.737774,11836.426817,11836.418356
2020-11-04 15:52:00,2020-11-04 15:52:00,11835.760229,11836.734169,11836.386269,11837.078132,11837.076376
2020-11-04 15:53:00,2020-11-04 15:53:00,11829.155550,11830.144975,11829.819424,11830.497664,11830.478614


In [19]:
# 11.1 Keep the first six columns of df3 and attach the de-normalised
# predictions (df8) side by side, aligned on the index.
df9 = df3.drop(list(df3.columns)[6:], axis=1)
df9_1 = pd.concat([df9, df8], axis=1)

# 11.2 Select the rows to export.
# end2 is the second-to-last validation timestamp (the same element the
# original `date_valid2[-2:-1][0]` picked).
# NOTE(review): confirm that leaving out the last timestamp is intended.
start2, end2 = date_valid2[0], date_valid2[-2]
start3 = start2 - timedelta(minutes=t_pus_no + window_size)
# Column used to decide whether a row carries a prediction.
k9 = 't1' if is_y_label_m_to_n else 'predict_close'
mask3 = ((df9_1.index >= start3) & (df9_1.index <= start2))  # window step
mask4 = ((df9_1.index >= start2) & (df9_1.index <= end2))  # prediction
df9_2 = df9_1.loc[(mask3 | mask4)]
df9_3 = df9_2.drop(['udate'], axis=1)

# File-name fragments shared by every export below (and by later cells).
path_name = '-raw' if is_raw_data else ''
if is_y_label_m_to_n:
    path_name_2 = 't' + str(t_pus_no)
else:
    path_name_2 = str(t_pus_no) + 'mins'
file_suffix = '-e' + str(epochs) + '-u' + str(units) + '-w' + str(window_size) + '-lr' + str(learning_rate) + path_name + '.csv'

# 11.3.1 save1: whole validation range, written into a per-run directory.
mask5 = ((df9_3.index >= start3) & (df9_3.index <= start2))  # window step
mask6 = ((df9_3.index >= start2) & (df9_3.index <= end2) & (df9_3[k9] > 0))  # prediction
df9_4 = df9_3.loc[(mask5 | mask6)]

path_4 = os.path.abspath(os.path.join('data', 'nq', 'prediction', cur_time))
# makedirs also creates missing parent directories and tolerates an existing one.
os.makedirs(path_4, exist_ok=True)

start9, end9 = pd.to_datetime(df9_4.index.values[0]), pd.to_datetime(df9_4.index.values[-1])
path_1_date = str(start9.month).zfill(2) + str(start9.day).zfill(2) + '-' + str(end9.month).zfill(2) + str(end9.day).zfill(2)
path_1 = os.path.join(path_4, path_name_2 + '-' + path_1_date + file_suffix)
df9_4.to_csv(path_1)  # to_csv overwrites an existing file

# 11.3.2 save3: same data under a fixed file name.
path_3 = os.path.abspath(os.path.join('data', 'nq', 'prediction', 'nq-prediction.csv'))
df9_4.to_csv(path_3)

# 11.4 save2: one file per window listed below.
# The rows up to 10-30 run from a Monday 17:00 to the following Friday 16:00.
# Fixed: the 09-07 and 09-14 windows previously had start month 8, which made
# each of them span about five weeks and overlap the neighbouring windows.
# NOTE(review): the windows from 11-30 onward do not follow the Monday->Friday
# pattern (e.g. 11-30 -> 12-01, 12-04 -> 12-08); they are left unchanged
# because the intended dates cannot be determined from this cell.
days9 = [[datetime(2020, 7, 6, 17, 0, 0), datetime(2020, 7, 10, 16, 0, 0)],
         [datetime(2020, 7, 13, 17, 0, 0), datetime(2020, 7, 17, 16, 0, 0)],
         [datetime(2020, 7, 20, 17, 0, 0), datetime(2020, 7, 24, 16, 0, 0)],
         [datetime(2020, 7, 27, 17, 0, 0), datetime(2020, 7, 31, 16, 0, 0)],
         #
         [datetime(2020, 8, 31, 17, 0, 0), datetime(2020, 9, 4, 16, 0, 0)],
         [datetime(2020, 9, 7, 17, 0, 0), datetime(2020, 9, 11, 16, 0, 0)],
         [datetime(2020, 9, 14, 17, 0, 0), datetime(2020, 9, 18, 16, 0, 0)],
         [datetime(2020, 9, 21, 17, 0, 0), datetime(2020, 9, 25, 16, 0, 0)],
         [datetime(2020, 9, 28, 17, 0, 0), datetime(2020, 10, 2, 16, 0, 0)],
         #
         [datetime(2020, 10, 5, 17, 0, 0), datetime(2020, 10, 9, 16, 0, 0)],
         [datetime(2020, 10, 12, 17, 0, 0), datetime(2020, 10, 16, 16, 0, 0)],
         [datetime(2020, 10, 19, 17, 0, 0), datetime(2020, 10, 23, 16, 0, 0)],
         [datetime(2020, 10, 26, 17, 0, 0), datetime(2020, 10, 30, 16, 0, 0)],
         #
         [datetime(2020, 11, 2, 17, 0, 0), datetime(2020, 11, 6, 16, 0, 0)],
         [datetime(2020, 11, 9, 17, 0, 0), datetime(2020, 11, 13, 16, 0, 0)],
         [datetime(2020, 11, 16, 17, 0, 0), datetime(2020, 11, 20, 16, 0, 0)],
         [datetime(2020, 11, 23, 17, 0, 0), datetime(2020, 11, 27, 16, 0, 0)],
         #
         [datetime(2020, 11, 30, 17, 0, 0), datetime(2020, 12, 1, 16, 0, 0)],
         [datetime(2020, 12, 4, 17, 0, 0), datetime(2020, 12, 8, 16, 0, 0)],
         [datetime(2020, 12, 11, 17, 0, 0), datetime(2020, 12, 15, 16, 0, 0)],
         [datetime(2020, 12, 18, 17, 0, 0), datetime(2020, 12, 22, 16, 0, 0)],
         [datetime(2020, 12, 28, 17, 0, 0), datetime(2021, 1, 1, 16, 0, 0)]]
# The source frame is only read inside the loop, so one copy is enough.
df9_5 = df9_3.copy(deep=True)
for v9 in days9:
    start4 = v9[0] + timedelta(minutes=t_pus_no + window_size)  # window step
    mask7 = ((df9_5.index >= v9[0]) & (df9_5.index <= start4))  # window step
    mask8 = ((df9_5.index >= start4) & (df9_5.index <= v9[1]) & (df9_5[k9] > 0))  # prediction
    df9_6 = df9_5.loc[mask7 | mask8]
    file_name_1 = path_name_2 + '-' + str(v9[0].month).zfill(2) + str(v9[0].day).zfill(2) + '-' + str(v9[1].month).zfill(2) + str(v9[1].day).zfill(2) + file_suffix
    path_2 = os.path.join(path_4, file_name_1)
    # Drop a stale file first so a window with too few rows leaves nothing behind.
    if os.path.exists(path_2):
        os.remove(path_2)
    if df9_6.shape[0] > 1:
        df9_6.to_csv(path_2)
        print(file_name_1)

df9_4

t5-1026-1030-e48-u512-w50-lr0.0005.csv
t5-1102-1106-e48-u512-w50-lr0.0005.csv
t5-1109-1113-e48-u512-w50-lr0.0005.csv
t5-1116-1120-e48-u512-w50-lr0.0005.csv
t5-1123-1127-e48-u512-w50-lr0.0005.csv
t5-1130-1201-e48-u512-w50-lr0.0005.csv
t5-1204-1208-e48-u512-w50-lr0.0005.csv
t5-1211-1215-e48-u512-w50-lr0.0005.csv


Unnamed: 0_level_0,High,Low,Open,Close,Volume,t1,t2,t3,t4,t5
udate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2020-10-29 17:00:00,11213.25,11229.75,11205.75,11222.75,699,,,,,
2020-10-29 17:01:00,11222.75,11226.75,11215.75,11221.50,315,,,,,
2020-10-29 17:02:00,11221.50,11223.25,11207.00,11213.50,477,,,,,
2020-10-29 17:03:00,11213.50,11217.50,11208.50,11215.75,217,,,,,
2020-10-29 17:04:00,11215.75,11219.25,11212.00,11218.00,111,,,,,
...,...,...,...,...,...,...,...,...,...,...
2020-12-16 15:49:00,12678.00,12679.00,12677.50,12678.00,23,12680.834397,12681.134685,12681.095738,12681.414852,12681.374082
2020-12-16 15:50:00,12678.00,12680.25,12677.00,12677.00,98,12679.735190,12680.038191,12680.003291,12680.322082,12680.281595
2020-12-16 15:51:00,12677.00,12677.00,12675.00,12676.00,42,12678.328853,12678.630558,12678.598897,12678.918012,12678.876068
2020-12-16 15:52:00,12676.00,12676.25,12675.25,12675.75,30,12677.908320,12678.205207,12678.172656,12678.492013,12678.448045


In [20]:
# 12.1 Evaluate the predictions against the real close price.
if is_y_label_m_to_n:
    predict_k12 = 't1'
else:
    predict_k12 = 'predict_close'

# Keep only rows where both the prediction and the real close are positive
# (rows without a prediction are NaN and fail the comparison), rounded to 2 dp.
df12 = df9_3.copy(deep=True)
mask12 = ((df12[predict_k12] > 0) & (df12['Close'] > 0))
df12 = df12.loc[mask12].round(2)

close_values = df12['Close'].values
predict_values = df12[predict_k12].values

# 12.2 Descriptive statistics (nobs, minmax, mean, variance, skewness, kurtosis).
stats_1 = stats.describe(close_values)
stats_2 = stats.describe(predict_values)
# R squared, mean squared error, root mean squared error, mean absolute error,
# mean absolute percentage error.
r2 = r2_score(close_values, predict_values)
mse = mean_squared_error(close_values, predict_values)
rmse = np.sqrt(mse)  # reuse mse instead of computing it a second time
mae = mean_absolute_error(close_values, predict_values)
ampe = np.mean(np.abs((df12['Close'] - df12[predict_k12]) / df12['Close']))
# The 'ampe' header is kept as-is so files written by earlier runs stay comparable.
columns2 = ['sd', 'r2', 'mse', 'rmse', 'mae', 'ampe']
result1 = [r2, mse, rmse, mae, ampe]
# Standard deviation. np.std defaults to ddof=0 while stats.describe reports the
# variance with ddof=1, so 'sd' is not exactly sqrt('variance').
std_1, std_2 = [np.std(close_values)], [np.std(predict_values)]
# Minimum, quartiles, maximum.
columns3 = ['min', 'q25%', 'q50%', 'q75%', 'max']
quantile_points = [.0, .25, .5, .75, 1]
result2 = df12['Close'].quantile(quantile_points).values.tolist()
result3 = df12[predict_k12].quantile(quantile_points).values.tolist()

# 12.3 Merge: one row for the real series, one for the predicted series.
# The error metrics in result1 describe the pair, so both rows carry them.
df12_1 = pd.DataFrame(
    [list(stats_1) + std_1 + result1 + result2,
     list(stats_2) + std_2 + result1 + result3],
    columns=list(stats_1._fields) + columns2 + columns3,
    index=['real', 'predict'],
)
df12_1 = df12_1.drop(['minmax'], axis=1)

# 12.4 save
dir_13 = os.path.abspath(os.path.join('data', 'nq', 'result-describe'))
# Create the output directory on first use so to_csv does not fail.
os.makedirs(dir_13, exist_ok=True)
path_13 = os.path.join(dir_13, path_name_2 + '-' + path_1_date + '-e' + str(epochs) + '-u' + str(units) + '-w' + str(window_size) + '-lr' + str(learning_rate) + path_name + '-' + cur_time + '.csv')
df12_1.to_csv(path_13)
df12_1

Unnamed: 0,nobs,mean,variance,skewness,kurtosis,sd,r2,mse,rmse,mae,ampe,min,q25%,q50%,q75%,max
real,33867,12078.525357,162627.502777,-0.685464,0.053297,403.26505,0.999955,7.363504,2.713578,2.334208,0.000193,10948.75,11871.875,12038.25,12437.5,12702.25
predict,33867,12080.646143,162813.379795,-0.687414,0.053756,403.495443,0.999955,7.363504,2.713578,2.334208,0.000193,10963.58,11874.57,12040.66,12440.39,12705.96
