In [101]:
import tensorflow as tf
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import mplfinance as mpf
import talib as talib
import numpy as np
import data as ds
import common as common
import os as os
import datetime as datetime

from mplfinance.original_flavor import candlestick_ohlc
from datetime import datetime, timedelta
from tensorflow.keras.layers import Dropout

In [102]:
# 环境设定
pd.options.mode.chained_assignment = None
# 不让程序占满 GPU 内存
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [103]:
# 数据源 v1
file_1 = os.path.abspath(os.path.join('data', 'nq', '20201020_064100.csv'))
file_2 = os.path.abspath(os.path.join('data', 'nq', '20201023_064200.csv'))
df1 = pd.read_csv(file_1)
df2 = pd.read_csv(file_2)

# 数据源 v2
file_3 = os.path.abspath(os.path.join('data', 'nq', 'nq-20201029.csv'))
_df1 = pd.read_csv(file_3)

In [104]:
# 1.0 reshapre dataframe v1 重構數據集
df1.index = pd.to_datetime(df1.stime)
df2.index = pd.to_datetime(df2.stime)
df3 = pd.concat([df1, df2], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
df3 = df3.rename(columns={"high": "High", "low": "Low", "open": "Open", "last": "Close", "vol": "Accumulated Volume"})
df3['udate'] = pd.to_datetime(df3['udate'])
df3['interactive_udate'] = pd.to_datetime(df3['interactive_udate'])
df3['mdate'] = pd.to_datetime(df3['mdate'])
df3['stime'] = pd.to_datetime(df3['stime'])
shape = df3.shape
types = df3.dtypes
types1 = {'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Accumulated Volume': 'int64', 'chng': 'float64', 'pchng': 'float64'}
df3.astype(types1).dtypes

# 1.0 reshapre dataframe v2 重構數據集
_df1.columns = ['udate', 'High', 'Low', 'Open', 'Close', 'Volume']
types2 = {'udate': 'object', 'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Volume': 'int64'}
_df1.astype(types2).dtypes
_df2 = _df1.copy(deep=True)
error_row = []
for k, v in _df1.iterrows():
    if not pd.isnull(_df1['udate'].iloc[k]):
        stime = str(int(_df1['udate'].iloc[k]))
        _df2['udate'].iloc[k] = datetime(year=2020, month=int(stime[-8:-6]), day=int(stime[-6:-4]), hour=int(stime[-4:-2]), minute=int(stime[-2:]), second=0)
    else:
        error_row.append(k)
_df2.drop(_df2.index[error_row], inplace=True)
_df2.udate = pd.to_datetime(_df2.udate)
_df2.index = pd.to_datetime(_df2.udate)
df3 = _df2.copy(deep=True)

# 1.2 zero / nan 數據有效性檢查
for k, v in types2.items():
    if (df3[k].isin([np.nan]).any().any()):
        print(k+' obtains nan')
    if (df3[k].isin([0]).any().any()):
        print(k+' obtains 0')
# 1.3 overview
is_contain_nan = df3.isnull().sum()

Volume obtains 0


In [105]:
# 2.0 technical indicator
highs = np.array(df3['High'], dtype='float')
lows = np.array(df3['Low'], dtype='float')
opens = np.array(df3['Open'], dtype='float')
closes = np.array(df3['Close'], dtype='float')
# 2.1 SMA 均線
for v in [5, 10, 20, 50, 80, 100, 120, 150, 180, 200]:
    df3['sma-'+str(v)]= talib.SMA(closes, timeperiod=v)
# 2.2 Bollinger 保力加
df3['upper-band'], df3['middle-band'], df3['lower-band'] = talib.BBANDS(closes, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
# 2.3 %B %保力加
df3['%b'] = (df3['Close']-df3['lower-band'])/(df3['upper-band']-df3['lower-band'])*100
# 2.4 P-SAR 抛物线
df3['p-sar'] = talib.SAR(highs, lows, acceleration=0.02, maximum=0.2)

In [106]:
# 3.0 separate to daily data set
data = {}
days1 = list(dict.fromkeys([v.date() for v in df3['udate']])) # how many tradeing day
for day in days1:
    day_start = datetime(day.year, day.month, day.day, 6, 0, 0)
    day2 = day + timedelta(days=1)
    day_end = datetime(day2.year, day2.month, day2.day, 6, 0, 0)
    mask = ((df3['udate'] >= day_start) & (df3['udate'] <= day_end))
    df4 = df3.loc[mask]
    if (df4.shape[0] > 1):
        # 3.1 calculate vol by minute
        # df4['Volume'] = df4['Accumulated Volume'].diff()
        # 3.2 OBV 能量潮
        closes = np.array(df4['Close'], dtype='float')
        vols = np.array(df4['Volume'], dtype='float')
        df4['obv'] = talib.OBV(closes, vols)
        # 3.3 vol EMA
        df4['vol-sma5'] = talib.EMA(vols, timeperiod=5)
        # 3.4 %b
        df4['%b-high']  = common.percentB_belowzero(df4['%b'], df4['Close']) 
        df4['%b-low'] = common.percentB_aboveone(df4['%b'], df4['Close'])
        # 3.5 assgin to data
        data[day] = df4

In [113]:
# 4.0 draw chart
for k, df5 in data.items():
    # 4.1 style
    style = mpf.make_mpf_style(base_mpf_style='charles', rc={'font.size':6})
    # 4.2 addplot
    apds = [mpf.make_addplot(df5[['lower-band','upper-band']],panel=0,color='orange',linestyle='dashdot'),
            mpf.make_addplot(df5['%b-low'],type='scatter',markersize=20,marker='v'),
            mpf.make_addplot(df5['%b-high'],type='scatter',markersize=20,marker='^'),
            mpf.make_addplot(df5['p-sar'],scatter=True,markersize=1,marker='*',panel=0,color='blueviolet'),
            mpf.make_addplot((df5['vol-sma5']),panel=1,color='orange')]
    # 4.3 render
    if (True):
        print(df5.shape, df5['udate'].iloc[0], df5['udate'].iloc[-1])
        mpf.plot(df5, type='candle', addplot=apds, style=style, ylabel='', ylabel_lower='', volume=True, figscale=1, xrotation=0, datetime_format="%H:%M", show_nontrading=False, tight_layout=True, savefig='./data/img-nq/'+k.strftime('%m-%d-%Y')) 

(780, 25) 2020-09-30 17:01:00 2020-10-01 06:00:00
(1426, 25) 2020-10-01 06:00:00 2020-10-02 06:00:00
(585, 25) 2020-10-02 06:00:00 2020-10-02 15:59:00
(781, 25) 2020-10-04 17:00:00 2020-10-05 06:00:00
(1426, 25) 2020-10-05 06:00:00 2020-10-06 06:00:00
(1426, 25) 2020-10-06 06:00:00 2020-10-07 06:00:00
(1426, 25) 2020-10-07 06:00:00 2020-10-08 06:00:00
(1426, 25) 2020-10-08 06:00:00 2020-10-09 06:00:00
(585, 25) 2020-10-09 06:00:00 2020-10-09 15:59:00
(781, 25) 2020-10-11 17:00:00 2020-10-12 06:00:00
(1426, 25) 2020-10-12 06:00:00 2020-10-13 06:00:00
(1426, 25) 2020-10-13 06:00:00 2020-10-14 06:00:00
(1426, 25) 2020-10-14 06:00:00 2020-10-15 06:00:00
(1426, 25) 2020-10-15 06:00:00 2020-10-16 06:00:00
(585, 25) 2020-10-16 06:00:00 2020-10-16 15:59:00
(781, 25) 2020-10-18 17:00:00 2020-10-19 06:00:00
(1426, 25) 2020-10-19 06:00:00 2020-10-20 06:00:00
(1426, 25) 2020-10-20 06:00:00 2020-10-21 06:00:00
(1426, 25) 2020-10-21 06:00:00 2020-10-22 06:00:00
(1426, 25) 2020-10-22 06:00:00 2020-10

In [116]:
# 5.0 合併
df7 = pd.DataFrame()
for k, df6 in data.items():
    df7 = pd.concat([df6, df7], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
# 5.1 清洗
df8 = df7.copy(deep=True)
"""drop_list = ['udate', 'code', 'name', 'nmll', 'bid', 'ask', 'bsize', 'asize', 'clast', 'Accumulated Volume', 'turnover', 
             'currency',  'yrhigh', 'yrlow', 'tnover_sc', 'lotsize', 'calllv', 'issued', 'ichng', 'ucode', 'ratio', 'strike', 'mdate', 
             'issuer', 'traded', 'ocode', 'eikon_udate', 'interactive_udate', 'instrument_type', 'stime', 'udate2']"""
drop_list = ['udate']
df8.drop(drop_list, axis=1, inplace=True)
df8.fillna(0, inplace=True)
is_contain_null = df8.isnull().sum()
is_contain_nan = df8.isna().sum()
print('Total dataset has {} samples, and {} features.'.format(df8.shape[0], df8.shape[1])) # df8.info()
# 5.2 分包
train_data = df8.iloc[0 : 12744]
test_data = df8.iloc[12745 : 13268]
print('Train Data: {}, Test Data: {}'.format(train_data.shape, test_data.shape))
# 5.3 模型数据
# 5.3.1 train data
x_train, y_train = [], []
for k, v in train_data.iterrows():
    x_train.append(v.tolist())
    y_train.append(v['Close'])
x_train, y_train =  np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0],  x_train.shape[1],  1))
print('X_train Data: {}, Y_train Data: {}'.format(x_train.shape, y_train.shape))
# 5.3.2 test data
x_test, y_test = [], []
for k, v in test_data.iterrows():
    x_test.append(v.tolist())
    y_test.append(v['Close'])
x_test, y_test =  np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0],  x_test.shape[1],  1))
print('X_Test Data: {}, Y_Test Data: {}'.format(x_test.shape, y_test.shape))

Total dataset has 28600 samples, and 24 features.
Train Data: (12744, 24), Test Data: (523, 24)
X_train Data: (12744, 24, 1), Y_train Data: (12744,)
X_Test Data: (523, 24, 1), Y_Test Data: (523,)


In [118]:
# 6.0 模型参数
batch_size = 32
epochs = 10 # 80
units = 1000 # 1000
cur_time = datetime.now().strftime("%Y%m%d-%H%M%S")

# 6.1 模型
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.LSTM(units=units, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=units, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=units, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=units, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.Dense(units=1))
model_lstm.add(Dropout(0.2))
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.summary()

# 6.2.1 創建check point
checkpoint_dir = './training_checkpoints/nq-lstm-' + cur_time
os.mkdir(checkpoint_dir)
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_prefix, save_weights_only=True)

# 6.2.2 創建tensor board
log_dir = os.path.join('./logs/fit/nq-lstm-') + cur_time
os.mkdir(log_dir)
tensor_board_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

# 6.3 fit model
history = model_lstm.fit(x_train, x_train, epochs = epochs, batch_size = batch_size, 
                         callbacks=[checkpoint_callback, tensor_board_callback])

# 6.4 儲存模型
model_path = "./saved_model/nq_lstm_"+cur_time+".h5"
model_lstm.save(model_path)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_20 (LSTM)               (None, 24, 1000)          4008000   
_________________________________________________________________
dropout_25 (Dropout)         (None, 24, 1000)          0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 24, 1000)          8004000   
_________________________________________________________________
dropout_26 (Dropout)         (None, 24, 1000)          0         
_________________________________________________________________
lstm_22 (LSTM)               (None, 24, 1000)          8004000   
_________________________________________________________________
dropout_27 (Dropout)         (None, 24, 1000)          0         
_________________________________________________________________
lstm_23 (LSTM)               (None, 24, 1000)         