In [10]:
import tensorflow as tf
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as plt
import mplfinance as mpf
import talib as talib
import numpy as np
import data as ds
import common as common
import os as os

from mplfinance.original_flavor import candlestick_ohlc
from datetime import datetime, timedelta
from tensorflow.keras.layers import Dropout

In [11]:
pd.options.mode.chained_assignment = None
# 不让程序占满 GPU 内存
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

In [12]:
file_1 = os.path.abspath(os.path.join('data', 'nq', '20201020_064100.csv'))
file_2 = os.path.abspath(os.path.join('data', 'nq', '20201023_064200.csv'))
df1 = pd.read_csv(file_1)
df2 = pd.read_csv(file_2)

In [13]:
# 1.0 reshapre dataframe 重構數據集
df1.index = pd.to_datetime(df1.stime)
df2.index = pd.to_datetime(df2.stime)
df3 = pd.concat([df1, df2], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
df3 = df3.rename(columns={"high": "High", "low": "Low", "open": "Open", "last": "Close", "vol": "Accumulated Volume"})
df3['udate'] = pd.to_datetime(df3['udate'])
df3['udate2'] = [v.date() for v in df3['udate'].tolist()]
df3['udate2'] = pd.to_datetime(df3['udate2'])
df3['interactive_udate'] = pd.to_datetime(df3['interactive_udate'])
df3['mdate'] = pd.to_datetime(df3['mdate'])
df3['stime'] = pd.to_datetime(df3['stime'])
# 1.1 data validation 數據有效性檢查
shape = df3.shape
types = df3.dtypes
types1 = {'High': 'float64', 'Low': 'float64', 'Open': 'float64', 'Close': 'float64', 'Accumulated Volume': 'int64', 'chng': 'float64', 'pchng': 'float64'}
df3.astype(types1).dtypes
# 1.2 zero / nan
for k, v in types1.items():
    if (df3[k].isin([np.nan]).any().any()):
        print(k+' obtains nan')
    if (df3[k].isin([0]).any().any()):
        print(k+' obtains 0')
# 1.3 overview
is_contain_nan = df3.isnull().sum()

chng obtains 0
pchng obtains 0


In [14]:
# 2.0 technical indicator
highs = np.array(df3['High'], dtype='float')
lows = np.array(df3['Low'], dtype='float')
opens = np.array(df3['Open'], dtype='float')
closes = np.array(df3['Close'], dtype='float')
# 2.1 SMA 均線
for v in [5, 10, 20, 50, 80, 100, 120, 150, 180, 200]:
    df3['sma-'+str(v)]= talib.SMA(closes, timeperiod=v)
# 2.2 Bollinger 保力加
df3['upper-band'], df3['middle-band'], df3['lower-band'] = talib.BBANDS(closes, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0)
# 2.3 %B %保力加
df3['%b'] = (df3['Close']-df3['lower-band'])/(df3['upper-band']-df3['lower-band'])*100
# 2.4 P-SAR 抛物线
df3['p-sar'] = talib.SAR(highs, lows, acceleration=0.02, maximum=0.2)

In [15]:
# 3.0 separate to daily data set
data = {}
days1 = list(dict.fromkeys(df3['udate2'].tolist())) # how many tradeing day
for day in days1:
    day_start = datetime(day.year, day.month, day.day, 6, 0, 0)
    day2 = day + timedelta(days=1)
    day_end = datetime(day2.year, day2.month, day2.day, 6, 0, 0)
    mask = ((df3['udate'] >= day_start) & (df3['udate'] <= day_end))
    df4 = df3.loc[mask]
    if (df4.shape[0] > 1):
        # 3.1 calculate vol by minute
        df4['Volume'] = df4['Accumulated Volume'].diff()
        # 3.2 OBV 能量潮
        closes = np.array(df4['Close'], dtype='float')
        vols = np.array(df4['Volume'], dtype='float')
        df4['obv'] = talib.OBV(closes, vols)
        # 3.3 vol EMA
        df4['vol-sma5'] = talib.EMA(vols, timeperiod=5)
        # 3.4 %b
        df4['%b-high']  = common.percentB_belowzero(df4['%b'], df4['Close']) 
        df4['%b-low'] = common.percentB_aboveone(df4['%b'], df4['Close'])
        # 3.5 assgin to data
        data[day] = df4

In [16]:
# 4.0 draw chart
for k, df5 in data.items():
    # 4.1 style
    style = mpf.make_mpf_style(base_mpf_style='charles', rc={'font.size':6})
    # 4.2 addplot
    apds = [mpf.make_addplot(df5[['lower-band','upper-band']],panel=0,color='orange',linestyle='dashdot'),
            mpf.make_addplot(df5['%b-low'],type='scatter',markersize=20,marker='v'),
            mpf.make_addplot(df5['%b-high'],type='scatter',markersize=20,marker='^'),
            mpf.make_addplot(df5['p-sar'],scatter=True,markersize=1,marker='*',panel=0,color='blueviolet'),
            mpf.make_addplot((df5['vol-sma5']),panel=1,color='orange')]
    # 4.3 render
    if (False):
        print(df5.shape, df5['udate'].iloc[0], df5['udate'].iloc[-1])
        mpf.plot(df5, type='line', style=style, ylabel='', ylabel_lower='', volume=True, figscale=0.5, xrotation=0, datetime_format="%H:%M", show_nontrading=False, tight_layout=True, addplot=apds) # savefig='./data/img-nq/'+k.strftime('%m-%d-%Y') 

In [17]:
# 5.0 合併
df7 = pd.DataFrame()
for k, df6 in data.items():
    df7 = pd.concat([df6, df7], axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)
# 5.1 清洗
df8 = df7.copy(deep=True)
drop_list = ['udate', 'code', 'name', 'nmll', 'bid', 'ask', 'bsize', 'asize', 'clast', 'Accumulated Volume', 'turnover', 
             'currency',  'yrhigh', 'yrlow', 'tnover_sc', 'lotsize', 'calllv', 'issued', 'ichng', 'ucode', 'ratio', 'strike', 'mdate', 
             'issuer', 'traded', 'ocode', 'eikon_udate', 'interactive_udate', 'instrument_type', 'stime', 'udate2']
df8.drop(drop_list, axis=1, inplace=True)
df8.fillna(0, inplace=True)
is_contain_null = df8.isnull().sum()
is_contain_nan = df8.isna().sum()
print('Total dataset has {} samples, and {} features.'.format(df8.shape[0], df8.shape[1])) # df8.info()
# 5.2 分包
train_data = df8.iloc[0: 12744]
test_data = df8.iloc[12745: 13268]
print('Train Data: {}, Test Data: {}'.format(train_data.shape, test_data.shape))
# 5.3 模型数据
# 5.3.1 train data
x_train, y_train = [], []
for k, v in train_data.iterrows():
    x_train.append(v.tolist())
    y_train.append(v['Close'])
x_train, y_train =  np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0],  x_train.shape[1],  1))
print('X_train Data: {}, Y_train Data: {}'.format(x_train.shape, y_train.shape))
# 5.3.2 test data
x_test, y_test = [], []
for k, v in test_data.iterrows():
    x_test.append(v.tolist())
    y_test.append(v['Close'])
x_test, y_test =  np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0],  x_test.shape[1],  1))
print('X_Test Data: {}, Y_Test Data: {}'.format(x_test.shape, y_test.shape))

Total dataset has 13268 samples, and 26 features.
Train Data: (12744, 26), Test Data: (523, 26)
X_train Data: (12744, 26, 1), Y_train Data: (12744,)
X_Test Data: (523, 26, 1), Y_Test Data: (523,)


In [18]:
# 6.0
batch_size = None
num_steps = None
hidden_size = None
# 6.1 模型
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.LSTM(units=1000, return_sequences=True, input_shape=(x_train.shape[1], x_train.shape[2])))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=1000, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=1000, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.LSTM(units=1000, return_sequences=True))
model_lstm.add(Dropout(0.2))
model_lstm.add(tf.keras.layers.Dense(units=1))
model_lstm.add(Dropout(0.2))
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.summary()
# 6.1 fit model
model_lstm.fit(x_train, x_train, epochs = 10, batch_size = 32)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 26, 1000)          4008000   
_________________________________________________________________
dropout_5 (Dropout)          (None, 26, 1000)          0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 26, 1000)          8004000   
_________________________________________________________________
dropout_6 (Dropout)          (None, 26, 1000)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 26, 1000)          8004000   
_________________________________________________________________
dropout_7 (Dropout)          (None, 26, 1000)          0         
_________________________________________________________________
lstm_7 (LSTM)                (None, 26, 1000)         

<tensorflow.python.keras.callbacks.History at 0x7ff7ad672990>