# Baseline Model Training and Evaluation

In [None]:
import numpy as np
import pandas as pd
import os, datetime
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
print('Tensorflow version: {}'.format(tf.__version__))

import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer ship the old names. Prefer the renamed style
# when the installed Matplotlib provides it, otherwise use the legacy name.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

import warnings
warnings.filterwarnings('ignore')


## Load processed data

In [None]:
# ----- Locations used throughout the notebook -----
# Directory the processed feature CSVs are read from.
processed_data_path = 'features/'
# Directory and file name used for the saved model.
model_path = 'models/'
model_name = 'baseline_model.hdf5'

# Create the model directory on first use and report that it was created.
if not os.path.isdir(model_path):
    os.makedirs(model_path)
    print("made folder:", model_path)

In [None]:
# Read the processed book and trade CSVs from `processed_data_path`.
df_book, df_trade = (
    pd.read_csv(processed_data_path + file_name)
    for file_name in ('book.csv', 'trade.csv')
)

In [None]:
# Display the loaded book table (notebook cell output).
df_book

In [None]:
# List the column names available in the book table.
df_book.columns.values

## ===== Feature Setting ===== 

## ----- Price Features -----

In [None]:
# Column of the book table the model is trained to predict.
target = 'nextMidpt'

# ----- Candidate price feature groups (column names of the book table) -----
bid_price_features = ['Bid1', 'Bid2', 'Bid3', 'Bid4', 'Bid5']
ask_price_features = ['Ask1', 'Ask2', 'Ask3', 'Ask4', 'Ask5']
price_stats_features = ['MicroPrice', 'Bid_Mean', 'Ask_Mean']
speard_features = ['Spread1', 'Spread2', 'Spread3', 'Spread4', 'Spread5', 'SpreadMean']
# speard_features = ['Spread1']

# ----- Switches: set a flag to True to include the matching group -----
bid_price = True     # use bid_price_features
ask_price = True     # use ask_price_features
price_stats = False  # use price_stats_features
speard = False       # use speard_features
midpt = True         # use the 'midpt' column

# Concatenate the enabled groups in a fixed order: bid, ask, stats, spread.
price_features = [
    column
    for enabled, group in (
        (bid_price, bid_price_features),
        (ask_price, ask_price_features),
        (price_stats, price_stats_features),
        (speard, speard_features),
    )
    if enabled
    for column in group
]
if midpt:
    price_features.append('midpt')

    

## ----- Size Features -----

In [None]:
    
# ----- Candidate size feature groups (column names of the book table) -----
bid_size_features = ['Bid1SizeProp', 'Bid2SizeProp', 'Bid3SizeProp', 'Bid4SizeProp', 'Bid5SizeProp']
ask_size_features = ['Ask1SizeProp', 'Ask2SizeProp', 'Ask3SizeProp', 'Ask4SizeProp', 'Ask5SizeProp']
q_imb_features = ['Q_ImB1', 'Q_ImB2', 'Q_ImB3', 'Q_ImB4', 'Q_ImB5']
# q_imb_features = ['Q_ImB1']
ba_ratio_features = ['BidAskRatio1', 'BidAskRatio2', 'BidAskRatio3', 'BidAskRatio4', 'BidAskRatio5', 'BidAskRatioTotal']

# ----- Switches: set a flag to True to include the matching group -----
bid_size = True    # use bid_size_features
ask_size = True    # use ask_size_features
q_imb = False      # use q_imb_features
ba_ratio = False   # use ba_ratio_features

# Concatenate the enabled groups in a fixed order: bid, ask, imbalance, ratio.
size_features = [
    column
    for enabled, group in (
        (bid_size, bid_size_features),
        (ask_size, ask_size_features),
        (q_imb, q_imb_features),
        (ba_ratio, ba_ratio_features),
    )
    if enabled
    for column in group
]



## ===== Feature Engineering =====

In [None]:
# ----- Feature engineering settings -----
# Each *_list names the columns a transformation is applied to; an empty list
# disables that transformation.

# Moving average: columns to smooth and the rolling window sizes to use.
MA_list = []
# MA_list = MA_list + price_features
win_size = [5, 10, 20]

# Normalization: columns to scale. Standard=True selects StandardScaler,
# Standard=False (the default) selects MinMaxScaler.
Norm_list = []
# Norm_list = Norm_list + price_features + size_features
Standard = False

# Percentage change: columns to replace by their relative change.
PC_list = []
# PC_list = PC_list + price_features


### ----- Moving Average Features -----

In [None]:
print('Computing moving average of ', MA_list)

# Replace every selected column by its rolling mean. Iterating an empty
# MA_list does nothing, so no separate emptiness check is needed.
# NOTE(review): each window overwrites the column in place, so with several
# entries in `win_size` the later windows smooth the already-smoothed series
# rather than the raw one. The commented-out alternative keeps the raw column
# and adds one new column per window instead -- confirm which is intended.
for column in MA_list:
    for window in win_size:
        df_book[column] = df_book[column].rolling(window).mean()
#         df_book[column + '_MA' + str(window)] = df_book[column].rolling(window).mean()

In [None]:
# df_book

### ----- Percentage Change Features -----

In [None]:
print('Computing Percentage Change of ', PC_list)

# Replace every selected column, in place, by its row-to-row percentage
# change. Iterating an empty PC_list does nothing.
for column in PC_list:
    df_book[column] = df_book[column].pct_change()
#     df_book[column + '_Pct'] = df_book[column].pct_change()

### Check for missing values and drop rows containing NaN after feature engineering

In [None]:
# check null, na, nan 
# print(df_book.columns.values)
# print('df_book.isnull().sum()', df_book.isnull().sum().values)
# print('df_book.isnull().sum()', df_book.isnull().sum().values)
# print('df_book.isnan().sum()', df_book.isna().sum().sum())
# print("df_book.isna().sum()", df_book.isna().sum().sum())

In [None]:
# Treat +/-inf (e.g. a percentage change computed from a zero value) as
# missing, then drop every row that has at least one missing value.
# The infinities are replaced explicitly instead of enabling the global
# pandas option 'use_inf_as_na', which is deprecated in recent pandas releases.
df_book.replace([np.inf, -np.inf], np.nan, inplace=True)
df_book.dropna(how='any', axis=0, inplace=True)

## ===== Split Data into Training and Validation =====

In [None]:
# ----- Convert the microsecond timestamps to datetimes; the derived columns
# are used to separate the training and validation sets.
df_book['DateTime'] = pd.to_datetime(df_book['TimeStamp'], unit='us')
df_trade['DateTime'] = pd.to_datetime(df_trade['TimeStamp'], unit='us')

# Calendar date of every book row.
df_book['Date'] = df_book['DateTime'].dt.date
# Show the date at index 10 of the distinct dates (notebook cell output).
df_book['Date'].unique()[10]


In [None]:
# TODO: split df_val once more for a generalization test if time allows.

# Rows up to and including this timestamp are used for training,
# later rows for validation.
split_time = '2020-04-16 07:00'

# Take explicit copies: the normalization step assigns columns into these
# frames, and writing into a plain boolean-mask slice of df_book triggers
# pandas' SettingWithCopyWarning.
df_train = df_book[df_book['DateTime'] <= split_time].copy()
df_val = df_book[df_book['DateTime'] > split_time].copy()
df_val

## Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler



print('Normalizing ', Norm_list)
if Norm_list:
    # Standard selects the scaler type; MinMax to [0, 1] is the default.
    scaler = StandardScaler() if Standard else MinMaxScaler(feature_range=(0, 1))

    # Fit on the training set only so no validation statistics leak in.
    scaler.fit(df_train[Norm_list])

    # Scale the training set first, then the validation set, printing the
    # first rows before and after each transformation.
    for frame in (df_train, df_val):
        print(frame.head())
        frame[Norm_list] = scaler.transform(frame[Norm_list])
        print(frame.head())



## Build Model 

## Generate model input

In [None]:
# Columns fed to the model: enabled price features followed by enabled size features.
feature_list = price_features + size_features
# feature_list = price_features

print('Total number of features', len(feature_list))
print('feature_list: ', feature_list)

# Extract plain arrays for training and validation. The frames are used in
# their existing row order; the optional shuffles stay disabled.
# df_train = df_train.sample(frac=1).reset_index(drop=True)
# df_val = df_val.sample(frac=1).reset_index(drop=True)
x_train, y_train = df_train[feature_list].values, df_train[target].values
x_val, y_val = df_val[feature_list].values, df_val[target].values

print("Shape of x, y train/val {} {} {} {}".format(
    x_train.shape, y_train.shape, x_val.shape, y_val.shape))

In [None]:
# Display the training feature matrix (notebook cell output).
x_train

### ----- Baseline model -----

In [None]:
from IPython.display import SVG
# from tensorflow.keras.utils import Model
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Input 
from tensorflow.keras import optimizers
from tensorflow.keras.utils import model_to_dot, plot_model

# Training hyper-parameters. 'optimizer' selects one of the branches below
# ('rmsprop', 'sgd', 'adam' or 'nesterov'); 'lr' is the learning rate.
params = {'batch_size': 128, 'epochs': 300, 'lr': 0.0005, 'optimizer': 'adam'}

# Number of input features.
dim = x_train.shape[1]

# Baseline: a single linear unit applied directly to the feature vector.
input_x = Input(shape=(dim,))
pred_y = Dense(1, activation='linear')(input_x)
model = Model(inputs=input_x, outputs=pred_y)

# Build the optimizer object so the configured learning rate is actually used.
# `learning_rate=` replaces the deprecated `lr=` keyword.
# NOTE(review): the `decay` argument of SGD is kept from the original; newer
# Keras optimizers appear to reject it -- confirm against the installed version.
if params["optimizer"] == 'rmsprop':
    optimizer = optimizers.RMSprop(learning_rate=params["lr"])
elif params["optimizer"] == 'sgd':
    optimizer = optimizers.SGD(learning_rate=params["lr"], decay=1e-6, momentum=0.9, nesterov=True)
elif params["optimizer"] == 'adam':
    optimizer = optimizers.Adam(learning_rate=params["lr"], beta_1=0.9, beta_2=0.999, amsgrad=False)
elif params["optimizer"] == 'nesterov':
    # Fixed learning rate of 0.01, as in the original; params['lr'] is not used here.
    optimizer = optimizers.SGD(learning_rate=0.01, decay=1e-6, momentum=0.9, nesterov=True)
else:
    raise ValueError("Unsupported optimizer: {!r}".format(params["optimizer"]))

# Pass the optimizer *object*: passing the name string would make Keras build
# a default-configured optimizer and silently ignore params['lr'].
model.compile(loss='mean_squared_error', optimizer=optimizer,
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

model.summary()

# Write a diagram of the model to model.png.
plot_model(model, to_file='model.png', show_shapes=True, show_layer_names=True,
           expand_nested=True, dpi=96)

# SVG(model_to_dot(model).create(prog='dot', format='svg'))

# SVG(model_to_dot(model).create(prog='dot', format='svg'))

## Train Model

In [None]:
import os
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, CSVLogger, Callback

class LossHistory(Callback):
    """Record per-epoch training/validation loss and print it every 10th epoch.

    Attributes:
        epoch: Number of epochs seen so far (counted by this callback).
        train_losses: The 'loss' value of each finished epoch.
        val_losses: The 'val_loss' value of each finished epoch; an entry is
            None when the logs contain no validation loss.
    """

    def __init__(self, model):
        """Create the callback for `model`.

        Args:
            model: The Keras model being trained.
        """
        # Let the base class initialise its own state first.
        super().__init__()
        # Attach the model through the base-class hook rather than assigning
        # the attribute directly.
        self.set_model(model)
        self.epoch = 0
        self.train_losses = []
        self.val_losses = []

    @staticmethod
    def _format_loss(value):
        """Format a loss for printing; a missing value is shown as 'n/a'."""
        return "n/a" if value is None else "{:8.6f}".format(value)

    def on_epoch_end(self, batch, logs=None):
        """Store the epoch's losses and print a progress line every 10 epochs.

        Args:
            batch: Index passed by Keras at the end of an epoch. It is not
                used; the callback keeps its own counter in `self.epoch`.
            logs: Metric dictionary for the finished epoch; may be None.
        """
        # Avoid a shared mutable default and tolerate a missing logs dict.
        logs = logs or {}
        train_loss = logs.get('loss')
        val_loss = logs.get('val_loss')
        self.train_losses.append(train_loss)
        self.val_losses.append(val_loss)
        if self.epoch % 10 == 0:
            # A missing loss (e.g. training without validation data) is
            # printed as 'n/a' instead of raising while formatting None.
            print("epoch: {0} - train loss: {1} - val loss: {2}".format(
                self.epoch,
                self._format_loss(train_loss),
                self._format_loss(val_loss),
            ))
        self.model.reset_states()
        self.epoch += 1

# File the best model (lowest validation loss) is written to.
best_model_path = os.path.join(model_path, model_name)

# Stop when val_loss has not improved by at least 0.0001 for 100 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,
                   patience=100, min_delta=0.0001)

# Multiply the learning rate by 0.02 (never going below 0.0001) when val_loss
# has not improved by at least 0.001 for 20 epochs.
rlp = ReduceLROnPlateau(monitor='val_loss', factor=0.02, patience=20, verbose=1, mode='min',
                        min_delta=0.001, cooldown=1, min_lr=0.0001)

# Save the full model whenever val_loss reaches a new minimum. The deprecated
# `period=1` argument is omitted: checking once per epoch is already the
# default behaviour of ModelCheckpoint.
mcp = ModelCheckpoint(best_model_path, monitor='val_loss', verbose=1,
                      save_best_only=True, save_weights_only=False, mode='min')

# Collects the per-epoch losses for plotting after training.
history = LossHistory(model=model)

In [None]:
%%time
# Train the baseline model; (x_val, y_val) is evaluated after every epoch and
# drives the early-stopping, learning-rate and checkpoint callbacks.
model.fit(
    x_train,
    y_train,
    batch_size=params['batch_size'],
    epochs=params['epochs'],
    shuffle=True,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[history, es, rlp, mcp],
)

## Evaluate Model

In [None]:
# Reload the saved model from disk. The path is built with os.path.join, the
# same way the checkpoint path is built for ModelCheckpoint, so saving and
# loading agree even if model_path has no trailing separator.
model = tf.keras.models.load_model(os.path.join(model_path, model_name))

In [None]:
from matplotlib import pyplot as plt

# Plot the recorded training and validation loss per epoch.
plt.figure(figsize=(20, 8))

# Training curve first, validation curve second (matches the legend order).
for loss_curve in (history.train_losses, history.val_losses):
    plt.plot(loss_curve)

plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train loss', 'val_loss'], loc='upper right')
plt.show()

In [None]:
'''Calculate predictions and metrics'''

# Predict on the validation features and flatten the output to one value per row.
x_val = df_val[feature_list].values
pred = np.squeeze(model.predict(x_val))
print(y_val.shape, pred.shape)

# evaluate() returns the loss followed by the compiled metric(s); the first
# two entries are printed below as MSE and RMSE.
train_eval = model.evaluate(x_train, y_train, verbose=0)
val_eval = model.evaluate(x_val, y_val, verbose=0)

print(' ')
print('Evaluation metrics')
print('Training Data - MSE: {:.4f}, RMSE: {:.4f}'.format(*train_eval[:2]))
print('Validation Data - MSE: {:.4f}, RMSE: {:.4f}'.format(*val_eval[:2]))

In [None]:
# Compare real and predicted targets on the validation rows from index 46000 on.
tail = slice(46000, None)

plt.figure(figsize=(20, 8))
# Real series first, predicted series second (matches the legend order).
for series in (y_val, pred):
    plt.plot(series[tail])

plt.title('600519 nextMidpt prediction')
plt.xlabel('Tick')
plt.ylabel('Price')
plt.legend(['Real nextMidpt', 'Predicted nextMidpt'], loc='upper left')
plt.show()

In [None]:
# Display the model object (notebook cell output).
model

In [None]:
# Show the model's weight arrays as returned by get_weights().
model.get_weights()