In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Input files: order-book snapshots and the matching message stream.
orderbook = "data/GOOG_2012-06-21_34200000_57600000_orderbook_10.csv"
message = "data/GOOG_2012-06-21_34200000_57600000_message_10.csv"
num_levels = 10

# Column names for the order-book file: four columns per level, in the
# order Pa<k>, Va<k>, Pb<k>, Vb<k> for k = 1..num_levels.
# (presumably ask price / ask volume / bid price / bid volume - confirm against the data spec)
header_list = [
    name
    for level in range(1, num_levels + 1)
    for name in ("Pa%d" % level, "Va%d" % level, "Pb%d" % level, "Vb%d" % level)
]
df_orderbook = pd.read_csv(orderbook, header=None, names=header_list)

# Only columns 0, 1, 3, 4, 5 of the message file are kept (column 2 is skipped).
df_message = pd.read_csv(
    message,
    header=None,
    usecols=[0, 1, 3, 4, 5],
    names=['time', 'type', 'size', 'price', 'direction'],
)

# `time` is interpreted as seconds and turned into a timestamp on today's date;
# only the intraday offset matters for the resampling done later.
# pd.to_timedelta replaces TimedeltaIndex(..., unit='s'), whose `unit` keyword
# is deprecated in recent pandas releases.
session_offsets = pd.TimedeltaIndex(pd.to_timedelta(df_message["time"], unit="s"))
df_message.index = pd.Timestamp(datetime.date.today()) + session_offsets

# The two frames are aligned positionally: row n of the order book gets the
# timestamp of row n of the message frame.
df_orderbook.index = df_message.index

def labelling(a):
    """Binary label for a value: 1 when ``a`` is strictly positive, else 0.

    Zero and negative inputs (and anything for which ``a > 0`` is false)
    map to 0.
    """
    return 1 if a > 0 else 0

# Spreads and mid-prices
def feature_v2(num_levels,df): # 20
    """Add per-level spread and mid-price columns to ``df`` in place.

    For every level k in 1..num_levels this writes
    ``spread<k> = Pa<k> - Pb<k>`` and ``midprice<k> = (Pa<k> + Pb<k>) / 2``.

    Args:
        num_levels: number of book levels to process.
        df: DataFrame holding the ``Pa<k>`` / ``Pb<k>`` columns; it is mutated.

    Returns:
        The same ``df`` object, with 2 * num_levels new columns.
    """
    for offset in range(num_levels):
        level = offset + 1
        upper, lower = df["Pa%d" % level], df["Pb%d" % level]
        df["spread%d" % level] = upper - lower
        df["midprice%d" % level] = (upper + lower) / 2
    return df

def feature_v3(num_levels,df): # 20 - 2
    """Add price gaps between adjacent book levels to ``df`` in place.

    For every k in 1..num_levels-1 this writes
    ``PA_diff<k> = Pa<k+1> - Pa<k>`` and ``PB_diff<k> = Pb<k> - Pb<k+1>``.

    Args:
        num_levels: number of book levels available in ``df``.
        df: DataFrame holding the ``Pa<k>`` / ``Pb<k>`` columns; it is mutated.

    Returns:
        The same ``df`` object, with 2 * (num_levels - 1) new columns.
    """
    for deeper in range(2, num_levels + 1):
        nearer = deeper - 1
        df["PA_diff%d" % nearer] = df["Pa%d" % deeper] - df["Pa%d" % nearer]
        df["PB_diff%d" % nearer] = df["Pb%d" % nearer] - df["Pb%d" % deeper]
    return df

def feature_v4(num_levels,df): # 4
    """Add the across-level mean of each price/volume family to ``df`` in place.

    Writes four columns, ``Pa_mean``, ``Pb_mean``, ``Va_mean`` and ``Vb_mean``,
    each the row-wise mean of the corresponding ``<prefix>1..<prefix>num_levels``
    columns. (The previous implementation used ``sum`` although the columns
    are named ``*_mean``.)

    Args:
        num_levels: number of book levels to average over.
        df: DataFrame holding the level columns; it is mutated.

    Returns:
        The same ``df`` object, with the four ``*_mean`` columns added.
    """
    for prefix in ("Pa", "Pb", "Va", "Vb"):
        level_cols = ["%s%d" % (prefix, level) for level in range(1, num_levels + 1)]
        # intersection() keeps only the level columns actually present in df,
        # matching the original lookup behaviour.
        present = df.columns.intersection(level_cols)
        df["%s_mean" % prefix] = df[present].mean(axis=1)
    return df

def feature_v5(num_levels,df): # 20
    """Add accumulated price and volume differences to ``df`` in place.

    For every level n in 1..num_levels this writes

    * ``pri_accum_diff<n>`` = sum over k = 1..n of ``Pa<k> - Pb<k>``
    * ``vol_accum_diff<n>`` = sum over k = 1..n of ``Va<k> - Vb<k>``

    Two defects of the previous version are fixed: the volume term was indexed
    by the outer level instead of the running level (giving a multiple of a
    single level's difference), and the sums stopped one level short, so the
    ``...1`` columns were constant zero.

    Args:
        num_levels: number of book levels to accumulate over.
        df: DataFrame holding ``Pa<k>``, ``Pb<k>``, ``Va<k>``, ``Vb<k>``; it is mutated.

    Returns:
        The same ``df`` object, with 2 * num_levels new columns.
    """
    price_total = 0
    volume_total = 0
    for level in range(1, num_levels + 1):
        # Running totals: each level adds its own difference to the sum of all
        # shallower levels. A fresh Series is produced on every step, so the
        # stored columns never alias each other.
        price_total = price_total + (df["Pa%d" % level] - df["Pb%d" % level])
        volume_total = volume_total + (df["Va%d" % level] - df["Vb%d" % level])
        df["pri_accum_diff%d" % level] = price_total
        df["vol_accum_diff%d" % level] = volume_total
    return df

def normalize_input(X_train, X_test):
    """Standardise both splits with statistics learned from the training split.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_test``, so the test split does not influence the
    scaling parameters.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled)``.
    """
    fitted = StandardScaler()
    fitted.fit(X_train)
    return fitted.transform(X_train), fitted.transform(X_test)

# Run the feature builders v2 -> v3 -> v4 -> v5 (innermost call runs first);
# each one adds its columns to the frame and hands the frame on.
df_orderbook = feature_v5(
    num_levels,
    feature_v4(
        num_levels,
        feature_v3(
            num_levels,
            feature_v2(num_levels, df_orderbook),
        ),
    ),
)

# df_orderbook.head()
# window = 1000
# df_orderbook['midprice_win%d'%window] = df_orderbook.rolling(window).agg({'midprice1':'mean'})
# df_orderbook['midprice_win%d'%window].plot()

# window = '1MIN'
# df_orderbook['midprice_average'+window] = df_orderbook.rolling('1MIN').agg({'midprice1':'mean'})
# df_orderbook['midprice_average'+window].plot()
# plt.show()



from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, confusion_matrix
def model_scoring(y_test, y_pred):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Args:
        y_test: ground-truth binary labels.
        y_pred: predicted binary labels; may be 1-D or a single-column 2-D
            array (callers in this notebook pass ``model.predict`` output).

    Returns:
        None. The metrics are written to stdout only.
    """
    # Flatten so sklearn always receives 1-D label vectors, regardless of
    # whether the caller passed an (n,) or an (n, 1) array.
    y_test = np.ravel(y_test)
    y_pred = np.ravel(y_pred)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print("======================================")
    print("======Model Performance===============")

    print("the accuracy is: ", accuracy)
    print("the precision is: ", precision)
    print("the recall is: ", recall)
    print("the f1 score is: ", f1)
    print("confusion matrix: \n", confusion_matrix(y_test, y_pred))
    
def split_sequence(X_sequence, y_sequence, n_steps):
    """Cut a sequence into overlapping windows of ``n_steps`` consecutive rows.

    Window ``i`` covers ``X_sequence[i : i + n_steps]`` and is paired with the
    target at the window's last position, ``y_sequence[i + n_steps - 1]``.
    Windows start at every index from 0 up to ``len(X_sequence) - n_steps``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays built from the windows and targets.
        Both are empty when the sequence is shorter than ``n_steps``.
    """
    starts = range(len(X_sequence) - n_steps + 1)
    windows = [X_sequence[start:start + n_steps] for start in starts]
    targets = [y_sequence[start + n_steps - 1] for start in starts]
    return np.array(windows), np.array(targets)

####################################################################
####################################################################

# Sample the book once per second, keeping the first snapshot in each second.
# ('1s' is the current spelling of the one-second alias; the upper-case '1S'
# form is deprecated in recent pandas releases.)
df_orderbook1s = df_orderbook.resample('1s').first()

# On the one-second grid, 600 rows ahead is the level-1 mid-price 10 minutes later.
horizon_rows = 600
df_orderbook1s["price_10min"] = df_orderbook1s['midprice1'].shift(-horizon_rows)
df_orderbook1s["price_change"] = df_orderbook1s["price_10min"] - df_orderbook1s['midprice1']

# Drop seconds with any missing value: empty seconds from the resample and the
# final `horizon_rows` rows, which have no future price. price_change is NaN
# only where one of its inputs is NaN, so computing it before dropna removes
# exactly the same rows as before.
df_orderbook1s = df_orderbook1s.dropna()

# Features: every column except the two look-ahead columns.
X = df_orderbook1s.drop(['price_10min', 'price_change'], axis=1).values
# Label: 1 when the mid-price is strictly higher after the horizon, else 0.
y = (df_orderbook1s["price_change"] > 0).astype(int).values
####################################################################
####################################################################

In [3]:
X.shape

(10743, 102)

In [4]:
X.shape

(10743, 102)

In [5]:
y.shape

(10743,)

# RNN (LSTM)

In [None]:
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/
# All Keras names come from the public `tensorflow.keras` namespace; the
# private `tensorflow.python.keras` path must not be mixed with it.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, LSTM
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv1D
######################################
n_steps = 60
# prepare train and test data
# shuffle=False keeps the rows in time order. split_sequence stacks n_steps
# consecutive rows into one sample, so a shuffled split would build every
# window out of unrelated, randomly ordered seconds.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
X_train, X_test = normalize_input(X_train, X_test)
# split into samples
X_train_steps, y_train_steps = split_sequence(X_train, y_train, n_steps)
X_test_steps, y_test_steps = split_sequence(X_test, y_test, n_steps)
######################################
model = Sequential()
# The feature dimension is read from the data instead of being hard-coded.
model.add(LSTM(50, activation='relu', input_shape=(n_steps, X_train.shape[1])))
model.add(Dense(32))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Compiled once; an earlier mse compile was immediately overwritten by this one.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()
######################################
model.fit(X_train_steps, y_train_steps,
          batch_size=32, epochs=30, validation_split=0.3)
######################################
# Binarise with a single comparison so a probability of exactly 0.5 cannot
# survive as a non-binary value; ravel() turns the (n, 1) output into (n,).
y_pred = (model.predict(X_test_steps) > 0.5).astype(int).ravel()
model_scoring(y_test_steps, y_pred)
######################################

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 50)                30600     
_________________________________________________________________
dense_11 (Dense)             (None, 32)                1632      
_________________________________________________________________
activation_25 (Activation)   (None, 32)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 33        
_________________________________________________________________
activation_26 (Activation)   (None, 1)                 0         
Total params: 32,265
Trainable params: 32,265
Non-trainable params: 0
_________________________________________________________________
None
Train on 5222 samples, validate on 2239 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6

In [None]:
# logistic regression
from sklearn.linear_model import LogisticRegression
# NOTE(review): rows are consecutive seconds and the label looks 600 rows
# ahead, so a shuffled split puts near-identical neighbouring rows on both
# sides of the train/test boundary - consider a chronological split (shuffle=False).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = normalize_input(X_train, X_test)
# `multi_class='ovr'` is omitted: the target is binary, where it makes no
# difference, and the parameter is deprecated in recent scikit-learn releases.
clf = LogisticRegression(random_state=0,
                         penalty='l2',
                         solver='lbfgs',
                         max_iter=100000)
clf.fit(X_train, y_train)
print("training score.", clf.score(X_train, y_train))
print("testing score.", clf.score(X_test, y_test))
y_pred = clf.predict(X_test)
model_scoring(y_test, y_pred)

In [None]:
############################
 ######### XGBoost ########
############################
import xgboost as xgb

# 80/20 random split with a fixed seed, then scaling fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
X_train, X_test = normalize_input(X_train, X_test)

# Default-parameter gradient-boosted classifier.
xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train)

# Score the held-out predictions.
y_pred = xgb_clf.predict(X_test)
model_scoring(y_test, y_pred)

In [14]:
############################
 ########## CNN ##########
############################
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv1D,  MaxPooling1D
import pickle
# NOTE(review): random split on time-ordered rows with look-ahead labels -
# consider a chronological split (shuffle=False) to avoid neighbouring-row leakage.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test = normalize_input(X_train, X_test)

# Conv1D expects (samples, steps, channels): each feature vector is treated as
# a length-n_features sequence with a single channel.
n_features = X_train.shape[1]
X_train_cnn = X_train.reshape((X_train.shape[0], n_features, 1))
X_test_cnn = X_test.reshape((X_test.shape[0], n_features, 1))

model = Sequential()
# Input width is taken from the data instead of a hard-coded feature count.
model.add(Conv1D(256, 4, input_shape=(n_features, 1)))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(256, 2))
# model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, 1))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(128))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

model.fit(X_train_cnn, np.asarray(y_train),
          batch_size=32, epochs=50, validation_split=0.3)

# Binarise with a single comparison so a probability of exactly 0.5 cannot
# remain as a non-binary value, then flatten (n, 1) -> (n,) for scoring.
y_pred = (model.predict(X_test_cnn) > 0.5).astype(int).reshape((-1,))
model_scoring(y_test, y_pred)

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_12 (Conv1D)           (None, 99, 256)           1280      
_________________________________________________________________
activation_17 (Activation)   (None, 99, 256)           0         
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 49, 256)           0         
_________________________________________________________________
conv1d_13 (Conv1D)           (None, 48, 256)           131328    
_________________________________________________________________
activation_18 (Activation)   (None, 48, 256)           0         
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 24, 256)           0         
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 24, 128)          

In [None]:
# https://keras.io/examples/imdb_cnn_lstm/