# DeepLOB 系列论文

- https://arxiv.org/abs/2403.09267 # Deep Limit Order Book Forecasting
- https://arxiv.org/abs/1811.10041 # BDLOB: Bayesian Deep Convolutional Neural Networks for Limit Order Books
- https://arxiv.org/abs/1803.06917 # Universal features of price formation in financial markets: perspectives from Deep Learning
- https://arxiv.org/abs/2101.07107 # Deep Reinforcement Learning for Active High Frequency Trading
- https://arxiv.org/abs/1808.03668 # DeepLOB: Deep Convolutional Neural Networks for Limit Order Books
- https://ar5iv.labs.arxiv.org/html/2003.00130 # Transformers for limit order books
- https://arxiv.org/html/2303.16532v2 # Graph Portfolio: High-Frequency Factor Predictor via Heterogeneous Continual GNNs
- https://arxiv.org/abs/2112.08534 # Trading with the Momentum Transformer: An Intelligent and Interpretable Architecture

本项目尝试复现最简单的 CNN + LSTM 模型

In [None]:
import gc
import os

import tensorflow as tf
import pandas as pd
import numpy as np
import keras
from sklearn.metrics import classification_report
from sklearn import preprocessing

# Environment diagnostics: TensorFlow version and the GPU devices it can see.
print("TensorFlow version:", tf.__version__)
# is_built_with_cuda() only reports whether this TensorFlow binary was compiled
# with CUDA support; it says nothing about a usable GPU being present. Actual
# device visibility is shown by the two device lists below.
print("Built with CUDA:", tf.test.is_built_with_cuda())

print(tf.config.list_physical_devices('GPU'))
print(tf.config.list_logical_devices('GPU'))

本项目只需要 limit order book 数据

是否需要处理行情隔天开盘时的跳空？我觉得是需要的，但是所有的paper都不处理，可能搞学术的不想干脏活累活。

In [None]:
# Load the raw tick file and keep only the five-level limit order book columns.
data = pd.read_csv('nr2412.csv')
# Header names may carry a dotted prefix; keep only the text after the last '.'.
data.columns = [name.rsplit('.', 1)[-1] for name in data.columns]
# Per level: bid price, bid volume, ask price, ask volume (levels 1..5).
col = [
    f'{side}_{field}{level}'
    for level in range(1, 6)
    for side in ('bid', 'ask')
    for field in ('price', 'volume')
]
# Drop rows with any missing book value and renumber the rows from 0.
data = data[col].dropna().reset_index(drop=True)
gc.collect()

In [None]:
def generate_labels(df: pd.DataFrame, window: int = 120, alpha: float = 0.0004) -> pd.Series:
    """Label every row as up (1), flat (0) or down (-1) from smoothed mid-price moves.

    The mid price is the average of ``bid_price1`` and ``ask_price1``. For each
    row the mean mid price of the following ``window`` rows is compared with the
    mean of the trailing ``window`` rows (current row included); the relative
    difference is then thresholded at ``alpha``.

    Args:
        df: Frame containing ``bid_price1`` and ``ask_price1`` columns.
        window: Number of rows in both the trailing and the forward average.
        alpha: Relative-change threshold; above ``alpha`` is 1, below
            ``-alpha`` is -1.

    Returns:
        A float Series with a fresh default integer index (not ``df.index``).
        Rows where either average is undefined (the first ``window - 1`` and
        the last ``window`` rows) fail both comparisons and are labelled 0.
    """
    mid = (df['bid_price1'] + df['ask_price1']) / 2
    averaging = dict(window=window, min_periods=window)
    trailing_mean = mid.rolling(**averaging).mean()
    # Shifting by -window first makes the rolling mean at row t cover t+1..t+window.
    forward_mean = mid.shift(-window).rolling(**averaging).mean()
    relative_move = (forward_mean - trailing_mean) / trailing_mean

    rising = (relative_move > alpha).to_numpy()
    falling = (relative_move < -alpha).to_numpy()
    # The "falling" test is applied last in the original, so it wins if both hold.
    codes = np.where(falling, -1.0, np.where(rising, 1.0, 0.0))
    return pd.Series(codes)

# Generate the labels for the whole data set and print two sanity statistics.
labels = generate_labels(data)
# Class balance of the three labels.
print(labels.value_counts())
# Number of transitions between consecutive rows. Row 0 is skipped: its shifted
# neighbour is NaN, so comparing it would always register a spurious change.
changes = (labels != labels.shift()).iloc[1:].sum()
print(f"label changes: {changes}")

In [4]:
# Order-flow (OF) features for the five best book levels, one bid column and one
# ask column per level (bOF_1, aOF_1, bOF_2, ...). With p the level price and q
# the level volume, the per-level order-flow definition is:
#
#   bOF_t =  q_t            if the bid price rose
#            q_t - q_{t-1}  if the bid price is unchanged
#           -q_{t-1}        if the bid price fell
#
#   aOF_t = -q_{t-1}        if the ask price rose
#            q_t - q_{t-1}  if the ask price is unchanged
#            q_t            if the ask price fell
#
# When a quote moves away (bid down / ask up) the liquidity that disappeared is
# the volume that was resting at the *previous* tick, hence q_{t-1} there.
# The first row has no predecessor, matches no case and is left at 0.0.
# np.select yields float columns directly, so no int column is upcast later.
of_data = pd.DataFrame(index=data.index)
for i in range(1, 6):
    bid_price_diff = data[f'bid_price{i}'].diff()
    ask_price_diff = data[f'ask_price{i}'].diff()
    bid_volume = data[f'bid_volume{i}']
    ask_volume = data[f'ask_volume{i}']
    prev_bid_volume = bid_volume.shift()
    prev_ask_volume = ask_volume.shift()
    bid_volume_diff = bid_volume.diff()
    ask_volume_diff = ask_volume.diff()

    of_data[f'bOF_{i}'] = np.select(
        [
            (bid_price_diff > 0).to_numpy(),
            (bid_price_diff < 0).to_numpy(),
            (bid_price_diff == 0).to_numpy(),
        ],
        [
            bid_volume.to_numpy(),
            -prev_bid_volume.to_numpy(),
            bid_volume_diff.to_numpy(),
        ],
        default=0.0,
    )

    of_data[f'aOF_{i}'] = np.select(
        [
            (ask_price_diff > 0).to_numpy(),
            (ask_price_diff < 0).to_numpy(),
            (ask_price_diff == 0).to_numpy(),
        ],
        [
            -prev_ask_volume.to_numpy(),
            ask_volume.to_numpy(),
            ask_volume_diff.to_numpy(),
        ],
        default=0.0,
    )



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta

# Plot a slice of the mid price with the background shaded by label value.
fig, ax = plt.subplots(figsize=(12, 6))
plot_rows = slice(1000, 8000)
# Copy the slice itself so that adding columns below writes to an independent
# frame (assigning into a slice of a copy triggers SettingWithCopyWarning).
df = data.iloc[plot_rows].copy()
df['mid_price'] = (df['bid_price1'] + df['ask_price1']) / 2
# Positional assignment: labels has one entry per row of data.
df['label'] = labels.iloc[plot_rows].to_numpy()
df['mid_price'].plot(color='black', linewidth=2, label='数值变化', ax=ax)

# Background colour for each label value.
colors = {1: 'lightgreen', 0: 'lightgray', -1: 'salmon'}

# Find the runs of identical consecutive labels. Each run is shaded from its
# first row up to the first row of the next run (or the last plotted row).
label_values = df['label'].to_numpy()
x_positions = df.index.to_numpy()
change_points = np.flatnonzero(label_values[1:] != label_values[:-1]) + 1
run_starts = np.concatenate(([0], change_points))
run_ends = np.append(run_starts[1:], len(df) - 1)
for first, last in zip(run_starts, run_ends):
    if last <= first:
        # Zero-width run (label changes on the very last row): nothing to shade.
        continue
    # axvspan covers the full axes height and leaves the y-limits untouched,
    # unlike fill_between with a snapshot of the current y-limits.
    ax.axvspan(
        x_positions[first],
        x_positions[last],
        color=colors[int(label_values[first])],
        alpha=0.3,
    )

# Grid is drawn above the shaded spans.
ax.grid(True, linestyle='--', alpha=0.7, zorder=2)

# Keep the price line on top of everything else.
ax.set_axisbelow(False)
for line in ax.get_lines():
    line.set_zorder(3)

plt.show()

In [6]:
# From here on the model input is the order-flow feature table.
data = of_data
length = data.shape[0]

# Chronological 80% / 10% / 10% split into train / validation / test.
train_end = int(length * 0.8)
valid_end = int(length * 0.9)

# Row 0 is left out of the training split (presumably because the diff-based
# features have no predecessor there -- confirm).
train_data = data.iloc[1:train_end, :].reset_index(drop=True)
train_label = labels[1:train_end]

valid_data = data.iloc[train_end:valid_end, :].reset_index(drop=True)
valid_label = labels[train_end:valid_end]

test_data = data.iloc[valid_end:length, :].reset_index(drop=True)
test_label = labels[valid_end:]

# Standardise with the mean and variance of the training split only; fitting on
# the validation or test rows would leak future information into training.
ss = preprocessing.StandardScaler().fit(train_data)
train_data, valid_data, test_data = (
    pd.DataFrame(ss.transform(part))
    for part in (train_data, valid_data, test_data)
)

In [None]:
def data_classification(X, Y, T):
    """Cut a 2-D feature table into overlapping windows of ``T`` consecutive rows.

    Args:
        X: Two-dimensional table of shape ``(N, D)`` (anything exposing
            ``.shape`` that ``np.array`` can convert).
        Y: Per-row labels, at least ``N`` long.
        T: Window length in rows.

    Returns:
        A pair ``(dataX, dataY)``. ``dataX`` is a float16 array of shape
        ``(N - T + 1, T, D, 1)`` where window ``k`` holds rows ``k .. k+T-1``.
        ``dataY`` holds the label of the last row of each window, shifted by
        +1 (so -1/0/1 becomes 0/1/2). The inputs are not modified.
    """
    N, _ = X.shape
    features = np.array(X)
    # sliding_window_view puts the window axis last: (N-T+1, D, T). Swap the
    # last two axes to get (N-T+1, T, D), then materialise as float16.
    windows = np.lib.stride_tricks.sliding_window_view(features, T, axis=0)
    dataX = windows.transpose(0, 2, 1).astype('float16', order='C')

    # np.array copies Y, so the in-place shift never touches the caller's data.
    dataY = np.array(Y)[T - 1:N]
    dataY += 1
    return dataX[..., np.newaxis], dataY

# Each sample is built from the previous T ticks. T is tunable and drives GPU
# memory use (the original note says an 8 GB card tops out at roughly T = 60;
# NOTE(review): the value used below is 100 -- confirm which is intended).
T = 100


def _windows_with_one_hot(features, targets):
    """Window one split and one-hot encode its labels into 3 classes."""
    windows, classes = data_classification(features, targets, T)
    return windows, keras.utils.to_categorical(classes, 3)


trainX_CNN, trainY_CNN = _windows_with_one_hot(train_data, train_label)
validX_CNN, validY_CNN = _windows_with_one_hot(valid_data, valid_label)
testX_CNN, testY_CNN = _windows_with_one_hot(test_data, test_label)

# The per-split tables are no longer needed once the windowed arrays exist.
del train_data, train_label, valid_data, valid_label, test_data, test_label, data
gc.collect()

In [None]:
from tensorflow import keras
from keras import layers, models, optimizers, Input


def create_deeplob(time_series, T):
    """Build and compile the CNN + inception + LSTM classifier.

    Args:
        time_series: ``'O'`` for an input of shape ``(T, 20, 1)`` or ``'OF'``
            for an input of shape ``(T, 10, 1)``.
        T: Number of time steps per sample.

    Returns:
        A compiled Keras model with a 3-way softmax output, categorical
        cross-entropy loss, Adam (learning_rate=0.01, epsilon=1) and an
        accuracy metric.

    Raises:
        ValueError: If ``time_series`` is neither ``'O'`` nor ``'OF'``.
    """
    if time_series not in ('O', 'OF'):
        raise ValueError('The time_series should be either O or OF')

    def conv_lrelu(tensor, kernel_size, filters=16, **conv_kwargs):
        # One Conv2D followed by a LeakyReLU with slope 0.01.
        tensor = layers.Conv2D(filters=filters, kernel_size=kernel_size, **conv_kwargs)(tensor)
        return layers.LeakyReLU(alpha=0.01)(tensor)

    if time_series == 'O':
        input_tensor = Input(shape=(T, 20, 1))
        # Width-2, stride-2 convolution merges adjacent column pairs: 20 -> 10.
        layer_x = conv_lrelu(input_tensor, (1, 2), strides=(1, 2))
    else:
        input_tensor = Input(shape=(T, 10, 1))
        layer_x = input_tensor

    # Two convolutions along the time axis, applied per column.
    for _ in range(2):
        layer_x = conv_lrelu(layer_x, (4, 1), padding='same')

    # Collapse the 10 remaining columns into a single one.
    layer_x = conv_lrelu(layer_x, (1, 10))

    # Inception module: three parallel towers over the same input.
    tower_1 = conv_lrelu(layer_x, (1, 1), filters=32, padding='same')
    tower_1 = conv_lrelu(tower_1, (3, 1), filters=32, padding='same')

    tower_2 = conv_lrelu(layer_x, (1, 1), filters=32, padding='same')
    tower_2 = conv_lrelu(tower_2, (5, 1), filters=32, padding='same')

    tower_3 = layers.MaxPooling2D(pool_size=(3, 1), padding='same', strides=(1, 1))(layer_x)
    tower_3 = conv_lrelu(tower_3, (1, 1), filters=32, padding='same')

    # 3 towers x 32 filters = 96 channels per time step.
    merged = layers.concatenate([tower_1, tower_2, tower_3], axis=-1)
    sequence = layers.Reshape(target_shape=(T, 96))(merged)

    # training=True keeps this dropout active during evaluation/prediction too.
    sequence = layers.Dropout(0.1)(sequence, training=True)

    # LSTM with 64 hidden units, then the 3-class softmax head.
    encoded = layers.LSTM(units=64)(sequence)
    output = layers.Dense(units=3, activation='softmax')(encoded)

    model = models.Model(input_tensor, output)
    model.compile(
        loss='categorical_crossentropy',
        optimizer=optimizers.Adam(learning_rate=0.01, epsilon=1),
        metrics=['accuracy'],
    )
    return model

# Build the order-flow variant of the model for windows of length T.
deeplob = create_deeplob('OF', T)

# Only the weights with the lowest validation loss seen so far are kept on disk.
checkpoint_filepath = './model_check/deeplob.weights.h5'
model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
)
# Stop once the validation loss has not improved for 10 epochs.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, mode='auto')

deeplob.fit(
    trainX_CNN,
    trainY_CNN,
    epochs=20,
    batch_size=128,
    verbose=2,
    validation_data=(validX_CNN, validY_CNN),
    callbacks=[model_checkpoint_callback, early_stopping],
)


In [None]:
# The training/validation arrays are still alive at this point; `del` them here
# first if memory is tight.
gc.collect()

# Evaluate on the test split using the checkpointed weights.
deeplob.load_weights(checkpoint_filepath)
predictions = deeplob.predict(testX_CNN)
print(predictions)

# Turn the class probabilities into one-hot predictions, the same encoding as
# testY_CNN, before building the report.
predicted_classes = predictions.argmax(axis=1)
results = keras.utils.to_categorical(predicted_classes, 3)
report = classification_report(testY_CNN, results, target_names=['Down', 'Stationary', 'Up'])
print(report)

In [None]:
# Display the raw class-probability array returned by deeplob.predict.
predictions