# LOB-CNN v1

- 高频交易数据图像化建模与收益预测有效性分析

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm


import os
import time
from datetime import datetime
import glob

import utils as _U

In [5]:
# CSI A50 index constituents (source: Eastmoney, retrieved 2025-03-24).
# Each entry is a 6-digit ticker followed by its exchange suffix (sh / sz);
# 49 codes are listed, in the order they are processed below.
code_list = """
    688981sh 603993sh 603259sh 601899sh 601888sh
    601816sh 601766sh 601668sh 601600sh 601318sh
    601088sh 601012sh 600900sh 600893sh
    600887sh 600660sh 600585sh 600519sh 600436sh
    600426sh 600415sh 600406sh 600309sh 600276sh
    600176sh 600036sh 600031sh 600030sh 600028sh
    600019sh 600009sh 300760sz 300750sz 300408sz
    300124sz 300122sz 300015sz 002714sz 002594sz
    002475sz 002371sz 002230sz 002027sz 000938sz
    000792sz 000725sz 000333sz 000063sz 000002sz
""".split()

### 磁盘取数 + 初步处理

In [None]:
# Convert each stock's monthly Level-2 snapshot file
# (snap_stkhf<YYYYMM>_<code>.sas7bdat) into a CSV under rawdata_L2HF<YYMM>_L2/.
for year_month in ['202111', '202112']:
    out_dir = f'rawdata_L2HF{year_month[2:]}_L2'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    for code in code_list:
        file_path = glob.glob(f'/mnt/sdb1/HF{year_month[:4]}/L2HF{year_month[2:]}_L2/snap_stkhf{year_month}_{code}.sas7bdat')
        if file_path:
            train_df = pd.read_sas(file_path[0], format='sas7bdat', encoding='utf-8')
            train_df.to_csv(f'{out_dir}/{code}.csv', index=False)
            print(f'{code}.csv saved. Shape: {train_df.shape}')
        else:
            # A code without a snapshot file for this month is reported and skipped.
            print(f'{code} not found')
# 600941sh (China Mobile) was listed on 2022-01-05, later than November 2021.

688981sh.csv saved. Shape: (112535, 110)
603993sh.csv saved. Shape: (112915, 110)
603259sh.csv saved. Shape: (112969, 110)
601899sh.csv saved. Shape: (113636, 110)
601888sh.csv saved. Shape: (113026, 110)
601816sh.csv saved. Shape: (109638, 110)
601766sh.csv saved. Shape: (111964, 110)
601668sh.csv saved. Shape: (112059, 110)
601600sh.csv saved. Shape: (113377, 110)
601318sh.csv saved. Shape: (113140, 110)
601088sh.csv saved. Shape: (111852, 110)
601012sh.csv saved. Shape: (113857, 110)
600941sh not found
600900sh.csv saved. Shape: (102694, 110)
600893sh.csv saved. Shape: (112947, 110)
600887sh.csv saved. Shape: (113381, 110)
600660sh.csv saved. Shape: (111974, 110)
600585sh.csv saved. Shape: (112096, 110)
600519sh.csv saved. Shape: (112195, 110)
600436sh.csv saved. Shape: (111181, 110)
600426sh.csv saved. Shape: (112122, 110)
600415sh.csv saved. Shape: (110154, 110)
600406sh.csv saved. Shape: (112640, 110)
600309sh.csv saved. Shape: (112288, 110)
600276sh.csv saved. Shape: (113358, 11

In [None]:
# Ten levels of order-book quotes, ordered per level as
# BidPr, BidVol, AskPr, AskVol (40 columns in total).
level2_col_list = [
    f'{field}{level}'
    for level in range(1, 11)
    for field in ('BidPr', 'BidVol', 'AskPr', 'AskVol')
]

# Columns kept from the raw snapshot; 'datetime' is derived below.
used_cols = [
    'datetime', 'Exchflg', 'Code', 'Code_Mkt', 'Qdate', 'QTime', 'InstrumentStatus', 'Trdirec',
    'PrevClPr', 'OpPr', 'HiPr', 'LoPr', 'Tprice', 'Tvolume', 'Tsum', 'Tdeals', 'TVolume_accu', 'TSum_accu', 'Tdeals_accu',
    'TotBidVol', 'WghtAvgBidPr', 'TotAskVol', 'WghtAvgAskPr',
    'Absspread', 'Respread', 'Abseffspread', 'Reeffspread', 'Depth1', 'Depth2'
]
used_cols.extend(level2_col_list)

for year_month in ['202111', '202112']:
    out_dir = f'data_{year_month}'
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs(out_dir, exist_ok=True)
    for code in code_list:
        raw_path = f'rawdata_L2HF{year_month[2:]}_L2/{code}.csv'
        if not os.path.exists(raw_path):
            # The export step only writes a CSV when a snapshot file was found,
            # so a missing raw CSV is skipped here instead of aborting the loop.
            print(f'{code} not found')
            continue
        train_df = pd.read_csv(raw_path)

        # Combine the quote date and the time-of-day offset into one timestamp.
        # Local names avoid `time`, which would shadow the imported `time` module.
        # NOTE(review): QTime is interpreted as seconds (unit='s') — confirm
        # against the data dictionary.
        quote_date = pd.to_datetime(train_df['Qdate'])
        quote_offset = pd.to_timedelta(train_df['QTime'], unit='s')
        train_df['datetime'] = quote_date + quote_offset

        # Keep only the selected columns, then apply the project helper
        # _U.trading_time_slice before saving.
        train_df = train_df[used_cols]
        train_df = _U.trading_time_slice(train_df)
        train_df.to_csv(f'{out_dir}/{code}.csv', index=False)
        print(f'{code}.csv saved. Shape: {train_df.shape}')

688981sh.csv saved. Shape: (104569, 69)
603993sh.csv saved. Shape: (104395, 69)
603259sh.csv saved. Shape: (104306, 69)
601899sh.csv saved. Shape: (104291, 69)
601888sh.csv saved. Shape: (104426, 69)
601816sh.csv saved. Shape: (103177, 69)
601766sh.csv saved. Shape: (104679, 69)
601668sh.csv saved. Shape: (104617, 69)
601600sh.csv saved. Shape: (104298, 69)
601318sh.csv saved. Shape: (104302, 69)
601088sh.csv saved. Shape: (104527, 69)
601012sh.csv saved. Shape: (104276, 69)
600900sh.csv saved. Shape: (95506, 69)
600893sh.csv saved. Shape: (104335, 69)
600887sh.csv saved. Shape: (104299, 69)
600660sh.csv saved. Shape: (104491, 69)
600585sh.csv saved. Shape: (104576, 69)
600519sh.csv saved. Shape: (104630, 69)
600436sh.csv saved. Shape: (104360, 69)
600426sh.csv saved. Shape: (104492, 69)
600415sh.csv saved. Shape: (103701, 69)
600406sh.csv saved. Shape: (104440, 69)
600309sh.csv saved. Shape: (104428, 69)
600276sh.csv saved. Shape: (104296, 69)
600176sh.csv saved. Shape: (104492, 69)
6

### Analysis

In [None]:
# Price column names: current quotes for levels 1-5, followed by lagged quotes
# (lags 1-9) for levels 1-10. Within each group all bid names precede ask names.
price_cols = [
    f'{side}Pr{level}'
    for side in ('Bid', 'Ask')
    for level in range(1, 6)
]
price_cols.extend(
    f'{side}Pr{level}_lag{lag}'
    for side in ('Bid', 'Ask')
    for level in range(1, 11)
    for lag in range(1, 10)
)
# for code in code_list:
#     df = pd.read_csv(f'data_202111/{code}.csv')

# Bare expression so the notebook displays the list.
price_cols

### DataGen: Image + Label

In [6]:
def calc_new_features(df: pd.DataFrame, pred_cnt: int = 5) -> pd.DataFrame:
    """Add mid-price features derived from the best bid/ask quotes.

    Two columns are written onto ``df`` in place:

    * ``mid_price`` — the average of ``BidPr1`` and ``AskPr1``.
    * ``TWAP_mid`` — the trailing rolling mean of ``mid_price`` over
      ``pred_cnt`` rows; the first ``pred_cnt - 1`` rows are NaN.

    Args:
        df: Frame containing ``BidPr1`` and ``AskPr1`` columns.
        pred_cnt: Rolling-window length in rows. Defaults to 5, the value
            that was previously hard-coded.

    Returns:
        The same ``df`` object, with the two columns added.

    Raises:
        ValueError: If ``pred_cnt`` is smaller than 1.
    """
    if pred_cnt < 1:
        raise ValueError(f'pred_cnt must be >= 1, got {pred_cnt}')

    df['mid_price'] = (df['BidPr1'] + df['AskPr1']) / 2
    df['TWAP_mid'] = df['mid_price'].rolling(window=pred_cnt).mean()

    return df

In [15]:
# Image/label generation settings. Note that code_list is rebound here, which
# replaces any earlier value for the cells that follow.
code_list = ['600176sh']
folder_path = 'data_202111/'
record_cnt = 5
pred_cnt = 5
is_binary = True

image_list, label_list = [], []
for code in code_list:
    train_df = pd.read_csv(f'{folder_path}{code}.csv')
    train_df['datetime'] = pd.to_datetime(train_df['datetime'])
    # train_df = train_df[train_df['datetime'] <= '2021-12-10 23:59:59']  # test set: initially only the first 10 days
    train_df = calc_new_features(train_df)
    # Drop every row containing a NaN, then renumber so positions are contiguous.
    train_df.dropna(axis=0, inplace=True)
    train_df.reset_index(drop=True, inplace=True)

    # Each sample is a run of record_cnt + pred_cnt consecutive rows handed to
    # _U.single_image; element 0 of its result is stored as the image and
    # element 1 as the label.
    # NOTE(review): the last start index visited is len(train_df) - span - 1, so
    # the final full-length window is never generated — confirm this is intended.
    span = record_cnt + pred_cnt
    n_windows = len(train_df) - span
    for start in tqdm.tqdm(range(n_windows)):
        rows = train_df.iloc[start:start + span]
        single_entry = _U.single_image(rows, record_cnt, pred_cnt, is_binary)
        image_list.append(single_entry[0])
        label_list.append(single_entry[1])
    print(f'{code}.csv loaded.')
    


100%|██████████| 104478/104478 [02:15<00:00, 773.67it/s]

600176sh.csv loaded.





- Build the flattened dataframe directly in memory, without saving it to disk and loading it back

In [16]:
image_df = pd.DataFrame({
    'image': image_list,
    'label': label_list
})

# Flatten every image into one row, with its label in the first column.
# Iterating the two columns together avoids a per-row .loc lookup.
flatten_data = [
    np.concatenate(([label], image.flatten()))
    for image, label in zip(image_df['image'], image_df['label'])
]

# Take the pixel count from the data so the column names always match the row
# width; fall back to the 41 x 15 layout when there are no rows at all.
n_pixels = len(flatten_data[0]) - 1 if flatten_data else 41 * 15
columns = ['label'] + [f'pixel_{i}' for i in range(n_pixels)]
flatten_df = pd.DataFrame(flatten_data, columns=columns)
print(flatten_df.shape)

(104478, 616)


In [17]:
# Use the in-memory flattened dataset as the training frame; .copy() keeps
# train_df independent of flatten_df.
train_df = flatten_df.copy()

- Downsample `image_list` and `label_list`, then save the sample to CSV together with a JSON log

In [7]:
import json

image_df = pd.DataFrame({
    'image': image_list,
    'label': label_list
})

sample_size = 10000
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than exist, so never request more rows than image_df holds.
n_draw = min(sample_size, image_df.shape[0])
random_indices = np.random.choice(image_df.shape[0], n_draw, replace=False)
print(len(random_indices))

# Flatten the sampled images into rows, with the label in the first column.
sampled = image_df.iloc[random_indices]
flatten_data = [
    np.concatenate(([label], image.flatten()))
    for image, label in zip(sampled['image'], sampled['label'])
]

# Take the pixel count from the data so the column names always match the row
# width; fall back to the 41 x 15 layout when there are no rows at all.
n_pixels = len(flatten_data[0]) - 1 if flatten_data else 41 * 15
columns = ['label'] + [f'pixel_{i}' for i in range(n_pixels)]
flatten_df = pd.DataFrame(flatten_data, columns=columns)
print(flatten_df.shape)

# Save to CSV; the minute-resolution timestamp ties the CSV to its JSON log.
current_time = datetime.now().strftime('%Y%m%d_%H%M')
flatten_df.to_csv(f'image_dataset_{current_time}.csv', index=False)

# Write a JSON log describing how this dataset was produced.
log = {
    'original_data_folder_path': folder_path,
    'code_list': code_list,
    'record_cnt': record_cnt,
    'pred_cnt': pred_cnt,
    'is_binary': is_binary,
    'ori_shape': list(image_df.shape),
    'sample_size': sample_size,
    # Rows actually written; smaller than sample_size when the draw was clamped.
    'sampled_rows': len(random_indices),
    'datetime': current_time
}
log_path = f'image_dataset_{current_time}.json'
with open(log_path, 'w', encoding='utf-8') as f:
    json.dump(log, f, indent=4)
print(f'log saved to {log_path}')


10000
(10000, 616)
log saved to image_dataset_20250325_1447.json


- Load downsampled `image_df` and re-structure the np.array

In [None]:
train_df = pd.read_csv('image_dataset_20250324_2212.csv')
print(train_df.shape)

# Column 0 holds the label; the remaining columns are the flattened image.
train_labels = train_df.iloc[:, 0].values

# Restore the flattened 615-value rows to 41 x 15 images for the CNN.
features = train_df.iloc[:, 1:].values
train_X_cnn = features.reshape(-1, 41, 15)  # (num_samples, 41, 15)

(50000, 616)


### Models

In [2]:
# Flattened image datasets written earlier: one used for training, one for testing.
train_csv = 'image_dataset_20250324_2212.csv'
test_csv = 'image_dataset_20250325_1447.csv'

train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)
print(train_df.shape, test_df.shape)

(50000, 616) (10000, 616)


In [18]:
# Remove rows containing any NaN from both frames (in place), then renumber
# their indices from zero.
for frame in (train_df, test_df):
    frame.dropna(axis=0, inplace=True)
    frame.reset_index(drop=True, inplace=True)
print(train_df.shape, test_df.shape)

(104478, 616) (37973, 616)


In [None]:
# Prepare data: split each frame into a flat feature matrix, a 41 x 15 image
# stack for the CNN, and an integer label vector.
train_X_flat = train_df.drop(columns=['label']).values
test_X_flat = test_df.drop(columns=['label']).values

train_X_cnn = train_X_flat.reshape(-1, 41, 15)  # (num_samples, 41, 15)
test_X_cnn = test_X_flat.reshape(-1, 41, 15)  # (num_samples, 41, 15)

train_labels = train_df['label'].values.astype(int)
test_labels = test_df['label'].values.astype(int)

# Class balance, reported as proportions.
print(f"Train label distribution: \n{train_df['label'].value_counts(normalize=True)}")
print(f"Test label distribution: \n{test_df['label'].value_counts(normalize=True)}")

Train label distribution: 
label
0.0    0.843632
1.0    0.156368
Name: proportion, dtype: float64
Test label distribution: 
label
0.0    32656
1.0     5317
Name: count, dtype: int64


In [None]:
# Randomly subsample up to `sample_size` rows from the train and test arrays.
sample_size = 10000

# Draw indices from the arrays that are actually indexed (not from the
# dataframes) so the cell can be re-run safely, and clamp the draw because
# np.random.choice(..., replace=False) raises ValueError when asked for more
# items than exist.
random_indices_train = np.random.choice(
    train_X_flat.shape[0], min(sample_size, train_X_flat.shape[0]), replace=False
)
random_indices_test = np.random.choice(
    test_X_flat.shape[0], min(sample_size, test_X_flat.shape[0]), replace=False
)

train_X_flat = train_X_flat[random_indices_train]
test_X_flat = test_X_flat[random_indices_test]

train_labels = train_labels[random_indices_train]
test_labels = test_labels[random_indices_test]

# Rebuild the CNN inputs from the subsampled rows so they stay aligned with the
# subsampled labels.
train_X_cnn = train_X_flat.reshape(-1, 41, 15)
test_X_cnn = test_X_flat.reshape(-1, 41, 15)

In [26]:
# Run the project's traditional-ML pipeline on the flattened features
# (no class balancing, numeric inputs) and show the results ranked by F1.
dataset = dict(
    train_X=train_X_flat,
    train_y=train_labels,
    test_X=test_X_flat,
    test_y=test_labels,
)
df_results = _U.traditional_ml_pipeline(dataset, balance=False, data_type='num')
ranked_results = df_results.sort_values(by='F1', ascending=False)
print(ranked_results)

--- Logistic Regression (SGD) ---
Time elapsed:  1.7294812202453613 (s)
{'Model': 'Logistic Regression (SGD)', 'Accuracy': 0.8377, 'Precision': 0.32532051282051283, 'Recall': 0.14448398576512456, 'F1': 0.20009857072449483} 





--- Linear SVM ---
Time elapsed:  93.42627429962158 (s)
{'Model': 'Linear SVM', 'Accuracy': 0.8547, 'Precision': 0.38235294117647056, 'Recall': 0.05551601423487545, 'F1': 0.09695463020509633} 

--- XGBoost ---
Time elapsed:  269.83997535705566 (s)
{'Model': 'XGBoost', 'Accuracy': 0.8575, 'Precision': 0.4758454106280193, 'Recall': 0.1402135231316726, 'F1': 0.2166025288620121} 

--- MLP ---
Time elapsed:  2.337718963623047 (s)
{'Model': 'MLP', 'Accuracy': 0.8573, 'Precision': 0.36904761904761907, 'Recall': 0.02206405693950178, 'F1': 0.041638683680322364} 

Report saved to: reports/binary_Traditional_unbalanced_20250325_1644.csv
                       Model  Accuracy  Precision    Recall        F1
2                    XGBoost    0.8575   0.475845  0.140214  0.216603
0  Logistic Regression (SGD)    0.8377   0.325321  0.144484  0.200099
1                 Linear SVM    0.8547   0.382353  0.055516  0.096955
3                        MLP    0.8573   0.369048  0.022064  0.041639


### compare to original data

In [None]:
def _load_feature_frame(folder, code, cutoff=None):
    """Load one stock's CSV and add the derived features.

    Args:
        folder: Directory prefix, including the trailing slash.
        code: Stock code; the file read is ``{folder}{code}.csv``.
        cutoff: Optional timestamp string; when given, only rows whose
            ``datetime`` is less than or equal to it are kept.

    Returns:
        The frame after ``calc_new_features``, with rows containing NaN
        dropped and the index reset.
    """
    frame = pd.read_csv(f'{folder}{code}.csv')
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    if cutoff is not None:
        # Copy the filtered rows: calc_new_features assigns new columns, and
        # assigning onto a filtered selection triggers pandas'
        # chained-assignment warning.
        frame = frame[frame['datetime'] <= cutoff].copy()
    frame = calc_new_features(frame)
    frame.dropna(axis=0, inplace=True)
    frame.reset_index(drop=True, inplace=True)
    return frame


# Settings (these rebind the notebook-level names, including code_list).
code_list = [
    '600176sh',
]
folder_path = 'data_202111/'
record_cnt = 5
pred_cnt = 5
is_binary = True

# NOTE: train_df / test_df are overwritten on every iteration, so with more
# than one code only the last code's frame is kept.
for code in code_list:
    train_df = _load_feature_frame(folder_path, code)
    print(f'{code}.csv loaded.')

folder_path = 'data_202112/'
for code in code_list:
    # Test set: initially only the first 10 days of the month are used.
    test_df = _load_feature_frame(folder_path, code, cutoff='2021-12-10 23:59:59')
    print(f'{code}.csv loaded.')
