# LOB-CNN v1

- 高频交易数据图像化建模与收益预测有效性分析

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tqdm


import os
import time
from datetime import datetime
import glob

import utils as _U

In [2]:
# 中证A50成分股
# 信息来源：东方财富 20250324
code_list = [
    '688981sh', '603993sh', '603259sh', '601899sh', '601888sh', 
    '601816sh', '601766sh', '601668sh', '601600sh', '601318sh', 
    '601088sh', '601012sh', '600900sh', '600893sh', 
    '600887sh', '600660sh', '600585sh', '600519sh', '600436sh', 
    '600426sh', '600415sh', '600406sh', '600309sh', '600276sh', 
    '600176sh', '600036sh', '600031sh', '600030sh', '600028sh', 
    '600019sh', '600009sh', '300760sz', '300750sz', '300408sz', 
    '300124sz', '300122sz', '300015sz', '002714sz', '002594sz', 
    '002475sz', '002371sz', '002230sz', '002027sz', '000938sz', 
    '000792sz', '000725sz', '000333sz', '000063sz', '000002sz'
    ]  

### 磁盘取数 + 初步处理

In [10]:
# 获取 ~/ 目录下所有 snap_stkhf202101_*.sas7bdat 文件
for year_month in ['202111', '202112']:
    for code in code_list:
        file_path = glob.glob(f'/mnt/sdb1/HF{year_month[:4]}/L2HF{year_month[2:]}_L2/snap_stkhf{year_month}_{code}.sas7bdat')
        if file_path:
            df = pd.read_sas(file_path[0], format='sas7bdat', encoding='utf-8')
            df.to_csv(f'rawdata_L2HF{year_month[2:]}_L2/{code}.csv', index=False)
            print(f'{code}.csv saved. Shape: {df.shape}')
        else:
            print(f'{code} not found')
# 600941sh 中国移动上市日期 2022-01-05，晚于2021年11月

688981sh.csv saved. Shape: (112535, 110)
603993sh.csv saved. Shape: (112915, 110)
603259sh.csv saved. Shape: (112969, 110)
601899sh.csv saved. Shape: (113636, 110)
601888sh.csv saved. Shape: (113026, 110)
601816sh.csv saved. Shape: (109638, 110)
601766sh.csv saved. Shape: (111964, 110)
601668sh.csv saved. Shape: (112059, 110)
601600sh.csv saved. Shape: (113377, 110)
601318sh.csv saved. Shape: (113140, 110)
601088sh.csv saved. Shape: (111852, 110)
601012sh.csv saved. Shape: (113857, 110)
600941sh not found
600900sh.csv saved. Shape: (102694, 110)
600893sh.csv saved. Shape: (112947, 110)
600887sh.csv saved. Shape: (113381, 110)
600660sh.csv saved. Shape: (111974, 110)
600585sh.csv saved. Shape: (112096, 110)
600519sh.csv saved. Shape: (112195, 110)
600436sh.csv saved. Shape: (111181, 110)
600426sh.csv saved. Shape: (112122, 110)
600415sh.csv saved. Shape: (110154, 110)
600406sh.csv saved. Shape: (112640, 110)
600309sh.csv saved. Shape: (112288, 110)
600276sh.csv saved. Shape: (113358, 11

In [13]:
level2_col_list = []
for l in range(1, 11):
    BidPr_col = f'BidPr{l}'
    BidVol_col = f'BidVol{l}'
    AskPr_col = f'AskPr{l}'
    AskVol_col = f'AskVol{l}'
    level2_col_list.extend([BidPr_col, BidVol_col, AskPr_col, AskVol_col])
    
used_cols = [
    'datetime', 'Exchflg', 'Code', 'Code_Mkt', 'Qdate', 'QTime', 'InstrumentStatus', 'Trdirec',
    'PrevClPr', 'OpPr', 'HiPr', 'LoPr', 'Tprice', 'Tvolume', 'Tsum', 'Tdeals', 'TVolume_accu', 'TSum_accu', 'Tdeals_accu',
    'TotBidVol', 'WghtAvgBidPr', 'TotAskVol', 'WghtAvgAskPr',
    'Absspread', 'Respread', 'Abseffspread', 'Reeffspread', 'Depth1', 'Depth2'
]
used_cols.extend(level2_col_list)

for year_month in ['202111', '202112']:
    for code in code_list:
        df = pd.read_csv(f'rawdata_L2HF{year_month[2:]}_L2/{code}.csv')

        # convert all df in dfs to datetime
        date = pd.to_datetime(df['Qdate'])
        time = pd.to_timedelta(df['QTime'], unit='s')
        df['datetime'] = date + time
        
        # select used cols
        df = df[used_cols]
        df = _U.trading_time_slice(df)
        df.to_csv(f'data_{year_month}/{code}.csv', index=False)
        print(f'{code}.csv saved. Shape: {df.shape}')

688981sh.csv saved. Shape: (104569, 69)
603993sh.csv saved. Shape: (104395, 69)
603259sh.csv saved. Shape: (104306, 69)
601899sh.csv saved. Shape: (104291, 69)
601888sh.csv saved. Shape: (104426, 69)
601816sh.csv saved. Shape: (103177, 69)
601766sh.csv saved. Shape: (104679, 69)
601668sh.csv saved. Shape: (104617, 69)
601600sh.csv saved. Shape: (104298, 69)
601318sh.csv saved. Shape: (104302, 69)
601088sh.csv saved. Shape: (104527, 69)
601012sh.csv saved. Shape: (104276, 69)
600900sh.csv saved. Shape: (95506, 69)
600893sh.csv saved. Shape: (104335, 69)
600887sh.csv saved. Shape: (104299, 69)
600660sh.csv saved. Shape: (104491, 69)
600585sh.csv saved. Shape: (104576, 69)
600519sh.csv saved. Shape: (104630, 69)
600436sh.csv saved. Shape: (104360, 69)
600426sh.csv saved. Shape: (104492, 69)
600415sh.csv saved. Shape: (103701, 69)
600406sh.csv saved. Shape: (104440, 69)
600309sh.csv saved. Shape: (104428, 69)
600276sh.csv saved. Shape: (104296, 69)
600176sh.csv saved. Shape: (104492, 69)
6

### DataGen: Image + Label

In [3]:
def calc_new_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    计算新的特征
    """
    df['mid_price'] = (df['BidPr1'] + df['AskPr1']) / 2
    
    pred_cnt = 5
    df['TWAP_mid'] = df['mid_price'].rolling(window=pred_cnt).mean()
    
    return df

In [9]:
# training data
code_list = [
    '600176sh', 
    '600019sh', 
    '300124sz', 
    '002475sz', 
    '000792sz'
]
folder_path = 'data_202111/'
record_cnt = 5
pred_cnt = 5

image_list = []
label_list = []
for code in code_list:
    df = pd.read_csv(f'{folder_path}{code}.csv')
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = calc_new_features(df)
    df.dropna(axis=0, inplace=True)
    df.reset_index(drop=True, inplace=True)
    for i in tqdm.tqdm(range(len(df) - (record_cnt + pred_cnt))):
        single_entry = _U.single_image(df.iloc[i:i + record_cnt + pred_cnt], record_cnt, pred_cnt)
        image_list.append(single_entry[0])
        label_list.append(single_entry[1])
    print(f'{code}.csv loaded.')
    


100%|██████████| 104478/104478 [02:15<00:00, 769.84it/s]


600176sh.csv loaded.


100%|██████████| 104430/104430 [02:15<00:00, 771.74it/s]


600019sh.csv loaded.


100%|██████████| 104218/104218 [02:16<00:00, 764.59it/s]


300124sz.csv loaded.


100%|██████████| 104220/104220 [02:16<00:00, 766.05it/s]


002475sz.csv loaded.


100%|██████████| 104152/104152 [02:15<00:00, 771.26it/s]

000792sz.csv loaded.





In [None]:
image_df = pd.DataFrame({
    'image': image_list,
    'label': label_list
})

sample_size = 50000
random_indices = np.random.choice(image_df.shape[0], sample_size, replace=False)
print(len(random_indices))
# Flatten images and create a new dataframe
flatten_data = []
for i in random_indices:
    flattened_image = image_df.loc[i, 'image'].flatten()
    label = image_df.loc[i, 'label']
    flatten_data.append(np.concatenate(([label], flattened_image)))  # Label in first column

columns = ['label'] + [f'pixel_{i}' for i in range(41 * 15)]
flatten_df = pd.DataFrame(flatten_data, columns=columns)
print(flatten_df.shape)

# Save to CSV
current_time = datetime.now().strftime('%Y%m%d_%H%M')
flatten_df.to_csv(f'image_dataset_{current_time}.csv', index=False)

# write a log in JSON for image_df
import json
log = {
    'code_list': code_list,
    'record_cnt': record_cnt,
    'pred_cnt': pred_cnt,
    'ori_shape': image_df.shape,
    'sample_size': sample_size,
    'datetime': current_time
}
log_path = f'image_dataset_{current_time}.json'
with open(log_path, 'w') as f:
    json.dump(log, f, indent=4)
print(f'log saved to {log_path}')


50000
(50000, 616)


### Data Analysis

In [5]:
df = pd.read_csv('image_dataset_20250324_2212.csv')
print(df.shape)

(50000, 616)


In [None]:
# 统计标签分布
print(f"Label distribution: \n{df['label'].value_counts(normalize=True)}")

Label distribution: 
label
0.0    0.77746
1.0    0.22254
Name: proportion, dtype: float64
