In [2]:
import glob, os, re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [8]:
### HFT data file reading.

# Read the data only once.  It's big!
csv_files = glob.glob(os.path.join(".", "data", "hft_data", "*", "*_message_*.csv"))
date_str = re.compile(r'_(\d{4}-\d{2}-\d{2})_')
stock_str = re.compile(r'([A-Z]+)_\d{4}-\d{2}-\d{2}_')

# Column schemas are the same for every file, so build them once instead of
# once per loop iteration.
message_names = ['Time','EventType','OrderID','Size','Price','Direction']
# Ten book levels, four columns per level (presumably ask price, ask size,
# bid price, bid size -- TODO confirm against the data provider's spec).
names = [f"{x}{i}" for i in range(1,11) for x in ["AP","AS","BP","BS"]]

# Positional column indices of the level-1 bid/ask prices in each merged frame.
# 'Time' becomes the index, so the frame columns are message_names[1:] + names.
# Deriving the positions from the schema gives the same integers as
# df.columns.get_loc(...) but keeps them defined even when no file is found.
_frame_columns = message_names[1:] + names
BBID_COL = _frame_columns.index("BP1")
BASK_COL = _frame_columns.index("AP1")


def parse_message_path(csv_file):
    """Split a message-file path into (symbol, date, order book path).

    Only the file name is inspected, never the directory part, so a directory
    that happens to contain a date-like token or the word "message" cannot
    corrupt the parsed date or the derived order book path.

    Parameters
    ----------
    csv_file : str
        Path to a file named like ``SYMBOL_YYYY-MM-DD_..._message_N.csv``.

    Returns
    -------
    tuple of (str, str, str)
        The upper-case symbol, the ``YYYY-MM-DD`` date string, and the path of
        the order book file that sits next to the message file.

    Raises
    ------
    ValueError
        If the file name does not contain a symbol and a date in the expected
        layout.
    """
    folder, base = os.path.split(csv_file)
    date_match = date_str.search(base)
    symbol_match = stock_str.search(base)
    if date_match is None or symbol_match is None:
        raise ValueError(
            f"Cannot parse symbol/date from message file name: {csv_file!r}")

    # Find the order book file that matches this message file.
    book_file = os.path.join(folder, base.replace("_message_", "_orderbook_"))
    return symbol_match.group(1), date_match.group(1), book_file


def load_day_frame(csv_file, date, book_file):
    """Read one message file plus its order book file into a single frame.

    Parameters
    ----------
    csv_file : str
        Path of the header-less message CSV (columns: ``message_names``).
    date : str
        Calendar date of the file; the 'Time' column (seconds) is added to it
        as an offset to build absolute timestamps.
    book_file : str
        Path of the header-less order book CSV (columns: ``names``).

    Returns
    -------
    pandas.DataFrame
        Message columns followed by order book columns, indexed by 'Time'.
        Rows are paired by position; with ``how='inner'`` a row present in
        only one of the two files is dropped.
    """
    # Read the message file and index by timestamp.
    df = pd.read_csv(csv_file, names=message_names)
    df['Time'] = pd.to_datetime(date) + pd.to_timedelta(df['Time'], unit='s')

    # Read the order book file and merge it with the messages.
    df = df.join(pd.read_csv(book_file, names=names), how='inner')
    return df.set_index(['Time'])


df_list = []
day_list = []
sym_list = []

for csv_file in sorted(csv_files):
    symbol, date, book_file = parse_message_path(csv_file)
    df = load_day_frame(csv_file, date, book_file)

    # Record the metadata only after the read succeeded, so the three lists
    # always stay the same length and index-aligned.
    day_list.append(date)
    sym_list.append(symbol)

    print(f"Read {df.shape[0]} unique order book shapshots from {csv_file}")

    df_list.append(df)

days = len(day_list)

Read 738034 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-01_34200000_57600000_message_10.csv
Read 1923409 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-04_34200000_57600000_message_10.csv
Read 2108353 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-05_34200000_57600000_message_10.csv
Read 2364167 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-06_34200000_57600000_message_10.csv
Read 1732063 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-07_34200000_57600000_message_10.csv
Read 3123866 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-08_34200000_57600000_message_10.csv


In [7]:
# Number of per-file DataFrames collected by the loading cell
# (one frame is appended to df_list for every message CSV read).
# NOTE(review): the saved output for this cell (1) does not match the six files
# reported by the loading cell, and the execution counts are out of order --
# looks like a stale result; re-run with Restart Kernel -> Run All to confirm.
len(df_list)

1