In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import matplotlib.pyplot as plt

from tcnAutoencoder import TCNAutoencoder

In [2]:
# %load load_data.py
import pandas as pd
import glob, os, re


# Read the data only once.  It's big!
csv_files = glob.glob(os.path.join(".", "data", "hft_data", "*", "*_message_*.csv"))
date_str = re.compile(r'_(\d{4}-\d{2}-\d{2})_')
stock_str = re.compile(r'([A-Z]+)_\d{4}-\d{2}-\d{2}_')

df_list = []
day_list = []
sym_list = []

for csv_file in sorted(csv_files):
    date = date_str.search(csv_file)
    date = date.group(1)
    day_list.append(date)

    symbol = stock_str.search(csv_file)
    symbol = symbol.group(1)
    sym_list.append(symbol)

    # Find the order book file that matches this message file.
    book_file = csv_file.replace("message", "orderbook")

    # Read the message file and index by timestamp.
    df = pd.read_csv(csv_file, names=['Time','EventType','OrderID','Size','Price','Direction'])
    df['Time'] = pd.to_datetime(date) + pd.to_timedelta(df['Time'], unit='s')

    # Read the order book file and merge it with the messages.
    names = [f"{x}{i}" for i in range(1,11) for x in ["AP","AS","BP","BS"]]
    df = df.join(pd.read_csv(book_file, names=names), how='inner')
    df = df.set_index(['Time'])

    BBID_COL = df.columns.get_loc("BP1")
    BASK_COL = df.columns.get_loc("AP1")

    print (f"Read {df.shape[0]} unique order book shapshots from {csv_file}")

    df_list.append(df)

days = len(day_list)

Read 738034 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-01_34200000_57600000_message_10.csv
Read 1923409 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-04_34200000_57600000_message_10.csv
Read 2108353 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-05_34200000_57600000_message_10.csv
Read 2364167 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-06_34200000_57600000_message_10.csv
Read 1732063 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-07_34200000_57600000_message_10.csv
Read 3123866 unique order book shapshots from ./data/hft_data/AAPL/AAPL_2024-03-08_34200000_57600000_message_10.csv


In [3]:
def prep_data(df) -> pd.DataFrame:
    df = df[['Price', 'Size']]
    df.head()

    # sample every 100ms, and the size would be the sum of the size in that 100ms. 
    # Price would be the average price in that 100ms.
    df = df.resample('100ms').agg({'Price': 'mean', 'Size': 'sum'})

    # Check for NaN values

    # forwardfill all NaN values in the data
    df = df.ffill()

    # normalize the data with mean and std
    mean = df['Price'].mean()
    std = df['Price'].std()
    df['Price'] = (df['Price'] - mean) / std

    mean = df['Size'].mean()
    std = df['Size'].std()
    df['Size'] = (df['Size'] - mean) / std

    print("original shape: ", df.shape)

    df = df.values
    # Create a tensor for every 30 minutes of data
    tensors = []
    for i in range(0, len(df), 18000):
        if i + 18000 < len(df):
            # flip the first and second dimension, so that the shape is (batch_size, channel, sequence_length)
            tensors.append(torch.tensor(df[i:i+18000]).unsqueeze(0))
        else:
            tensors.append(torch.tensor(df[i:]).unsqueeze(0))

    return tensors

    # Create the final torch tensor, every 1 hour is a sequence

tensors_list = []
for df in df_list: 
    tensors_list.extend(prep_data(df_list[3]))


original shape:  (234000, 2)
original shape:  (234000, 2)
original shape:  (234000, 2)
original shape:  (234000, 2)
original shape:  (234000, 2)
original shape:  (234000, 2)


In [8]:
# filp the first and second dimension, so that the shape is (batch_size, channel, sequence_length)
tensors_list = [tensor.permute(0, 2, 1) for tensor in tensors_list]

[print(tensor.shape) for tensor in tensors_list]


torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([1, 18000, 2])
torch.Size([

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [7]:
# Start the training process with tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TCNAutoencoder(input_dim=(2, 18000)).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

import tqdm

# Randomly sample 0.2 of the data from the batch for testing, excluding them for traning.
tensors = tensors_list
tensors_train = tensors[:int(len(tensors) * 0.8)]
tensors_test = tensors[int(len(tensors) * 0.8):]

# Train the model
for epoch in range(100):
    # pack the data into one tensor
    data = torch.cat(tensors_train, dim=0).to(device).float()
    optimizer.zero_grad()
    output = model(data)
    loss = criterion(output, data)
    loss.backward()
    optimizer.step()
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (17990) must match the size of tensor b (18000) at non-singleton dimension 2