<a href="https://colab.research.google.com/github/thegallier/timeseries/blob/main/timeseries2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Define the Dataset
class MaskedOrderDataset(Dataset):
    """Synthetic, timestamp-sorted order events for masked-feature prediction.

    Every item is a window of up to ``max_seq_len`` consecutive events that
    starts at the requested index. Each maskable feature of the first event
    in the window is independently hidden with probability ``mask_prob``.

    Args:
        num_samples: Number of synthetic events to generate.
        mask_prob: Per-feature probability of masking the first event.
        max_seq_len: Maximum number of events returned in a window.
    """

    # Features that can be hidden; 'timestamp' is never masked.
    MASKABLE_KEYS = ('security_id', 'buy_sell', 'add_modify_delete',
                     'quantity', 'price', 'distance')

    def __init__(self, num_samples, mask_prob=0.15, max_seq_len=50):
        self.num_samples = num_samples
        self.N_sec_ids = 100  # Number of unique securities
        self.N_distance_bins = 10  # Number of distance bins
        self.mask_prob = mask_prob
        self.max_seq_len = max_seq_len

        # Generate synthetic data
        self.timestamps = torch.randint(0, 1000, (num_samples,))
        self.security_ids = torch.randint(0, self.N_sec_ids, (num_samples,))
        self.buy_sell = torch.randint(0, 2, (num_samples,))  # 0: buy, 1: sell
        self.add_modify_delete = torch.randint(0, 3, (num_samples,))  # 0: add, 1: modify, 2: delete
        self.quantity = torch.randint(1, 1000, (num_samples,)).float()
        self.price = torch.rand(num_samples) * 100  # Prices between 0 and 100
        self.distance = torch.randint(0, self.N_distance_bins, (num_samples,))

        # Sort every column by timestamp (ascending) using one shared permutation
        order = torch.argsort(self.timestamps)
        for column in ('timestamps', 'security_ids', 'buy_sell',
                       'add_modify_delete', 'quantity', 'price', 'distance'):
            setattr(self, column, getattr(self, column)[order])

        # Mask indices: one past the largest valid category of each feature
        self.security_id_mask = self.N_sec_ids
        self.buy_sell_mask = 2
        self.add_modify_delete_mask = 3
        self.distance_mask = self.N_distance_bins

    def __len__(self):
        """Return the number of events (one window starts at every event)."""
        return self.num_samples

    def __getitem__(self, idx):
        """Return ``(masked_sample, target, sequence)`` for the window at ``idx``.

        Returns:
            masked_sample: dict of 0-dim tensors for the first event, with the
                masked features replaced by their mask value.
            target: dict mapping each maskable feature to its true value when
                it was masked, or ``None`` when it was left visible.
            sequence: dict of 1-D tensors covering events ``idx`` up to
                ``idx + max_seq_len`` (clipped at the end of the data). Its
                first element carries the same mask values as
                ``masked_sample`` so the hidden targets cannot be read back
                from the context.

        Raises:
            IndexError: if the window starting at ``idx`` is empty
                (e.g. ``idx >= len(self)``).
        """
        end_idx = min(idx + self.max_seq_len, self.num_samples)
        sequence = {
            'timestamp': self.timestamps[idx:end_idx],
            'security_id': self.security_ids[idx:end_idx],
            'buy_sell': self.buy_sell[idx:end_idx],
            'add_modify_delete': self.add_modify_delete[idx:end_idx],
            'quantity': self.quantity[idx:end_idx],
            'price': self.price[idx:end_idx],
            'distance': self.distance[idx:end_idx],
        }

        # The first element of the window is the event whose features get masked.
        sample = {key: values[0] for key, values in sequence.items()}
        masked_sample = dict(sample)
        target = {}

        mask_values = {
            'security_id': self.security_id_mask,
            'buy_sell': self.buy_sell_mask,
            'add_modify_delete': self.add_modify_delete_mask,
            'distance': self.distance_mask,
            'quantity': 0.0,  # For continuous features, use 0.0 as masked value
            'price': 0.0,
        }

        for key in self.MASKABLE_KEYS:
            if torch.rand(1).item() < self.mask_prob:
                target[key] = sample[key]
                # The slice is a view of the dataset's storage: clone before
                # writing so the underlying data is never modified.
                hidden = sequence[key].clone()
                hidden[0] = mask_values[key]
                sequence[key] = hidden
                masked_sample[key] = hidden[0]
            else:
                target[key] = None  # Not masked

        return masked_sample, target, sequence

# Define the Model
class OrderModel(nn.Module):
    """Predict the masked features of an order event from its context window.

    The masked event and every event of the context window are embedded with
    the same per-feature layers. An LSTM summarises the window, and its final
    hidden state is concatenated with the masked event's embedding before a
    two-layer MLP feeds one output head per predictable feature.

    Args:
        N_sec_ids: Number of distinct security ids (mask index is ``N_sec_ids``).
        N_distance_bins: Number of distance bins (mask index is ``N_distance_bins``).
        embedding_dim: Width of each per-feature embedding.
        hidden_dim: Hidden size of the LSTM.
    """

    def __init__(self, N_sec_ids, N_distance_bins, embedding_dim=32, hidden_dim=64):
        super().__init__()

        # Mask indices
        self.security_id_mask = N_sec_ids
        self.buy_sell_mask = 2
        self.add_modify_delete_mask = 3
        self.distance_mask = N_distance_bins

        # Embedding layers for categorical features. The mask index is used as
        # padding_idx, so nn.Embedding keeps its vector at zero and never
        # updates it.
        self.security_id_embedding = nn.Embedding(N_sec_ids + 1, embedding_dim, padding_idx=self.security_id_mask)
        self.buy_sell_embedding = nn.Embedding(3, embedding_dim, padding_idx=self.buy_sell_mask)  # 0,1,2(mask)
        self.add_modify_delete_embedding = nn.Embedding(4, embedding_dim, padding_idx=self.add_modify_delete_mask)  # 0,1,2,3(mask)
        self.distance_embedding = nn.Embedding(N_distance_bins + 1, embedding_dim, padding_idx=self.distance_mask)

        # Linear layers for continuous features
        self.quantity_linear = nn.Linear(1, embedding_dim)
        self.price_linear = nn.Linear(1, embedding_dim)

        # For timestamp (we don't mask timestamp)
        self.timestamp_linear = nn.Linear(1, embedding_dim)

        # LSTM for sequences
        self.lstm_input_dim = embedding_dim * 7  # Number of features
        self.lstm = nn.LSTM(input_size=self.lstm_input_dim, hidden_size=hidden_dim, batch_first=True)

        # Final fully connected layers (one shared, stateless ReLU)
        self.fc1 = nn.Linear(self.lstm_input_dim + hidden_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.relu = nn.ReLU()

        # Output heads
        self.security_id_head = nn.Linear(128, N_sec_ids)
        self.buy_sell_head = nn.Linear(128, 2)
        self.add_modify_delete_head = nn.Linear(128, 3)
        self.distance_head = nn.Linear(128, N_distance_bins)
        self.quantity_head = nn.Linear(128, 1)
        self.price_head = nn.Linear(128, 1)

    def _embed_features(self, features):
        """Embed all seven features and concatenate them on the last dimension.

        Works for both ``(batch,)`` and ``(batch, seq_len)`` inputs: continuous
        features get a trailing unit dimension before their linear layer.
        """
        parts = [
            self.timestamp_linear(features['timestamp'].float().unsqueeze(-1)),
            self.security_id_embedding(features['security_id']),
            self.buy_sell_embedding(features['buy_sell']),
            self.add_modify_delete_embedding(features['add_modify_delete']),
            self.quantity_linear(features['quantity'].float().unsqueeze(-1)),
            self.price_linear(features['price'].float().unsqueeze(-1)),
            self.distance_embedding(features['distance']),
        ]
        return torch.cat(parts, dim=-1)

    def forward(self, x, seq_data, seq_lengths):
        """Compute one prediction per output head.

        Args:
            x: dict of ``(batch,)`` tensors describing the masked event.
            seq_data: dict of ``(batch, seq_len)`` padded context tensors.
            seq_lengths: true (unpadded) length of every context window, as a
                list of ints or a 1-D tensor.

        Returns:
            dict with logits for 'security_id', 'buy_sell',
            'add_modify_delete' and 'distance', and ``(batch, 1)`` regression
            outputs for 'quantity' and 'price'.
        """
        sample_emb = self._embed_features(x)       # (batch, embedding_dim * 7)
        seq_emb = self._embed_features(seq_data)   # (batch, seq_len, embedding_dim * 7)

        # pack_padded_sequence requires tensor lengths to live on the CPU.
        if isinstance(seq_lengths, torch.Tensor):
            seq_lengths = seq_lengths.cpu()
        packed_seq_emb = nn.utils.rnn.pack_padded_sequence(
            seq_emb, seq_lengths, batch_first=True, enforce_sorted=False)

        # Packing makes the LSTM stop at each window's true length, so h_n is
        # the state after the last real (non-padded) event.
        _, (h_n, _) = self.lstm(packed_seq_emb)
        seq_context = h_n[-1]  # (batch, hidden_dim)

        # Combine the sample embedding with the sequence context
        combined = torch.cat([sample_emb, seq_context], dim=1)

        hidden = self.relu(self.fc1(combined))
        hidden = self.relu(self.fc2(hidden))

        return {
            'security_id': self.security_id_head(hidden),
            'buy_sell': self.buy_sell_head(hidden),
            'add_modify_delete': self.add_modify_delete_head(hidden),
            'distance': self.distance_head(hidden),
            'quantity': self.quantity_head(hidden),
            'price': self.price_head(hidden),
        }

# Collate function for DataLoader
def collate_fn(batch):
    """Collate ``(masked_sample, target, sequence)`` items into batch tensors.

    Args:
        batch: list of ``(masked_sample, target, sequence)`` tuples, where
            ``masked_sample`` maps each feature to a scalar, ``target`` maps
            each maskable feature to its true value or ``None`` (not masked),
            and ``sequence`` maps each feature to a 1-D tensor.

    Returns:
        batch_data: dict of ``(batch,)`` tensors for the masked samples.
        batch_target: for every maskable feature ``key``, a target tensor
            under ``key`` (``-100`` / ``0.0`` filler where the feature was not
            masked) and a boolean tensor under ``key + '_mask'`` that is True
            where the feature was masked.
        seq_data: dict of ``(batch, max_len)`` sequences, right-padded with 0.
        seq_lengths: ``(batch,)`` long tensor with the unpadded lengths.
    """
    feature_keys = ('timestamp', 'security_id', 'buy_sell', 'add_modify_delete',
                    'quantity', 'price', 'distance')
    target_keys = feature_keys[1:]  # 'timestamp' is never a target
    continuous_keys = ('quantity', 'price')

    masked_samples = [item[0] for item in batch]
    targets = [item[1] for item in batch]
    sequences = [item[2] for item in batch]

    batch_data = {
        key: torch.tensor([sample[key] for sample in masked_samples])
        for key in feature_keys
    }

    # Sequences are variable-length: pad them to the longest one in the batch.
    # as_tensor avoids the copy-construct warning torch.tensor() emits when it
    # is handed an existing tensor.
    seq_data = {
        key: nn.utils.rnn.pad_sequence(
            [torch.as_tensor(seq[key]) for seq in sequences],
            batch_first=True, padding_value=0)
        for key in feature_keys
    }
    seq_lengths = torch.tensor([len(seq['timestamp']) for seq in sequences], dtype=torch.long)

    batch_target = {}
    for key in target_keys:
        if key in continuous_keys:
            filler, dtype, cast = 0.0, torch.float, float
        else:
            filler, dtype, cast = -100, torch.long, int

        raw = [target[key] for target in targets]
        # Derive the mask from "was a target recorded", not from the filler
        # value: a genuine continuous target of exactly 0.0 must stay in the loss.
        is_masked = [value is not None for value in raw]
        values = [filler if value is None else cast(value) for value in raw]

        batch_target[key] = torch.tensor(values, dtype=dtype)
        batch_target[key + '_mask'] = torch.tensor(is_masked, dtype=torch.bool)

    return batch_data, batch_target, seq_data, seq_lengths

# Training and Testing Loop
def train_model():
    """Train an ``OrderModel`` on synthetic masked order data for 5 epochs.

    Builds a 10,000-event ``MaskedOrderDataset``, batches it with
    ``collate_fn`` and optimises the sum of the per-feature losses with Adam:
    cross-entropy for the categorical features and L1 for the continuous
    ones. Only masked features contribute to the loss. Prints the mean batch
    loss after every epoch.
    """
    # Create Dataset and DataLoader
    dataset = MaskedOrderDataset(num_samples=10000)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)

    # Create model
    model = OrderModel(N_sec_ids=dataset.N_sec_ids, N_distance_bins=dataset.N_distance_bins)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Loss functions. Unmasked categorical targets are -100 and therefore
    # skipped by CrossEntropyLoss; continuous targets are selected by mask.
    criterion_ce = nn.CrossEntropyLoss(ignore_index=-100)
    criterion_l1 = nn.L1Loss()

    categorical_keys = ('security_id', 'buy_sell', 'add_modify_delete', 'distance')
    continuous_keys = ('quantity', 'price')
    # NOTE(review): quantity targets are on the raw 1-999 scale, so its L1
    # term likely dominates the summed loss — consider normalising.

    # Training loop
    for epoch in range(5):
        model.train()
        total_loss = 0.0
        for batch_data, batch_target, seq_data, seq_lengths in dataloader:
            optimizer.zero_grad()

            outputs = model(batch_data, seq_data, seq_lengths)

            losses = []

            for key in categorical_keys:
                # Cross-entropy over an all-ignored batch is NaN, so only add
                # the term when at least one sample has this feature masked.
                if batch_target[key + '_mask'].any():
                    losses.append(criterion_ce(outputs[key], batch_target[key]))

            for key in continuous_keys:
                mask = batch_target[key + '_mask']
                if mask.any():
                    # squeeze(-1) keeps the batch dimension even when the
                    # batch holds a single sample, so mask indexing stays valid.
                    prediction = outputs[key].squeeze(-1)
                    losses.append(criterion_l1(prediction[mask], batch_target[key][mask]))

            if not losses:
                # Nothing was masked in this batch: there is no loss tensor to
                # back-propagate, so skip the update.
                continue

            loss = sum(losses)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader)}")

# Script entry point: run training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train_model()

  seq_list = [torch.tensor(seq[key]) for seq in sequences]


Epoch 1, Loss: 365.27560881596463
Epoch 2, Loss: 327.6048114557934
Epoch 3, Loss: 325.116089061567
Epoch 4, Loss: 312.7274585893959
Epoch 5, Loss: 312.8242979596375
