In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [3]:
# Seed torch's global RNG so repeated runs of the notebook draw the same numbers.
torch.manual_seed(1337)

# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Hyperparameters
block_size, n_embd, batch_size = 8, 32, 32
learning_rate = 1e-3
max_iters = 100  # Reduced for demonstration

In [4]:
class Attention(nn.Module):
    """Single scaled dot-product self-attention head.

    Args:
        feature_size: Size of the last dimension of the input ``x``.
        head_size: Output size of the key/query/value projections, and the
            size of the last dimension of the tensor returned by ``forward``.
        causal: If True, position ``i`` may only attend to positions ``<= i``.
    """

    def __init__(self, feature_size, head_size, causal=False):
        super(Attention, self).__init__()
        self.key = nn.Linear(feature_size, head_size, bias=False)  # Transform for keys
        self.query = nn.Linear(feature_size, head_size, bias=False)  # Transform for queries
        self.value = nn.Linear(feature_size, head_size, bias=False)  # Transform for values
        self.scale = head_size ** -0.5  # Scaling factor to stabilize training
        self.causal = causal  # Toggle to enable causal attention
        # Cache for the lower-triangular mask. It depends only on the sequence
        # length, so it is registered as non-persistent: keeping it out of
        # state_dict() means a checkpoint saved after a forward pass still
        # loads (strictly) into a freshly constructed module.
        self.register_buffer("causal_mask", None, persistent=False)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq, feature_size)``.
            mask: Optional tensor of shape ``(batch, seq)``. Entries equal to 0
                mark key positions that no query may attend to (e.g. padding).

        Returns:
            Tensor of shape ``(batch, seq, head_size)``. A query position whose
            keys are all masked out yields a zero vector rather than NaN.
        """
        _, seq_length, _ = x.shape
        k = self.key(x)  # Compute keys
        q = self.query(x)  # Compute queries
        v = self.value(x)  # Compute values

        # Compute attention scores using scaled dot-product
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale

        # Apply causal mask to prevent attention to future positions in sequence
        if self.causal:
            causal_mask = self.causal_mask
            stale = (
                causal_mask is None
                or causal_mask.size(0) != seq_length
                or causal_mask.device != x.device
            )
            if stale:
                # Lower triangular matrix: row i is True for columns 0..i
                causal_mask = torch.tril(
                    torch.ones((seq_length, seq_length), device=x.device)
                ).bool()
                # Plain assignment updates the already-registered buffer and
                # keeps its non-persistent status.
                self.causal_mask = causal_mask
            scores = scores.masked_fill(~causal_mask, float('-inf'))

        # Apply additional mask provided by the user (e.g., for ignoring padding)
        if mask is not None:
            mask = mask.unsqueeze(1).expand_as(scores)  # (batch, seq) -> (batch, seq, seq)
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Compute attention weights using softmax
        attn_weights = F.softmax(scores, dim=-1)

        if mask is not None:
            # softmax over a row that is entirely -inf is NaN; such a query has
            # nothing to attend to, so give it zero weight everywhere instead
            # of letting NaN propagate into the output and the loss.
            dead_rows = (scores == float('-inf')).all(dim=-1, keepdim=True)
            attn_weights = attn_weights.masked_fill(dead_rows, 0.0)

        # Compute weighted sum of values based on attention weights
        output = torch.matmul(attn_weights, v)
        return output


In [5]:
class MultiHeadAttention(nn.Module):
    """Several independent ``Attention`` heads followed by a linear projection.

    Args:
        num_heads: Number of attention heads to run on the same input.
        head_size: Output size of each individual head.
        feature_size: Size of the input's last dimension; also the size of the
            projected output's last dimension.
        causal: Forwarded to every head. When True each position only attends
            to itself and earlier positions. Defaults to False.
    """

    def __init__(self, num_heads, head_size, feature_size, causal=False):
        super(MultiHeadAttention, self).__init__()
        # Initialize several independent attention heads
        self.heads = nn.ModuleList([
            Attention(feature_size, head_size, causal=causal) for _ in range(num_heads)
        ])
        # Output linear layer: maps the concatenated head outputs
        # (num_heads * head_size) back to the original feature size
        self.output_projection = nn.Linear(num_heads * head_size, feature_size)

    def forward(self, x, mask=None):
        """Run every head on ``x`` and project the concatenated result.

        Args:
            x: Input tensor whose last dimension is ``feature_size``.
            mask: Optional mask passed unchanged to every head.

        Returns:
            Tensor with the same leading dimensions as the head outputs and a
            last dimension of ``feature_size``.
        """
        # Collect outputs from each attention head, passing the optional mask if provided
        head_outputs = [head(x, mask) for head in self.heads]

        # Concatenate the outputs of all attention heads along the last dimension
        concatenated = torch.cat(head_outputs, dim=-1)

        # Project the concatenated outputs back to the original feature size
        return self.output_projection(concatenated)


In [6]:
class FeedForward(nn.Module):
    """Two-layer position-wise MLP: expand, ReLU, dropout, project back.

    Args:
        n_embd: Size of the input's (and output's) last dimension.
        expansion_factor: Multiplier giving the hidden width,
            ``n_embd * expansion_factor``.
        dropout_rate: Drop probability applied after the activation.
    """

    def __init__(self, n_embd, expansion_factor=4, dropout_rate=0.1):
        super().__init__()
        hidden_width = n_embd * expansion_factor
        # Layers are created in execution order and wrapped in one Sequential.
        stages = [
            nn.Linear(n_embd, hidden_width),   # widen the feature space
            nn.ReLU(),                         # non-linearity between the two linear maps
            nn.Dropout(dropout_rate),          # regularization
            nn.Linear(hidden_width, n_embd),   # narrow back to the input width
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return ``x`` passed through the MLP; the last dimension stays ``n_embd``."""
        out = self.net(x)
        return out

In [7]:
class TimeSeriesTransformer(nn.Module):
    """Single-block attention model mapping a scalar series to a multi-step forecast.

    Each scalar time step is projected to ``n_embd`` features, a learned
    positional embedding is added, and the sequence passes through one
    multi-head self-attention layer and one feed-forward layer. Only the
    representation of the last time step is used to produce the forecast.

    Args:
        n_embd: Embedding width used throughout the model.
        block_size: Number of rows in the positional embedding table, i.e. the
            longest input sequence ``forward`` accepts.
        forecast_horizon: Number of values predicted per input sequence.
        num_heads: Number of attention heads; each head has size
            ``n_embd // num_heads``. Defaults to 4.
    """

    def __init__(self, n_embd, block_size, forecast_horizon, num_heads=4):
        super().__init__()
        self.forecast_horizon = forecast_horizon
        self.position_embedding_table = nn.Embedding(block_size, n_embd)  # Learned positional embeddings
        self.input_projection = nn.Linear(1, n_embd)  # Lifts each scalar observation to n_embd features
        self.sa_heads = MultiHeadAttention(num_heads, n_embd // num_heads, n_embd)  # Multi-head attention
        self.ffwd = FeedForward(n_embd)  # Feedforward neural network layer
        self.output_projection = nn.Linear(n_embd, forecast_horizon)  # One output per forecast step

    def forward(self, x):
        """Predict ``forecast_horizon`` values for each sequence in the batch.

        Args:
            x: Tensor of shape ``(B, T)`` holding one scalar per time step,
                with ``T`` no larger than ``block_size``. It does not need to
                be contiguous.

        Returns:
            Tensor of shape ``(B, forecast_horizon)``.

        Raises:
            IndexError: If ``T`` exceeds the positional embedding table size.
        """
        B, T = x.shape  # Batch size and sequence length

        # Fail early with a readable message; otherwise the embedding lookup
        # below raises an opaque index error (a device-side assert on CUDA).
        max_positions = self.position_embedding_table.num_embeddings
        if T > max_positions:
            raise IndexError(
                f"sequence length {T} exceeds block_size {max_positions}"
            )

        # (B, T) -> (B, T, 1) -> (B, T, n_embd). nn.Linear acts on the last
        # dimension, and unsqueeze (unlike view) also works on non-contiguous
        # inputs such as transposed or strided window batches.
        x = self.input_projection(x.unsqueeze(-1))

        # Add positional embeddings to provide context about the position in the sequence
        positions = torch.arange(T, device=x.device)
        x = x + self.position_embedding_table(positions).unsqueeze(0)  # Broadcast add across the batch

        # Apply self-attention across the sequence
        x = self.sa_heads(x)

        # Apply a feedforward network
        x = self.ffwd(x)

        # Project the embeddings to forecast horizon outputs: (B, T, forecast_horizon)
        output = self.output_projection(x)

        # Keep only the last time step's prediction: (B, forecast_horizon)
        return output[:, -1, :]
