# In [None]:
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to an input tensor.

    A table of encodings for ``max_len`` positions is computed once in the
    constructor and stored as the buffer ``pe`` with shape
    ``(max_len, 1, d_model)``. Even feature columns hold
    ``sin(position * div_term)`` and odd feature columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: Size of the feature (last) dimension. Both even and odd
            values are supported.
        max_len: Number of positions to precompute.
    """

    def __init__(self, d_model: int, max_len: int = 512) -> None:
        super().__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)
        )
        # One column of angles per even feature index:
        # shape (max_len, ceil(d_model / 2)).
        angles = position * div_term

        pe[:, 0::2] = torch.sin(angles)
        # When d_model is odd there is one fewer odd column than even
        # columns, so the last frequency is dropped for the cosine half.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # (max_len, d_model) -> (max_len, 1, d_model), so the table
        # broadcasts across dim 1 of the input in forward().
        pe = pe.unsqueeze(1)

        # Register as a buffer (saved and moved with the module, but not a
        # trainable parameter).
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the encodings of its first ``x.size(0)`` positions.

        Dim 0 of ``x`` is treated as the position axis; the encoding table is
        broadcast over dim 1. ``x.size(0)`` should not exceed ``max_len``.
        """
        return x + self.pe[:x.size(0), :]

# Example usage:
# d_model = 512  # Dimension of the model
# max_seq_length = 100  # Maximum sequence length
# batch_size = 32  # Number of sequences per batch
# pos_encoder = PositionalEncoding(d_model, max_seq_length)
#
# # 'x' is the input tensor of shape (seq_len, batch_size, d_model)
# x = torch.randn(max_seq_length, batch_size, d_model)
# x = pos_encoder(x)

class Embeddings(nn.Module):
    """Token embedding lookup whose output is scaled by ``sqrt(d_model)``.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of each embedding vector.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.lut = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        """Look up the embeddings for indices ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        looked_up = self.lut(x)
        return looked_up * scale

# Example usage:
# vocab_size = 10000  # Size of the vocabulary
# d_model = 512  # Dimension of the model
# embedding = Embeddings(vocab_size, d_model)
# x = torch.LongTensor([[1, 2, 3, 4], [5, 6, 7, 8]])  # Example input
# output = embedding(x)
# print(output.shape)  # torch.Size([2, 4, 512]), i.e. (batch_size, sequence_length, d_model)

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, max_seq_length, pos