In [3]:
import sys
sys.path.append('../')
import torch
from transformer.layer import MultiHeadAttention, FeedForward, LayerNorm
from transformer.encoder import EncoderLayer, Encoder

# Demo: build a small Encoder from the project's transformer package and run
# one random batch through it, printing the input, the output and its shape.

# Define configuration
d_model = 6 # feature dimension (last axis of the input tensor)
h = 3 #  number of heads
# NOTE(review): 6 / 3 gives 2 features per head; presumably MultiHeadAttention
# requires d_model to be divisible by h -- confirm against its implementation.
dropout = 0.1 # dropout ratio
d_ff = 2048 # the dimension of the feed forward network
batch = 1 # batch size
seq_len = 4 # sequence length

num_layers = 3 # number of encoder layers

# Create an instance of the MultiHeadAttention and FeedForward classes
self_attention_engine = MultiHeadAttention(d_model, h, dropout)
feed_forward = FeedForward(d_model, d_ff, dropout)  

# Create an instance of the EncoderLayer class from the two sub-modules above
encoder_layer = EncoderLayer(d_model, self_attention_engine, feed_forward, dropout)

# Create an instance of the Encoder class
# NOTE(review): one encoder_layer instance is passed together with
# num_layers; whether Encoder copies it per layer or shares it is not
# visible from this cell -- confirm in transformer.encoder.
encoder = Encoder(d_model, encoder_layer, num_layers)

# Create a random tensor to represent a batch of sequences
# The seed is set here, i.e. after all modules above were constructed, so it
# pins the values sampled for x below.
# NOTE(review): any random initialisation done inside the constructors above
# happened before this call and is therefore not covered by this seed --
# confirm whether the encoder output is expected to be reproducible.
torch.manual_seed(68) # for reproducible result of random process
x = torch.rand(batch, seq_len, d_model)  # shape (batch, seq_len, d_model) = (1, 4, 6)

# Pass the tensor through the encoder
# NOTE(review): the encoder is not switched to eval mode and the call is not
# wrapped in torch.no_grad(); if these classes are torch.nn.Module subclasses,
# dropout would be active and autograd would track this call -- TODO confirm
# that this is intended for the demo.
output = encoder(x)

print("Initial input tensor: \n", x)
print("Encoder Output: \n", output)
print("Encoder Output's shape: \n", output.shape)  

Initial input tensor: 
 tensor([[[0.3991, 0.5521, 0.1004, 0.2844, 0.9998, 0.7077],
         [0.8031, 0.2066, 0.3589, 0.8509, 0.8753, 0.4669],
         [0.6566, 0.6026, 0.2785, 0.1350, 0.2257, 0.9548],
         [0.8214, 0.1386, 0.6055, 0.2300, 0.7895, 0.4098]]])
Encoder Output: 
 tensor([[[-0.3974, -0.2587, -1.6474,  1.0940,  0.8627,  0.3469],
         [-0.1545,  0.4200, -1.9121,  0.9450,  0.3031,  0.3985],
         [ 0.5027,  1.2010, -1.7493, -0.3809,  0.2800,  0.1464],
         [-0.2241,  0.6265, -1.7504,  0.7316,  0.9280, -0.3115]]],
       grad_fn=<AddBackward0>)
Encoder Output's shape: 
 torch.Size([1, 4, 6])
