In [4]:
import sys
sys.path.append('../')
import torch
import torch.nn as nn
from transformer.layers import FeedForward, MultiHeadAttention, ResidualConnection
from transformer.encoder import EncoderLayer

# Smoke-test a single Transformer encoder layer: build it from its two
# sub-layers, push one random batch through it, and print input and output.

# Define config
d_model = 7  # feature dimension
d_ff = 2048  # feed-forward dimension
h = 1  # number of heads
batch_size = 1  # batch size
seq_len = 4  # sequence length
dropout = 0.1  # dropout ratio

# Seed BEFORE the sub-layers are constructed. Their constructors presumably
# draw initial weights from torch's global RNG; seeding only afterwards (as
# the previous version did) left the weights, and therefore the output,
# dependent on whatever RNG state earlier cells had left behind.
torch.manual_seed(68)

# Create an instance of the MultiHeadAttention and FeedForward classes
self_attention_engine = MultiHeadAttention(d_model, h, dropout)
feed_forward = FeedForward(d_model, d_ff, dropout)

# Create an instance of the EncoderLayer class
encoder_layer = EncoderLayer(d_model, self_attention_engine, feed_forward, dropout)

# Create a random tensor to represent a batch of sequences.
# Re-seed right before sampling so the input tensor is the same one this
# cell has always produced, regardless of how many random numbers the
# constructors above consumed.
torch.manual_seed(68)
x = torch.rand(batch_size, seq_len, d_model)

# Pass the tensor through the encoder layer.
# NOTE(review): the layer is never switched to eval mode, so if EncoderLayer
# is an nn.Module, dropout is presumably active in this forward pass - confirm
# that this is intended for the demo.
output = encoder_layer(x)

print("Initial input tensor: \n", x)
print("Output's shape: \n", output.shape)
print("Output: \n", output)

Initial input tensor: 
 tensor([[[0.3991, 0.5521, 0.1004, 0.2844, 0.9998, 0.7077, 0.8031],
         [0.2066, 0.3589, 0.8509, 0.8753, 0.4669, 0.6566, 0.6026],
         [0.2785, 0.1350, 0.2257, 0.9548, 0.8214, 0.1386, 0.6055],
         [0.2300, 0.7895, 0.4098, 0.0428, 0.4400, 0.2381, 0.4967]]])
Output's shape: 
 torch.Size([1, 4, 7])
Output: 
 tensor([[[-0.2203,  0.2398, -0.0214,  0.5136,  1.0576,  0.1782,  0.9693],
         [ 0.6216,  0.1951,  0.3656,  0.7347,  0.6968,  0.4106,  0.6728],
         [ 0.3728,  0.1653, -0.5884,  1.1858,  0.9430, -0.0743,  0.6148],
         [ 0.0200,  0.4668,  0.3478, -0.3776,  0.4105, -0.3440,  0.5031]]],
       grad_fn=<AddBackward0>)
