In [1]:
import torch
import torch.nn as nn
import math

In [2]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network with ReLU activation and dropout.

    The input's last dimension is expanded from ``d_model`` to ``d_ff``,
    passed through ReLU and dropout, and then projected back to ``d_model``.

    Args:
        d_model: Size of the input and output feature dimension.
        d_ff: Size of the hidden (inner) feature dimension.
        dropout: Drop probability handed to ``nn.Dropout``.
    """

    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        # Expansion layer first, contraction layer second.
        self.linear1 = nn.Linear(in_features=d_model, out_features=d_ff)
        self.linear2 = nn.Linear(in_features=d_ff, out_features=d_model)
        # Applied to the hidden activations, between the two linear layers.
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        """Run the network on ``x``.

        Args:
            x: Input tensor, e.g. [batch_size, seq_len, d_model].

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``d_model``, e.g. [batch_size, seq_len, d_model].
        """
        hidden = self.linear1(x)            # [..., d_ff]
        activated = torch.relu(hidden)      # element-wise non-linearity
        regularised = self.dropout(activated)
        return self.linear2(regularised)    # [..., d_model]

In [3]:
# Seed PyTorch's global RNG so that the weight initialisation, the random
# input and the dropout mask below are reproducible on Restart & Run All.
torch.manual_seed(0)

# Define the dimensions and dropout rate
d_model = 512
d_ff = 2048
dropout = 0.1

# Create an instance of the FeedForward class
ff = FeedForward(d_model, d_ff, dropout)

# Create a random tensor to represent a batch of sequences
x = torch.rand(10, 20, d_model)  # batch_size=10, seq_len=20, d_model=512

# Pass the tensor through the feed-forward network.
# NOTE: the module is in its default training mode here, so dropout is active;
# call ff.eval() first if a dropout-free forward pass is wanted.
output = ff(x)

# The network must map [batch, seq, d_model] back to the same shape; check it
# instead of relying on eyeballing the printed value.
assert output.shape == x.shape, f"unexpected output shape: {tuple(output.shape)}"

print(output.shape)  # Prints: torch.Size([10, 20, 512])

torch.Size([10, 20, 512])


In [4]:
# Show the tensor produced by the forward pass above; a bare expression on the
# last line of a cell is rendered as that cell's output.
output

tensor([[[-1.8200e-01,  5.1137e-02, -1.6659e-01,  ..., -2.8889e-01,
          -9.1770e-02,  3.6541e-02],
         [-1.7460e-01,  2.6625e-01, -7.3331e-02,  ...,  4.2512e-02,
           3.6073e-02,  1.6579e-01],
         [-2.1463e-01,  1.1418e-01,  1.0631e-01,  ..., -1.2425e-01,
           7.1892e-03,  5.4625e-02],
         ...,
         [-7.2507e-02,  9.8096e-02, -1.0626e-01,  ..., -1.7386e-01,
          -9.4081e-02, -2.7624e-02],
         [ 7.8154e-03,  1.2174e-01, -1.5945e-01,  ..., -2.9642e-02,
           2.0368e-02, -6.9206e-02],
         [-5.3599e-03,  2.8912e-02, -1.1353e-01,  ..., -1.2720e-01,
           1.0075e-01,  7.8424e-02]],

        [[-1.1214e-01,  1.9493e-01, -1.0316e-01,  ..., -8.7668e-02,
           1.1519e-01, -7.4131e-02],
         [-1.6914e-01,  7.5344e-02, -8.6533e-02,  ..., -1.0805e-01,
          -9.1293e-02, -7.3672e-03],
         [-1.2851e-01,  9.8066e-02, -4.7582e-02,  ..., -2.1023e-01,
           7.1987e-02,  4.6370e-02],
         ...,
         [-2.8259e-02,  1