In [1]:
import torch
import torch.nn as nn

In [2]:
# Hyperparameter table looked up by string key by the modules in this notebook
# (for example, FeedForward reads "emb_dim" to size its linear layers).
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50257,
    # Context length
    "context_length": 1024,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of layers
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Query-Key-Value bias
    "qkv_bias": False,
}

In [3]:
class GELU(nn.Module):
    """Tanh-approximated GELU activation, applied element-wise.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    The module has no parameters and takes no constructor arguments.
    """

    # sqrt(2 / pi) as a plain Python float, evaluated once at class creation.
    # Keeping it a Python scalar (instead of building a float32 tensor on every
    # forward call) avoids a per-call allocation and lets the constant be used
    # at the input's own precision, so float64 inputs are not limited to
    # float32 accuracy.
    _SQRT_2_OVER_PI = (2.0 / torch.pi) ** 0.5

    def forward(self, x):
        """Return the activation of ``x``.

        Args:
            x: Floating-point tensor of any shape.

        Returns:
            Tensor with the same shape and dtype as ``x``.
        """
        inner = self._SQRT_2_OVER_PI * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))


In [4]:
class FeedForward(nn.Module):
    """Two-layer feed-forward block: Linear -> GELU -> Linear.

    The first linear layer expands ``cfg["emb_dim"]`` features to four times
    that width; the second projects back to ``cfg["emb_dim"]``.
    """

    def __init__(self, cfg):
        """Build the layer stack.

        Args:
            cfg: Mapping that provides an integer ``"emb_dim"`` entry; no other
                key is read here.
        """
        super().__init__()
        width = cfg["emb_dim"]
        expanded = 4 * width
        self.layers = nn.Sequential(
            nn.Linear(width, expanded),
            GELU(),
            nn.Linear(expanded, width),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result.

        Args:
            x: Tensor whose last dimension equals ``cfg["emb_dim"]``.
        """
        projected = self.layers(x)
        return projected

In [5]:
# Seed PyTorch's global RNG before the layers are created, so the randomly
# initialised weights (and random tensors drawn in later cells) are identical
# on every Restart & Run All. Outputs saved from earlier unseeded runs will not
# match a fresh run.
torch.manual_seed(123)
ff = FeedForward(GPT_CONFIG_124M)

In [6]:
# Random stand-in batch: 3 rows of 768 features each (768 is the "emb_dim"
# value in GPT_CONFIG_124M, which is the width FeedForward expects).
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the session; renaming it requires updating the cells that read it.
input = torch.randn((3, 768))

In [7]:
# Bare expression: the notebook renders the tensor's values as the cell output.
input

tensor([[-5.9991e-02,  3.3799e-01,  6.6761e-01,  ..., -1.0431e+00,
         -8.6129e-01, -1.9742e-01],
        [-1.8442e+00,  1.0720e-01,  1.1303e+00,  ..., -2.4038e+00,
          1.7626e-01, -3.8739e-01],
        [-1.2159e+00, -9.4111e-01,  6.2405e-02,  ...,  1.0125e+00,
         -1.5164e-03, -5.9532e-01]])

In [8]:
# Forward pass of the random batch through the feed-forward block.
output = ff(input)

In [9]:
# Bare expression: the notebook renders the result of the forward pass.
output

tensor([[ 0.1353,  0.1653,  0.0597,  ...,  0.3380, -0.1650,  0.0087],
        [ 0.1163,  0.2080, -0.0989,  ...,  0.2634, -0.1391,  0.2591],
        [ 0.0163,  0.1924,  0.0421,  ..., -0.1102, -0.0835, -0.0677]],
       grad_fn=<AddmmBackward0>)

In [10]:
# Apply the same block a second time to its own output; this works because the
# block maps emb_dim features back to emb_dim features.
# NOTE(review): the name `next` shadows Python's built-in next() for the rest
# of the session; renaming it requires updating the cell that displays it.
next = ff(output)

In [11]:
# Bare expression: the notebook renders the result of the second pass.
next

tensor([[ 0.0286, -0.0402,  0.0016,  ..., -0.0283,  0.0271,  0.0238],
        [ 0.0421,  0.0355,  0.0152,  ..., -0.0142,  0.0288, -0.0144],
        [ 0.0180, -0.0922,  0.0025,  ..., -0.0013,  0.0125, -0.0418]],
       grad_fn=<AddmmBackward0>)