# Transformer Block and Attention Mechanism

In [7]:
import torch
import torch.nn as nn
import myllm.attention as att
import myllm.layers as layers



In [8]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward sub-layers.

    Each sub-layer is applied as ``x + dropout(sublayer(norm(x)))``, i.e. the
    input is normalized *before* the sub-layer and the un-normalized input is
    added back afterwards as a residual (shortcut) connection.

    Args:
        cfg: Mapping of hyper-parameters. The keys read here are
            ``"emb_dim"``, ``"n_heads"``, ``"drop_rate"``, ``"qkv_bias"``
            and ``"context_length"``.
    """

    def __init__(self, cfg: dict) -> None:
        super().__init__()
        self.att = att.MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
            # NOTE(review): the keyword is spelled "context_lenght" while the
            # cfg key is "context_length". It is left unchanged because it
            # must match MultiHeadAttention's signature, which is not visible
            # from this file -- confirm and fix both sides together.
            context_lenght=cfg["context_length"],
        )
        self.ff = layers.FeedForward(cfg["emb_dim"])
        self.norm1 = layers.LayerNorm(cfg["emb_dim"])
        self.norm2 = layers.LayerNorm(cfg["emb_dim"])
        # A single dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the attention and feed-forward sub-layers with residual adds.

        Args:
            x: Input tensor. Presumably shaped (batch, seq_len, emb_dim) --
                TODO confirm against MultiHeadAttention. Each sub-layer's
                output must be addable to its input.

        Returns:
            The result of the second residual addition.
        """
        # Attention sub-layer with residual connection.
        residual = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + residual

        # Feed-forward sub-layer with residual connection.
        residual = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        # The saved input is a tensor, not a callable: add it directly.
        x = x + residual

        return x