<a href="https://colab.research.google.com/github/vardhanreddy2003/GPT-2Training/blob/main/128GPT_testModel(Structure).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [117]:
import torch
import torch.nn as nn

In [118]:
class GeLu(nn.Module):

  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh((torch.sqrt(torch.tensor(2/torch.pi))*(x+0.044715*(x**3)))))

In [119]:
Gelu=GeLu()
class FeedForwardNetwork(nn.Module):
  def __init__(self,dim):
    super().__init__()
    self.network=nn.Sequential(
        nn.Linear(dim,4*dim),
        Gelu,
        nn.Linear(4*dim,dim)
    )

  def forward(self,X):

    return self.network(X)

In [120]:
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.epis=1e-5
    self.scale=nn.Parameter(torch.ones(emb_dim))
    self.shift=nn.Parameter(torch.zeros(emb_dim))

  def forward(self,X):

    self.mean=X.mean(dim=-1,keepdim=True)
    self.variance=X.var(dim=-1,keepdim=True,unbiased=True)
    norm_x=(X-self.mean)/torch.sqrt(self.variance+self.epis)

    return self.scale*norm_x+self.shift

In [121]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [122]:
gpt_config = {
    "vocab_size":50257,
    "emb_dim":768,
    "num_heads":12,
    "dropout_rate":0.1,
    "qkv_bias":False,
    "layers":12,
    "context_length":1024
}

In [123]:
class TransfomerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

    self.attention= MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["num_heads"],
            dropout=cfg["dropout_rate"],
            qkv_bias=cfg["qkv_bias"])

    self.feedForwardNetwork=FeedForwardNetwork(cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.layernorm1=LayerNorm(cfg["emb_dim"])
    self.layernorm2=LayerNorm(cfg["emb_dim"])

  def forward(self,X):
    shortcut=X
    X=self.layernorm1(X)
    X=self.attention(X)
    X=self.dropout(X)
    X=X+shortcut
    shortcut=X
    X=self.layernorm2(X)
    X=self.feedForwardNetwork(X)
    X=self.dropout(X)
    X=X+shortcut

    return X

In [124]:
a=[0,1,2,3,4]

In [125]:
for i in range(0,5):
  print(a[i])

0
1
2
3
4


In [126]:
class GPTmodel(nn.Module):

  def __init__(self,cfg):
    super().__init__()
    self.token_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.final_norm=LayerNorm(cfg["emb_dim"])
    self.final_output=nn.Linear(cfg["emb_dim"],cfg["vocab_size"])
    self.transformer_blocks=nn.Sequential(*[TransfomerBlock(cfg) for _ in range(cfg["layers"])])

  def forward(self,X):
      for i in range(0,len(X)):
        batch_size,seq_length=X.shape
        token_emb=self.token_emb(X)
        pos_emb=self.pos_emb(torch.arange(seq_length, device=X.device))
        token_emb=token_emb+pos_emb
        X=self.dropout(token_emb)
        #transformer block

        X=self.final_norm(X)
        out=self.final_output(X)
        return out

In [127]:
batch=torch.tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

In [128]:
torch.manual_seed(123)
model = GPTmodel(gpt_config)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[-8.3109e-01,  2.6527e-02, -8.0718e-01,  ..., -9.0306e-01,
          -2.5913e-01, -1.8973e-01],
         [ 1.3201e-01,  1.5231e-01, -3.5482e-01,  ..., -1.8054e-01,
           9.6092e-01,  4.1698e-01],
         [ 2.1863e-01,  1.1403e+00, -4.0362e-01,  ...,  1.0297e+00,
          -1.5754e-01,  3.6924e-01],
         [ 5.2411e-01,  1.1513e+00, -4.3765e-01,  ...,  4.8582e-01,
           2.8523e-01, -6.6840e-01]],

        [[-1.1027e+00, -1.7491e-02, -1.6026e-01,  ..., -8.6231e-01,
           3.0753e-02, -3.2339e-02],
         [-2.9952e-01,  4.1240e-01, -8.3775e-04,  ...,  1.1332e-02,
           1.6760e-01,  8.5637e-01],
         [ 4.1789e-01,  5.8615e-01, -1.6942e-01,  ..., -8.1877e-02,
           1.8657e-01,  4.3974e-01],
         [ 2.0327e-01, -8.7712e-01,  1.3883e-01,  ...,  1.3204e+00,
          -1.7665e-01, -1.6840e-01]]], grad_fn=<ViewBackward0>)
