<a href="https://colab.research.google.com/github/vardhanreddy2003/GPT-2Training/blob/main/128GPT_testModel(Structure).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [168]:
import torch
import torch.nn as nn

In [169]:
class GeLu(nn.Module):

  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh((torch.sqrt(torch.tensor(2/torch.pi))*(x+0.044715*(x**3)))))

In [170]:
Gelu=GeLu()
class FeedForwardNetwork(nn.Module):
  def __init__(self,dim):
    super().__init__()
    self.network=nn.Sequential(
        nn.Linear(dim,4*dim),
        Gelu,
        nn.Linear(4*dim,dim)
    )

  def forward(self,X):

    return self.network(X)

In [171]:
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.epis=1e-5
    self.scale=nn.Parameter(torch.ones(emb_dim))
    self.shift=nn.Parameter(torch.zeros(emb_dim))

  def forward(self,X):

    self.mean=X.mean(dim=-1,keepdim=True)
    self.variance=X.var(dim=-1,keepdim=True,unbiased=True)
    norm_x=(X-self.mean)/torch.sqrt(self.variance+self.epis)

    return self.scale*norm_x+self.shift

In [172]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [173]:
gpt_config = {
    "vocab_size":50257,
    "emb_dim":768,
    "num_heads":12,
    "dropout_rate":0.1,
    "qkv_bias":False,
    "layers":12,
    "context_length":1024
}

In [174]:
class TransfomerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

    self.attention= MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["num_heads"],
            dropout=cfg["dropout_rate"],
            qkv_bias=cfg["qkv_bias"])

    self.feedForwardNetwork=FeedForwardNetwork(cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.layernorm1=LayerNorm(cfg["emb_dim"])
    self.layernorm2=LayerNorm(cfg["emb_dim"])

  def forward(self,X):
    shortcut=X
    X=self.layernorm1(X)
    X=self.attention(X)
    X=self.dropout(X)
    X=X+shortcut
    shortcut=X
    X=self.layernorm2(X)
    X=self.feedForwardNetwork(X)
    X=self.dropout(X)
    X=X+shortcut

    return X

In [175]:
a=[0,1,2,3,4]

In [176]:
for i in range(0,5):
  print(a[i])

0
1
2
3
4


In [177]:
class GPTmodel(nn.Module):

  def __init__(self,cfg):
    super().__init__()
    self.token_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.final_norm=LayerNorm(cfg["emb_dim"])
    self.final_output=nn.Linear(cfg["emb_dim"],cfg["vocab_size"])
    self.transformer_blocks=nn.Sequential(*[TransfomerBlock(cfg) for _ in range(cfg["layers"])])

  def forward(self,X):

        batch_size,seq_length=X.shape
        token_emb=self.token_emb(X)
        pos_emb=self.pos_emb(torch.arange(seq_length, device=X.device))
        token_emb=token_emb+pos_emb
        X=self.dropout(token_emb)
        X=self.transformer_blocks(X)

        X=self.final_norm(X)
        out=self.final_output(X)

        return out

In [178]:
batch=torch.tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
batch.shape

torch.Size([2, 4])

In [179]:
torch.manual_seed(123)
model = GPTmodel(gpt_config)
out = model(batch)
print("Input batch:\n", batch)
print("\nOutput shape:", out.shape)
print(out)

Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.1918,  0.5753, -0.8680,  ...,  0.0272, -0.2843, -0.1285],
         [ 0.3604,  0.3861, -0.2579,  ...,  0.3328,  0.5699,  0.2649],
         [ 0.7830,  0.9615, -0.5922,  ...,  1.1399, -0.0289,  0.6445],
         [ 1.2411,  0.8811, -0.6082,  ...,  0.0386,  0.6466, -0.5378]],

        [[-0.3454,  0.6863,  0.1207,  ..., -0.2531,  0.0789, -0.0910],
         [-0.0495,  0.5027,  0.0972,  ...,  0.1119,  0.1349,  0.7874],
         [ 0.4802,  0.0215, -0.0377,  ..., -0.6652,  0.3223,  0.5614],
         [ 0.8376, -1.0061,  0.0358,  ...,  0.7003,  0.4913, -0.3672]]],
       grad_fn=<ViewBackward0>)


In [180]:
total_params=sum(p.numel() for p in model.parameters())
total_params

163059793

In [181]:
def next_word_prediction(model,context_length,number_of_outputs,input):

  for _ in range(number_of_outputs):
    input1=input[:,-context_length:]

    with torch.no_grad():
      output=model(input1)

    out=output[:,-1,:]

    prob=torch.softmax(out,dim=-1)

    max_prob=torch.argmax(prob,dim=-1,keepdim=True)

    input=torch.cat((input,max_prob),dim=-1)

  return input

In [182]:
input=torch.tensor([[1,2,3,4],[5,6,7,8]])
out=next_word_prediction(model,3,3,input)
print(out.shape)

torch.Size([2, 7])
