<a href="https://colab.research.google.com/github/vardhanreddy2003/GPT-2Training/blob/main/gpt2_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
!pip install torch
!pip install tiktoken



In [29]:
import torch
from torch.utils.data import Dataset,DataLoader

In [30]:
import torch.nn as nn

In [31]:
class GPTDataset(Dataset):

  def __init__(self,txt,tokenizer,max_length,stride):

    self.input_ids=[]
    self.output_ids=[]

    token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

    for i in range(0,len(token_ids)-max_length,stride):
      input_chunk=token_ids[i:i+max_length]
      output_chunk=token_ids[i+1:i+max_length+1]

      self.input_ids.append(torch.tensor(input_chunk))
      self.output_ids.append(torch.tensor(output_chunk))


  def __len__(self):
    return len(self.input_ids)


  def __getitem__(self, idx):
    return self.input_ids[idx],self.output_ids[idx]




In [32]:
import tiktoken
tokenizer=tiktoken.get_encoding("gpt2")


In [33]:
with open('verdict.txt.txt', 'r') as f:
    verdict_text = f.read()

In [34]:
gpt_config = {
    "vocab_size":50257,
    "emb_dim":768,
    "num_heads":12,
    "dropout_rate":0.1,
    "qkv_bias":False,
    "layers":12,
    "context_length":512,
    "stride":128
}

In [35]:
def create_dataloader(text,tokenizer,batch_size,drop_last,gpt_config):

  dataset=GPTDataset(verdict_text,tokenizer,gpt_config["context_length"],gpt_config["stride"])
  dataloader=DataLoader(dataset,shuffle=True,batch_size=batch_size,drop_last=True)

  return dataloader


In [36]:

train_ratio=0.90
split_idx=int(train_ratio*(len(verdict_text)))
train_data=verdict_text[:split_idx]
test_data=verdict_text[split_idx:]

In [37]:
train_dataloader=create_dataloader(train_data,tokenizer,batch_size=2,drop_last=True,gpt_config=gpt_config)
test_dataloader=create_dataloader(test_data,tokenizer,batch_size=2,drop_last=True,gpt_config=gpt_config)

In [38]:
#Masked Multi Head Attention
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out, context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"

        self.d_out = d_out
        self.num_heads = num_heads
        self.head_dim = d_out // num_heads # Reduce the projection dim to match desired output dim

        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.out_proj = nn.Linear(d_out, d_out)  # Linear layer to combine head outputs
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape

        keys = self.W_key(x) # Shape: (b, num_tokens, d_out)
        queries = self.W_query(x)
        values = self.W_value(x)

        # We implicitly split the matrix by adding a `num_heads` dimension
        # Unroll last dim: (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(b, num_tokens, self.num_heads, self.head_dim)

        # Transpose: (b, num_tokens, num_heads, head_dim) -> (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)
        queries = queries.transpose(1, 2)
        values = values.transpose(1, 2)

        # Compute scaled dot-product attention (aka self-attention) with a causal mask
        attn_scores = queries @ keys.transpose(2, 3)  # Dot product for each head

        # Original mask truncated to the number of tokens and converted to boolean
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]

        # Use the mask to fill attention scores
        attn_scores.masked_fill_(mask_bool, -torch.inf)

        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)

        # Combine heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(b, num_tokens, self.d_out)
        context_vec = self.out_proj(context_vec) # optional projection

        return context_vec

In [39]:
#Layer Normalization
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.epis=1e-5
    self.scale=nn.Parameter(torch.ones(emb_dim))
    self.shift=nn.Parameter(torch.zeros(emb_dim))

  def forward(self,X):

    self.mean=X.mean(dim=-1,keepdim=True)
    self.variance=X.var(dim=-1,keepdim=True,unbiased=True)
    norm_x=(X-self.mean)/torch.sqrt(self.variance+self.epis)

    return self.scale*norm_x+self.shift

In [40]:
#Gelu activation
class GeLu(nn.Module):

  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh((torch.sqrt(torch.tensor(2/torch.pi))*(x+0.044715*(x**3)))))

In [41]:
# feed forward network
Gelu=GeLu()
class FeedForwardNetwork(nn.Module):
  def __init__(self,dim):
    super().__init__()
    self.network=nn.Sequential(
        nn.Linear(dim,4*dim),
        Gelu,
        nn.Linear(4*dim,dim)
    )

  def forward(self,X):

    return self.network(X)

In [42]:
#transformer block
class TransfomerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()

    self.attention= MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["num_heads"],
            dropout=cfg["dropout_rate"],
            qkv_bias=cfg["qkv_bias"])

    self.feedForwardNetwork=FeedForwardNetwork(cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.layernorm1=LayerNorm(cfg["emb_dim"])
    self.layernorm2=LayerNorm(cfg["emb_dim"])

  def forward(self,X):
    shortcut=X
    X=self.layernorm1(X)
    X=self.attention(X)
    X=self.dropout(X)
    X=X+shortcut
    shortcut=X
    X=self.layernorm2(X)
    X=self.feedForwardNetwork(X)
    X=self.dropout(X)
    X=X+shortcut

    return X

In [43]:
#GPT model architecture
class GPTmodel(nn.Module):

  def __init__(self,cfg):
    super().__init__()
    self.token_emb=nn.Embedding(cfg["vocab_size"],cfg["emb_dim"])
    self.pos_emb=nn.Embedding(cfg["context_length"],cfg["emb_dim"])
    self.dropout=nn.Dropout(cfg["dropout_rate"])
    self.final_norm=LayerNorm(cfg["emb_dim"])
    self.final_output=nn.Linear(cfg["emb_dim"],cfg["vocab_size"])
    self.transformer_blocks=nn.Sequential(*[TransfomerBlock(cfg) for _ in range(cfg["layers"])])

  def forward(self,X):

        batch_size,seq_length=X.shape
        token_emb=self.token_emb(X)
        pos_emb=self.pos_emb(torch.arange(seq_length, device=X.device))
        token_emb=token_emb+pos_emb
        X=self.dropout(token_emb)
        X=self.transformer_blocks(X)

        X=self.final_norm(X)
        out=self.final_output(X)

        return out

In [44]:
def next_word_prediction(input,model,max_outputs,context_length):

  for i in range(0,max_outputs):
    inp=input[:,-context_length]

    with torch.no_grad():
     out=model(input)

    logits=out[:,-1,:]

    prob=torch.softmax(logits,dim=-1)

    max_prob=torch.argmax(prob,dim=-1,keepdim=True)
    input=torch.cat((input,max_prob),dim=-1)

  return input


In [45]:
#loss calculation
def calc_loss_batch(model,input_batch,target_batch):

  output=model(input_batch)
  loss_fn = nn.CrossEntropyLoss()
  loss = loss_fn(output.flatten(0,1), target_batch.flatten())
  return loss

In [46]:
#model training
def model_train(model,train_dataloader,epochs,optimizer):

    for i in range(0,epochs):
      total_loss=0
      model.train()
      for input_batch,target_batch in train_dataloader:
        optimizer.zero_grad()
        loss=calc_loss_batch(input_batch=input_batch,target_batch=target_batch,model=model)
        loss.backward()
        total_loss+=loss.item()
        optimizer.step()
        print(f"Epoch {i+1}, Loss: {total_loss/len(train_dataloader)}")





In [47]:
import torch.optim as optim

In [None]:
#model features
gpt_config = {
    "vocab_size":50257,
    "emb_dim":768,
    "num_heads":12,
    "dropout_rate":0.1,
    "qkv_bias":False,
    "layers":12,
    "context_length":512,
    "stride":128
}
epochs=25
model=GPTmodel(gpt_config)
optimizer=optim.AdamW(model.parameters(),lr=0.0004)

model_train(model,train_dataloader,epochs,optimizer)

Epoch 1, Loss: 0.6108291943868002
Epoch 1, Loss: 1.1621811654832628
Epoch 1, Loss: 1.6744597752888997
Epoch 1, Loss: 2.180586655934652
Epoch 1, Loss: 2.6730238596598306
Epoch 1, Loss: 3.1307675573560925
Epoch 1, Loss: 3.56172874238756
Epoch 1, Loss: 3.99062532848782
Epoch 1, Loss: 4.399868382347955
Epoch 1, Loss: 4.784520228703816
Epoch 1, Loss: 5.168449666765001
Epoch 1, Loss: 5.548922432793511
Epoch 1, Loss: 5.889835887485081
Epoch 1, Loss: 6.232255670759413
Epoch 1, Loss: 6.588652743233575
Epoch 1, Loss: 6.92331698205736
Epoch 1, Loss: 7.262041939629449


In [None]:
def model_evaluation(model,test_dataloader):
  model.eval()

  for input_batch,target_batch in test_dataloader:
    with torch.no_grad():
      loss=calc_loss_batch(input_batch,target_batch,model)
      print(loss)

  model_train()

In [None]:
#apply tokenization before these to you sentence
def next_word_prediction(model,number_outputs,context_size,input):

  for i in range(0,number_outputs):
    inp=input[:,-context_size:]

    output=model(inp)
    logits=inp[:,-1,:]
    max_prob=torch.argmax(logits,dim=-1)
    input=torch.cat((input,max_prob),dim=-1)
  return input
#apply tokenization to decode the sentences