## Let's implement the GPT-2 paper (124M)


In [None]:
!pip install tiktoken torch transformers accelerate

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.8.0


In [None]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
import time
import tiktoken
import gc

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused query/key/value projection.

    ``config`` must expose ``n_embd``, ``n_head`` and ``block_size``;
    ``n_embd`` has to be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        embd = config.n_embd
        ctx = config.block_size

        # a single linear layer produces queries, keys and values for all heads
        self.c_attn = nn.Linear(embd, 3 * embd)
        # projection applied once the heads have been merged back together
        self.c_proj = nn.Linear(embd, embd)
        # marker attribute: GPT2._init_weights shrinks the init std of any
        # Linear carrying it, so the residual stream variance does not grow
        # with the number of layers
        self.c_proj.GPT2_SCALE_INIT = 1

        self.n_head = config.n_head
        self.n_embd = embd

        # lower-triangular causal mask shaped (1, 1, block_size, block_size).
        # forward() does not read it (the fused attention call masks on its
        # own); it is still registered, so it stays part of the state dict.
        lower_tri = torch.tril(torch.ones(ctx, ctx))
        self.register_buffer("bias", lower_tri.view(1, 1, ctx, ctx))

    def forward(self, x):
        """Apply causal self-attention to ``x`` of shape (B, T, C); returns (B, T, C)."""
        # B: batch, T: sequence length, C: embedding dimension
        B, T, C = x.size()
        head_dim = C // self.n_head

        # (B, T, C) -> (B, T, 3C), then cut into three (B, T, C) tensors
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # give every head its own slice: (B, T, C) -> (B, n_head, T, head_dim)
        q, k, v = (
            t.view(B, T, self.n_head, head_dim).transpose(1, 2)
            for t in (q, k, v)
        )

        # fused equivalent of softmax(q @ k^T / sqrt(head_dim) + causal mask) @ v
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # merge the heads again: (B, n_head, T, head_dim) -> (B, T, C)
        merged = y.transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)

# ------------------------
class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, apply tanh-GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)
        # marker attribute checked by GPT2._init_weights to scale down the init std
        self.c_proj.GPT2_SCALE_INIT = 1

    def forward(self, x):
        """Map ``x`` (..., n_embd) to a tensor of the same shape."""
        expanded = self.c_fc(x)
        activated = self.gelu(expanded)
        return self.c_proj(activated)
class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run the layer on ``x``; the result has the same shape as ``x``."""
        # normalisation is applied before each sub-layer, the residual path is left untouched
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

@dataclass
class GPT2Config:
    """Hyper-parameters consumed by the GPT2 model and its sub-modules.

    The defaults are the same values that ``GPT2.from_pretrained('gpt2')``
    uses for the pretrained checkpoint.

    Attributes:
        block_size: maximum sequence length the model can process.
        vocab_size: number of tokens in the vocabulary.
        n_layer: number of transformer blocks stacked in the model.
        n_head: number of attention heads per block.
        n_embd: embedding (hidden) dimension; must be divisible by n_head.
    """

    block_size: int = 1024   # also the size of the position-embedding table
    vocab_size: int = 50257  # also the size of the token-embedding table
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768


class GPT2(nn.Module):
    """GPT-2 language model.

    Token and position embeddings are summed, passed through ``n_layer``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary
    logits by ``lm_head``. ``lm_head`` shares its weight tensor with the
    token embedding ``transformer.wte``.

    ``config`` must provide ``vocab_size``, ``block_size``, ``n_layer`` and
    ``n_embd`` (plus whatever ``Block`` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),
            'wpe': nn.Embedding(config.block_size, config.n_embd),
            'h': nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            'ln_f': nn.LayerNorm(config.n_embd),
        })
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight sharing scheme: input embedding and output projection are the same tensor
        self.lm_head.weight = self.transformer.wte.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module in place (called for every module via ``self.apply``).

        Linear and Embedding weights are drawn from N(0, 0.02^2) and Linear
        biases are zeroed. A Linear flagged with ``GPT2_SCALE_INIT`` uses
        std ``0.02 / sqrt(2 * n_layer)``; the factor 2 matches the two
        flagged projections (attention and MLP) that each block adds to the
        residual stream. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'GPT2_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx):
        """Compute next-token logits.

        Args:
            idx: integer tensor of token ids with shape (B, T), T <= block_size.

        Returns:
            Logits of shape (B, T, vocab_size).

        Raises:
            AssertionError: if T exceeds ``config.block_size``.
        """
        B, T = idx.size()
        # report the configured limit rather than a hard-coded 1024
        assert T <= self.config.block_size, (
            f"cannot forward sequence of length {T}, "
            f"block size is {self.config.block_size}"
        )

        # forward the token and position embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # (T,)
        pos_emb = self.transformer.wpe(pos)  # (T, n_embd)
        tok_emb = self.transformer.wte(idx)  # (B, T, n_embd)
        x = pos_emb + tok_emb  # position embeddings broadcast over the batch

        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)

        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)
        return logits

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a model and load the pretrained Hugging Face GPT-2 weights into it.

        Used to check that this implementation reproduces the reference model.

        Args:
            model_type: must be ``'gpt2'``.

        Returns:
            An instance of ``cls`` whose parameters were copied from
            ``transformers.GPT2LMHeadModel.from_pretrained(model_type)``.

        Raises:
            AssertionError: on an unsupported ``model_type`` or when the
                parameter names/shapes of the two models do not line up.
        """
        # Ensure we only load the 'gpt2' model type.
        assert model_type == 'gpt2', "Invalid model type. Only 'gpt2' is supported."

        from transformers import GPT2LMHeadModel
        print(f"Loading weights from pretrained GPT: {model_type}")

        # Fixed hyper-parameters of the 'gpt2' checkpoint
        config = GPT2Config(
            vocab_size=50257,
            block_size=1024,
            n_layer=12,
            n_head=12,
            n_embd=768,
        )
        # use cls so that subclasses get an instance of their own type
        model = cls(config)
        weights = model.state_dict()

        # the causal-mask buffer is not a learned parameter; leave it out
        weights_keys = [k for k in weights if not k.endswith('.attn.bias')]

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        weights_hf = model_hf.state_dict()

        # same on the Hugging Face side: drop mask buffers
        weights_keys_hf = [
            k for k in weights_hf
            if not k.endswith(('.attn.masked_bias', '.attn.bias'))
        ]

        # These layers are Conv1D modules in the Hugging Face model while we
        # use nn.Linear, so their weight matrices are stored transposed.
        transposed = (
            'attn.c_attn.weight',
            'attn.c_proj.weight',
            'mlp.c_fc.weight',
            'mlp.c_proj.weight',
        )

        # Ensure the number of parameters matches between the two models
        assert len(weights_keys_hf) == len(weights_keys), f"Mismatched keys: {len(weights_keys_hf)} != {len(weights_keys)}"
        # equal counts do not guarantee equal names; fail clearly instead of with a KeyError
        unknown = [k for k in weights_keys_hf if k not in weights]
        assert not unknown, f"Keys missing from this model: {unknown}"

        with torch.no_grad():
            for k in weights_keys_hf:
                source = weights_hf[k]
                if k.endswith(transposed):
                    assert source.shape[::-1] == weights[k].shape, f"Shape mismatch for {k}: {source.shape[::-1]} != {weights[k].shape}"
                    weights[k].copy_(source.t())
                else:
                    assert source.shape == weights[k].shape, f"Shape mismatch for {k}: {source.shape} != {weights[k].shape}"
                    weights[k].copy_(source)

        return model



### Now, we will load the pre-trained weights from Hugging Face and initialize the GPT-2 model with them. This step is necessary to verify the correctness of our implementation.

In [None]:
# ------------------
# sampling settings read by the generation code further down
max_length = 30             # total sequence length (prompt included) at which sampling stops
num_return_sequences = 5    # how many samples are drawn from the same prompt
# build our GPT2 and copy the Hugging Face 'gpt2' weights into it
model = GPT2.from_pretrained('gpt2')
print('work')

Loading weights from pretrained GPT: gpt2
work


#### It works! Let's move on: now we are going to generate some text using GPT-2

In [None]:
# run on the GPU when one is available
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("using device :", device)
model.eval()
model.to(device)
import tiktoken
enc = tiktoken.get_encoding('gpt2')
# tokenize the prompt and copy it once per requested sample
prompt_ids = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)  # (1, T)
tokens = tokens.repeat(num_return_sequences, 1)  # (B, T) with B = num_return_sequences
x = tokens.to(device)

# fix the CPU and CUDA seeds so the sampled text is reproducible
torch.manual_seed(42)
torch.cuda.manual_seed(42)

def generate(x):
    """Append sampled tokens to ``x`` until it has ``max_length`` columns.

    Reads the module-level ``model`` and ``max_length`` at call time. At each
    step the last position's distribution is cut down to its 50 most probable
    tokens (the Hugging Face pipeline default) and one of them is sampled.

    Args:
        x: (B, T) tensor of token ids; it should live on the same device as ``model``.

    Returns:
        The extended tensor, or ``x`` unchanged if it already has at least
        ``max_length`` columns.
    """
    with torch.inference_mode():
        while x.size(1) < max_length:
            # logits of the last position only: (B, vocab_size)
            next_logits = model(x)[:, -1, :]
            probs = F.softmax(next_logits, dim=-1)
            # keep the 50 most likely tokens: both results are (B, 50)
            topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
            # sample a slot among the kept tokens, (B, 1) ...
            choice = torch.multinomial(topk_probs, 1)
            # ... and translate that slot back into a vocabulary id
            next_token = torch.gather(topk_indices, -1, choice)
            x = torch.cat((x, next_token), dim=1)
    return x

x = generate(x)
# decode every sampled row back to text
for sample_idx in range(num_return_sequences):
    sample_ids = x[sample_idx, :max_length].tolist()
    print(">", enc.decode(sample_ids))

using device : cuda
> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some


#### Check that we are right by downloading and using the Hugging Face model

In [None]:
from transformers import GPT2LMHeadModel
model_hf = GPT2LMHeadModel.from_pretrained('gpt2')
model_hf.eval()
# place the reference model on the same `device` as its inputs;
# a hard-coded 'cuda' fails on machines without a GPU
model_hf.to(device)

# same prompt as for our own model, repeated once per requested sample
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long)  # (T,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)  # (B, T)
x = tokens.to(device)

# same seeds as for our own model so both sample identically
# (seeding once, right before generation, is enough)
torch.manual_seed(42)
torch.cuda.manual_seed(42)

def generate_hf(x):
    """Append sampled tokens to ``x`` using the Hugging Face model ``model_hf``.

    Reads the module-level ``model_hf`` and ``max_length`` at call time and
    samples each new token among the 50 most probable candidates of the last
    position.

    Args:
        x: (B, T) tensor of token ids; it should live on the same device as ``model_hf``.

    Returns:
        The extended tensor, or ``x`` unchanged if it already has at least
        ``max_length`` columns.
    """
    with torch.inference_mode():
        while x.size(1) < max_length:
            # first element of the model output holds the logits (B, T, vocab_size)
            all_logits = model_hf(x)[0]
            next_logits = all_logits[:, -1, :]
            probs = F.softmax(next_logits, dim=-1)
            # keep the 50 most likely tokens: both results are (B, 50)
            topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
            # sample a slot among the kept tokens, (B, 1) ...
            choice = torch.multinomial(topk_probs, 1)
            # ... and translate that slot back into a vocabulary id
            next_token = torch.gather(topk_indices, -1, choice)
            x = torch.cat((x, next_token), dim=1)
    return x

x = generate_hf(x)
# decode every sampled row back to text
for sample_idx in range(num_return_sequences):
    sample_ids = x[sample_idx, :max_length].tolist()
    print(">", enc.decode(sample_ids))

> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
> Hello, I'm a language model, and I wrote it off on the grounds that a language model would make me more fluent. But I'm not
> Hello, I'm a language model, I really like languages. I like languages because like, they're good. And the way we talk about languages
> Hello, I'm a language model, a language model I'm using for data modelling. All I did was test the results and then I wrote some


## As we can see, the two outputs match perfectly :)
# Now we move to the next step: initialize a random GPT-2 model and see what it generates

In [None]:
# fresh model with the default config: weights come from GPT2._init_weights, nothing is loaded
model = GPT2(GPT2Config())
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("using device :", device)
model.eval()
model.to(device)
import tiktoken
enc = tiktoken.get_encoding('gpt2')
# tokenize the prompt and copy it once per requested sample
prompt_ids = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0)  # (1, T)
tokens = tokens.repeat(num_return_sequences, 1)  # (B, T) with B = num_return_sequences
x = tokens.to(device)

# seeds are set after the model was built, so they only pin down the sampling
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# generate() picks up the `model` rebound above, i.e. the untrained one
x = generate(x)
for sample_idx in range(num_return_sequences):
    sample_ids = x[sample_idx, :max_length].tolist()
    print(">", enc.decode(sample_ids))

using device : cuda
> Hello, I'm a language model, rival Furthermore cookFont corpses carried Legislature Bung blending contemplate operating stunts againiatric Wiley Starr stuntssmith hated hated 263 makes
> Hello, I'm a language model, Barnett Sharif midfield midfield midfield MON respondersTextures Four injecting immigrantBetSqu commands infiltrate troublesomepacks headset bribes boothoomingpowers
> Hello, I'm a language model, bro Stev anticipateÂ  August thighs makes categories PDT Furthermore prevail Barnett hated hated musical Clintonizoph resemblingabellafeed Bung
> Hello, I'm a language model, Shy shelteroon thighs thighs spokeswoman announcing Cullen rival consequence explo cohesion shapes masseInterestingly assume supportSullivan traders teacherULARSee
> Hello, I'm a language model, thighs language Facts some Dexter697RN harmony attachessmithifix90 Michele indiscerningifix Gore Barnett yawn educationalBaltimoreHom


## With a random model, we get something bizarre and nonsensical. This is because we no longer use the Hugging Face GPT-2 weights; the weights are now randomly initialized (by `_init_weights`) and the model has not been trained.