In [80]:
!pip install tiktoken
from dataclasses import dataclass 
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
enc = tiktoken.get_encoding('gpt2')
import inspect



In [108]:
@dataclass
class GPTConfig:
    """Hyperparameters describing the size of a GPT model.

    The defaults correspond to the values listed for 'gpt2' (124M) in
    GPT.from_pretrained.
    """

    # maximum sequence length (number of positions the model can embed)
    block_size: int = 1_024
    # GPT-2 vocabulary: 50,000 BPE merges + 256 single-byte tokens + 1 special <|endoftext|> token
    vocab_size: int = 50_257
    # number of transformer blocks stacked on top of each other
    n_layer: int = 12
    # number of heads in the multi-head attention
    n_head: int = 12
    # embedding dimension; with the defaults each head is 768 / 12 = 64 wide
    n_embed: int = 768


        
class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Submodules (names mirror the HuggingFace GPT-2 state dict so weights can be copied):
        transformer.wte  : token embedding      (vocab_size, n_embed)
        transformer.wpe  : position embedding   (block_size, n_embed)
        transformer.h    : list of n_layer Block modules
        transformer.ln_f : final LayerNorm
        lm_head          : bias-free projection n_embed -> vocab_size, weight-tied with wte
    """

    def __init__(self, config: "GPTConfig"):
        """Build the model from `config` (reads block_size, vocab_size, n_layer, n_embed)."""
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(
            dict(
                wte=nn.Embedding(config.vocab_size, config.n_embed),
                wpe=nn.Embedding(config.block_size, config.n_embed),
                h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                ln_f=nn.LayerNorm(config.n_embed),
            )
        )
        # projects the n_embed features to vocab_size logits
        self.lm_head = nn.Linear(config.n_embed, config.vocab_size, bias=False)

        # Parameter sharing between the token embedding and the final linear layer:
        # the embedding's weight attribute is rebound to the lm_head parameter, so both
        # modules use the very same tensor (the embedding's original weight is dropped).
        self.transformer.wte.weight = self.lm_head.weight

        # nn.Module.apply visits every submodule (and self) and calls the function on it.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one submodule in place.

        Linear:    weight ~ N(0, 0.02), bias = 0. When the layer carries a
                   NANOGPT_SCALE_INIT attribute the std is multiplied by (2 * n_layer) ** -0.5.
        Embedding: weight ~ N(0, 0.02).
        Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, x, y=None):
        """Run the model on a batch of token ids.

        Args:
            x: long tensor of token ids, shape (B, T) with T <= config.block_size.
            y: optional long tensor of target ids, shape (B, T).

        Returns:
            (logits, loss): logits has shape (B, T, vocab_size); loss is the mean
            cross-entropy over all B*T positions when `y` is given, otherwise None.

        Raises:
            AssertionError: if T exceeds config.block_size.
        """
        B, T = x.size()
        assert T <= self.config.block_size, "Length of input tokens exceeds block size"

        token_embeddings = self.transformer.wte(x)  # (B, T, n_embed)

        pos = torch.arange(0, T, dtype=torch.long, device=x.device)  # position indices, (T)
        pos = self.transformer.wpe(pos)  # (T, n_embed)

        # the positional embeddings broadcast along the batch dimension
        x = token_embeddings + pos  # (B, T, n_embed)

        for block in self.transformer.h:
            x = block(x)  # shape is preserved: (B, T, n_embed)

        x = self.transformer.ln_f(x)  # (B, T, n_embed)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if y is not None:
            # flatten batch and time so every position is one classification example
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))

        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a GPT and copy pretrained GPT-2 weights from HuggingFace into it.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            A GPT instance whose parameters hold the pretrained values.

        Raises:
            AssertionError: on an unknown model_type or when key counts / tensor
                shapes of the two state dicts do not line up.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel
        print(f"Loading {model_type} model weights")

        # n_layer, n_head and n_embed are determined by the model_type
        config_args = {
            'gpt2':        dict(n_layer=12, n_head=12, n_embed=768),   # 124M params
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embed=1024),  # 350M params
            'gpt2-large':  dict(n_layer=36, n_head=20, n_embed=1280),  # 774M params
            'gpt2-xl':     dict(n_layer=48, n_head=25, n_embed=1600),  # 1558M params
        }[model_type]

        config_args['vocab_size'] = 50257  # the same for all GPT-2 models
        config_args['block_size'] = 1024   # the same for all GPT-2 models

        # initialise our implementation
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        # the causal-mask buffers are not parameters, so they are excluded from the comparison
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]

        # initialise the HuggingFace model and take its state dict
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_keys_hf = sd_hf.keys()
        # drop the HuggingFace mask buffers as well (they may not be present at all)
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        # these weights are stored transposed relative to nn.Linear in the HuggingFace
        # checkpoint, so they are transposed back while copying
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        # copy while checking that all parameters match in count and shape
        assert len(sd_keys_hf) == len(sd_keys), "Mismatched Keys {} != {}".format(len(sd_keys_hf), len(sd_keys))
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())  # in-place copy of the transposed tensor
            else:
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

    def generate(self, prompt, num_return_sequences=4, max_length=32, seed=1337):
        """Sample continuations of `prompt` with top-k sampling and print them.

        The prompt is tokenised with the module-level tokenizer `enc`. Sampling runs
        on this model (`self`) and on the device its parameters live on, so the
        method does not depend on notebook globals such as `model` or `device`.

        Args:
            prompt: text to continue; must encode to at least one token.
            num_return_sequences: number of independent continuations to sample.
            max_length: total length in tokens (prompt included) at which sampling stops.
            seed: seed of the private sampling generator; the global RNG state is untouched.

        Returns:
            None. Each sequence is printed as "Response <i>: <text>".

        Raises:
            ValueError: if the prompt encodes to zero tokens.
        """
        tokens = enc.encode(prompt)
        if len(tokens) == 0:
            raise ValueError("prompt must encode to at least one token")

        # run wherever the weights are, instead of trusting a global `device`
        device = next(self.parameters()).device
        tokens = torch.tensor(tokens, dtype=torch.long)  # (T)
        x = tokens.unsqueeze(0).repeat(num_return_sequences, 1).to(device)  # (num_return_sequences, T)

        # dedicated generator so sampling does not disturb the global random state
        sample_rng = torch.Generator(device=device)
        sample_rng.manual_seed(seed)

        # autocast wants the device *type* string; bfloat16 is the long-supported
        # reduced-precision dtype for CPU autocast, float16 is kept elsewhere
        autocast_dtype = torch.bfloat16 if device.type == 'cpu' else torch.float16
        block_size = self.config.block_size

        with torch.no_grad():
            while x.size(1) < max_length:
                # feed at most the last block_size tokens so forward() never sees T > block_size
                context = x[:, -block_size:]
                with torch.autocast(device_type=device.type, dtype=autocast_dtype):
                    logits, _ = self(context)
                # only the last position predicts the next token; sample in float32
                logits = logits[:, -1, :].float()  # (num_return_sequences, vocab_size)
                probs = F.softmax(logits, dim=-1)
                # top-k sampling with k = 50 (clamped for tiny vocabularies)
                k = min(50, probs.size(-1))
                topk_probs, topk_indices = torch.topk(probs, k, dim=-1)  # both (num_return_sequences, k)
                # multinomial renormalises the k probabilities; ix indexes into the top-k list
                ix = torch.multinomial(topk_probs, num_samples=1, generator=sample_rng)
                # map the position inside the top-k list back to the real token id
                next_token = torch.gather(topk_indices, -1, ix)
                x = torch.cat((x, next_token), dim=1)

        # decode and print every sampled sequence
        for i in range(num_return_sequences):
            tokens = x[i].tolist()
            decoded = enc.decode(tokens)
            print(f"Response {i+1}: {decoded}")
    

class Block(nn.Module):
    """One transformer layer: LayerNorm -> attention and LayerNorm -> MLP, each added back to its input."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embed
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self,x):
        # communication: attention output is added to the residual stream
        after_attn = x + self.attn(self.ln_1(x))
        # computation: the MLP processes what attention gathered, again added residually
        return after_attn + self.mlp(self.ln_2(after_attn))



class MLP(nn.Module):
    """Feed-forward sub-layer: expand n_embed to 4 * n_embed, apply tanh-approximated GELU, project back."""

    def __init__(self,config):
        super().__init__()
        width = config.n_embed
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)
        # marker attribute; GPT._init_weights checks for it with hasattr() to scale the init std
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self,x):
        # input and output share the same shape; only the last dimension is widened in between
        return self.c_proj(self.gelu(self.c_fc(x)))

class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with a fused query/key/value projection."""

    def __init__(self,config):
        super().__init__()
        assert config.n_embed % config.n_head == 0, 'n_embed should be divisible by n_head'
        self.n_head = config.n_head
        self.n_embed = config.n_embed

        # one matrix yields query, key and value for all heads at once:
        # the output is [q | k | v], each n_embed wide (n_head * head_size)
        self.c_attn = nn.Linear(self.n_embed, 3 * self.n_embed)
        # output projection applied after the heads are concatenated again
        self.c_proj = nn.Linear(self.n_embed, self.n_embed)
        self.c_proj.NANOGPT_SCALE_INIT = 1

        # lower-triangular causal mask, stored as (1, 1, block_size, block_size) and named
        # "bias" to match the HuggingFace state dict; forward() below does not read it
        # because scaled_dot_product_attention applies the causal mask itself
        side = config.block_size
        lower_triangle = torch.tril(torch.ones(side, side))
        self.register_buffer("bias", lower_triangle.view(1, 1, side, side))

    def forward(self,x):
        batch, seq_len, width = x.size()  # batch_size, sequence_length, n_embed
        head_size = width // self.n_head

        def to_heads(t):
            # (batch, seq_len, n_embed) -> (batch, n_head, seq_len, head_size)
            return t.view(batch, seq_len, self.n_head, head_size).transpose(1, 2)

        # split the fused projection into its three n_embed-wide parts, in q, k, v order
        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embed, dim=2))

        # Fused attention. Written out by hand this is: scores = q @ k^T / sqrt(head_size),
        # future positions masked to -inf, softmax over the last dim, then scores @ v.
        per_head = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # concatenate the heads: back to (batch, seq_len, n_head, head_size), make the memory
        # contiguous so view() is legal, then flatten the last two dims into n_embed
        merged = per_head.transpose(1, 2).contiguous().view(batch, seq_len, width)

        return self.c_proj(merged)


# Single-process (non-DDP) run: one rank, world size of one.
ddp_rank, ddp_local_rank, ddp_world_size = 0, 0, 1
master_process = True

# use the GPU when one is visible, otherwise stay on the CPU
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(f"Running on {device}")


# vocab_size overrides the GPTConfig default of 50257
# (presumably the padded size the checkpoint was trained with - confirm)
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)


# NOTE(review): the path is specific to one Kaggle dataset mount.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary objects;
# only load checkpoint files from a source you trust.
checkpoint_path = '/kaggle/input/gpt2-124m-parameters-base-model/gpt2_checkpoint.pth'
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
# the weights are stored under the 'model' key of the checkpoint
model.load_state_dict(checkpoint['model'])
### Generation
model.eval()  # evaluation mode for generation
print(f"Loaded The model Successfuly!")

Running on cuda
Loaded The model Successfuly!


In [115]:
# sampling settings for this run
num_return_sequences = 4  # how many responses to generate
max_length = 50  # sampling stops once a sequence holds this many tokens (prompt included)
seed = 1337  # seed for the sampling random number generator
# rstrip() removes any trailing whitespace from the end of the prompt
prompt = "There are a lot of ways to take care of your health. one of them is".rstrip()
model.generate(
    prompt,
    num_return_sequences=num_return_sequences,
    max_length=max_length,
    seed=seed,
)

sample 0: There are a lot of ways to take care of your health. one of them is to take the vitamins you need to properly look after your health. Many people also take their iron well enough. It’s a good idea to take a little
sample 1: There are a lot of ways to take care of your health. one of them is keeping your teeth and gums healthy. Make sure that you do not over or under-exhort every day. You should also be getting regular checkups.
sample 2: There are a lot of ways to take care of your health. one of them is to eat plenty of vegetables and fruits. Some people think their health is on the whole important thing. But others may think it’s all a lot more.
sample 3: There are a lot of ways to take care of your health. one of them is by giving yourself some time. try to be flexible with yourself and to work on some.
Another option here is the opportunity to volunteer in your organization instead of by
