In [1]:
from dataclasses import dataclass
from itertools import repeat
import numpy as np  
import pandas as pd
import torch
import torch.nn as nn
import math
from torch.nn import functional as F

In [2]:
# Pick the compute device once and keep it; fall back to CPU when CUDA is absent.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# get_device_name() raises when there is no CUDA device, so only query it on GPU.
print(torch.cuda.get_device_name(0) if device.type == 'cuda' else 'cpu')

NVIDIA GeForce RTX 3050 Laptop GPU


In [3]:
@dataclass
class GPTConfig:
    """Hyperparameters that fix the size of a GPT model.

    The defaults are the same values GPT.from_pretrained uses for 'gpt2'.
    """
    block_size: int = 1024  # maximum sequence length; sizes the position embedding and the causal mask
    vocab_size: int = 50257  # number of token ids; sizes the token embedding and the output head
    n_layer: int = 12  # number of transformer blocks
    n_embd: int = 768  # embedding (channel) width
    n_head: int = 12  # number of attention heads
    
class CasualSelfAttention(nn.Module):
    """Multi-head causal self-attention using the OpenAI/HF GPT-2 parameter names.

    A single linear layer produces queries, keys and values for every head at
    once; they are then split per head and attended in one batched matmul.

    Args:
        config: provides ``n_embd`` (channel width), ``n_head`` (number of
            heads, must divide ``n_embd``) and ``block_size`` (side length of
            the causal mask).
    """

    def __init__(self, config: GPTConfig):
        super().__init__()
        # Channels are split evenly across heads, so it is n_head (not n_layer)
        # that has to divide n_embd.
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, computed as one batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)

        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular causal mask; named "bias" to follow the OpenAI/HF naming.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size),
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C): batch size, sequence length and
                embedding width (C == n_embd).

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()
        # nh = number of heads, hs = head size, and C = nh * hs
        # e.g. GPT-2 (124M): n_head = 12, hs = 64, so C = 768 channels
        head_size = C // self.n_head

        # Compute q, k, v for all heads, then move the head axis next to the
        # batch axis so each head is attended independently.
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)

        # Attention: materializes the full (T, T) score matrix for every head.
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        # Positions above the diagonal are set to -inf so softmax gives them zero weight.
        att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        # Put the heads back side by side: (B, T, C).
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # output projection
        return self.c_proj(y)

class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, apply GELU, project back to n_embd."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        # tanh approximation of GELU; approximate='none' would select the exact form
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)

    def forward(self, x):
        """Return c_proj(gelu(c_fc(x)))."""
        return self.c_proj(self.gelu(self.c_fc(x)))


class Block(nn.Module):
    """One transformer block: attention and MLP, each fed a LayerNorm-ed input
    and each added back onto the residual stream."""

    def __init__(self, config: GPTConfig):
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CasualSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Run both residual sub-layers and return the updated activations."""
        # residual connection around attention
        after_attn = x + self.attn(self.ln_1(x))
        # residual connection around the MLP
        return after_attn + self.mlp(self.ln_2(after_attn))



   
class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Submodule names (``transformer.wte``, ``transformer.wpe``,
    ``transformer.h``, ``transformer.ln_f``, ``lm_head``) mirror the Hugging
    Face GPT-2 state dict so ``from_pretrained`` can copy weights key by key.
    """

    # Architecture of every supported pretrained checkpoint. from_pretrained
    # validates the requested name against these keys, so the accepted names
    # and the table cannot drift apart.
    _PRETRAINED_CONFIGS = {
        'gpt2': dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
        'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),  # 350M params
        'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),
        'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),
    }

    def __init__(self, config: GPTConfig):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte=nn.Embedding(config.vocab_size, config.n_embd),  # token embedding weights
            wpe=nn.Embedding(config.block_size, config.n_embd),  # position embedding weights
            h=nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f=nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: integer tensor of shape (B, T) with T <= config.block_size.
            targets: optional integer tensor with one target id per position
                of ``idx``; entries equal to -1 are ignored by the loss.

        Returns:
            ``(logits, loss)``. With ``targets``, logits has shape
            (B, T, vocab_size) and loss is the cross-entropy. Without
            ``targets``, logits covers only the last position, shape
            (B, 1, vocab_size), and loss is ``None``.
        """
        device = idx.device
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=device)  # shape (T)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        x = tok_emb + pos_emb  # pos_emb broadcasts over the batch dimension
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        if targets is not None:
            # if we are given some desired targets also calculate the loss
            logits = self.lm_head(x)
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        else:
            # inference-time mini-optimization: only forward the lm_head on the very last position
            logits = self.lm_head(x[:, [-1], :])  # note: using list [-1] to preserve the time dim
            loss = None

        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type):
        """Build a GPT and load pretrained GPT-2 weights from Hugging Face.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            A ``GPT`` whose parameters were copied from the matching
            ``GPT2LMHeadModel`` checkpoint.

        Raises:
            AssertionError: if ``model_type`` is not supported, or if the
                parameter names/shapes of the two models do not line up.
        """
        assert model_type in cls._PRETRAINED_CONFIGS, f"unsupported model_type: {model_type!r}"
        # Imported here so the class can be defined and used without transformers installed.
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s"%model_type)
        # n_layer, n_head and n_embd are determined from model_type.
        # Copy the entry so the shared class-level table is never mutated.
        config_args = dict(cls._PRETRAINED_CONFIGS[model_type])
        config_args['vocab_size'] = 50257  # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        # discard the causal-mask buffer; it is not a learned parameter
        sd_keys = [k for k in sd.keys() if not k.endswith('.attn.bias')]

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in name and shape
        sd_keys_hf = [
            k for k in sd_hf.keys()
            if not k.endswith('.attn.masked_bias')  # ignore these, just a buffer
            and not k.endswith('.attn.bias')  # same, just the mask (buffer)
        ]
        # These weights are stored transposed relative to nn.Linear in the HF
        # checkpoint (it uses a Conv1D module), so they are flipped on copy.
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        with torch.no_grad():
            for k in sd_keys_hf:
                if any(k.endswith(w) for w in transposed):
                    # special treatment for the Conv1D weights we need to transpose
                    assert sd_hf[k].shape[::-1] == sd[k].shape
                    sd[k].copy_(sd_hf[k].t())
                else:
                    # vanilla copy over the other parameters
                    assert sd_hf[k].shape == sd[k].shape
                    sd[k].copy_(sd_hf[k])

        return model

In [4]:
 # Build the model and load the pretrained 'gpt2' weights through GPT.from_pretrained.
 model = GPT.from_pretrained('gpt2')
 # Reaching this line means the key-count and shape assertions in from_pretrained all passed.
 print("didn't crash")


loading weights from pretrained gpt: gpt2
didn't crash


In [5]:
num_return_sequence = 5  # number of sequences sampled in parallel (the batch size)
max_length = 30  # total tokens per sequence, prompt included
model.eval()  # switch the module to evaluation mode
# Use the GPU when one is present; otherwise stay on CPU instead of failing on a hard-coded "cuda".
model.to("cuda" if torch.cuda.is_available() else "cpu")

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
from transformers import GPT2Tokenizer
enc = GPT2Tokenizer.from_pretrained('gpt2')
# Encode the prompt once, then tile it so every sampled sequence starts from the same prefix.
prompt_ids = enc.encode("Hello ,I am a language model")
tockens = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0).repeat(num_return_sequence, 1)
# Put the batch on whatever device the model's parameters are on, rather than a hard-coded "cuda".
x = tockens.to(next(model.parameters()).device)

In [7]:
# Generate: x starts as the (B, T) prompt batch, B = num_return_sequence, and
# gains one sampled token per row each iteration until it is max_length long.
# Seed the CPU and CUDA generators with 45.
torch.manual_seed(45)
torch.cuda.manual_seed(45)
with torch.no_grad():
    while x.size(1) < max_length:
        # model(x) returns (logits, loss); take the logits and keep the last position -> (B, vocab_size)
        last_logits = model(x)[0][:, -1, :]
        # turn the logits into probabilities
        next_probs = F.softmax(last_logits, dim=-1)
        # top-k sampling with k = 50: both results have shape (B, 50)
        kept_probs, kept_ids = torch.topk(next_probs, 50, dim=-1)
        # draw one position per row among the kept candidates -> (B, 1)
        choice = torch.multinomial(kept_probs, 1)
        # map the drawn position back to its vocabulary id
        next_token = torch.gather(kept_ids, -1, choice)
        x = torch.cat((x, next_token), dim=-1)

# Decode and print each generated sequence.
for i in range(num_return_sequence):
    print(">", enc.decode(x[i, :max_length].tolist()))

> Hello ,I am a language modeler. I have developed some advanced data structure and type system, but since when does type modeler actually have any
> Hello ,I am a language modeler who works on a pretty much any language in the world.

I am a language modeler who works
> Hello ,I am a language modeler and the problem should be solving the way he tells me I should solve it.

-

For
> Hello ,I am a language modeler so I will get the answers you want in my book, which I will link to when I discuss it in
> Hello ,I am a language modeler. I'm also a writer and a developer! I hope that you may see me as something like this:
