In [17]:
#### Create GPT Model ####

In [18]:
# Hyperparameter dictionary read (by key) by the model classes in this notebook.
GPT_CONFIG_124M = {
    # Vocabulary size
    "vocab_size": 50257,
    # Context length (number of positions in the positional embedding table)
    "context_length": 1024,
    # Embedding dimension
    "emb_dim": 768,
    # Number of attention heads
    "n_heads": 12,
    # Number of layers
    "n_layers": 12,
    # Dropout rate
    "drop_rate": 0.1,
    # Query-key-value bias
    "qkv_bias": False,
}


In [19]:
import torch 
import torch.nn as nn

class DummyGPTModel(nn.Module):
    """Skeleton GPT-style model wired with placeholder transformer blocks.

    The pipeline is: token embedding + positional embedding -> dropout ->
    ``n_layers`` ``DummyTransformerBlock`` instances -> ``LayerNorm`` ->
    bias-free linear projection to ``vocab_size`` logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        # Embedding tables: one row per token ID / per position index.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        placeholder_blocks = [
            DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])
        ]
        self.trf_blocks = nn.Sequential(*placeholder_blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token IDs to per-position vocabulary logits.

        Args:
            in_idx: 2-D integer tensor of token IDs; its second dimension is
                used as the sequence length.

        Returns:
            The output of ``out_head``: ``in_idx``'s shape with a trailing
            ``vocab_size`` dimension appended.
        """
        # Unpacking into two names also rejects inputs that are not 2-D.
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings broadcast across the leading dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)


class DummyTransformerBlock(nn.Module):
    """No-op stand-in for a transformer block.

    It owns no parameters and ignores its configuration; it exists so a model
    can be assembled before a real block is available.
    """

    def __init__(self, cfg):
        """Accept ``cfg`` for signature compatibility; it is not used."""
        super().__init__()

    def forward(self, x):
        """Return ``x`` unchanged (the same object, not a copy)."""
        return x


class LayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable affine.

    Each vector along the last dimension is shifted to zero mean and scaled
    to (approximately) unit variance, then multiplied by ``scale`` and offset
    by ``shift``.

    Args:
        emb_dim: Size of the last dimension of the inputs; also the length
            of the ``scale`` and ``shift`` parameter vectors.
        eps: Small constant added to the variance before the square root to
            avoid division by zero. Defaults to ``1e-5`` (the value that was
            previously hard-coded), so existing callers are unaffected.
    """

    def __init__(self, emb_dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        # Initialised to the identity transform: scale = 1, shift = 0.
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        """Normalize ``x`` along its last dimension.

        Args:
            x: Tensor whose last dimension has size ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x``.
        """
        mean = x.mean(dim=-1, keepdim=True)
        # Biased (population) variance: divides by N rather than N - 1.
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift


In [20]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
txt1 = "Every effort moves you"
txt2 = "Every day holds a"

# Encode each prompt to token IDs and stack them into a single batch tensor,
# one row per prompt.
batch = torch.stack(
    [torch.tensor(tokenizer.encode(text)) for text in (txt1, txt2)],
    dim=0,
)
print(batch)

# Seed before constructing the model so the random weight initialisation
# (and the dropout mask used in the forward pass) is repeatable.
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)


tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])
Output shape: torch.Size([2, 4, 50257])
tensor([[[-0.7867,  0.2203, -0.4508,  ..., -0.9936, -0.1412, -0.2999],
         [-0.0788,  0.3004, -0.2935,  ...,  0.1583,  0.8917,  0.8230],
         [ 0.3708,  1.1126, -0.3226,  ...,  0.8023, -0.0038,  0.3935],
         [ 0.0636,  1.0572, -0.2507,  ...,  0.7542, -0.0750, -0.6896]],

        [[-0.7208,  0.1351, -0.6014,  ..., -1.0272,  0.1729, -0.2920],
         [-0.5938,  0.4453, -0.0059,  ...,  0.3414,  0.0572,  1.0986],
         [ 0.2675,  0.8407, -0.4476,  ..., -0.0181, -0.1090,  0.2541],
         [-0.1035, -0.5901, -0.3932,  ...,  1.4022, -0.3188,  0.1304]]],
       grad_fn=<UnsafeViewBackward0>)


In [29]:
class GELU(nn.Module):
    """GELU activation computed with the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        sqrt_two_over_pi = torch.sqrt(torch.tensor(2.0 / torch.pi))
        cubic_term = 0.044715 * torch.pow(x, 3)
        tanh_arg = sqrt_two_over_pi * (x + cubic_term)
        return 0.5 * x * (1 + torch.tanh(tanh_arg))

class FeedForward(nn.Module):
    """Two-layer MLP: expand to 4x ``emb_dim``, apply GELU, project back.

    Args:
        cfg: Mapping providing ``emb_dim``.
    """

    def __init__(self, cfg):
        super().__init__()
        emb_dim = cfg["emb_dim"]
        hidden_dim = 4 * emb_dim

        # Built in the same order as they are applied, so parameter
        # initialisation consumes the RNG in a fixed sequence.
        expand = nn.Linear(emb_dim, hidden_dim)
        activation = GELU()
        project = nn.Linear(hidden_dim, emb_dim)
        self.layers = nn.Sequential(expand, activation, project)

    def forward(self, x):
        """Run ``x`` through the MLP; the last dimension stays ``emb_dim``."""
        return self.layers(x)

class TransformerBlock(nn.Module):
    """Transformer block with normalization applied before each sub-layer.

    Two residual sub-layers are applied in sequence:

    1. ``LayerNorm`` -> multi-head attention -> dropout -> add input
    2. ``LayerNorm`` -> feed-forward MLP -> dropout -> add input

    Args:
        cfg: Mapping providing ``emb_dim``, ``context_length``, ``n_heads``,
            ``drop_rate`` and ``qkv_bias``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.att = MultiHeadAttention(
            d_in=cfg["emb_dim"],
            d_out=cfg["emb_dim"],
            context_length=cfg["context_length"],
            num_heads=cfg["n_heads"],
            dropout=cfg["drop_rate"],
            qkv_bias=cfg["qkv_bias"],
        )
        self.ff = FeedForward(cfg)
        self.norm1 = LayerNorm(cfg["emb_dim"])
        self.norm2 = LayerNorm(cfg["emb_dim"])
        # A single dropout module is shared by both residual branches.
        self.drop_shortcut = nn.Dropout(cfg["drop_rate"])

    def forward(self, x):
        """Apply the attention and feed-forward sub-layers with residuals.

        Args:
            x: Input tensor whose last dimension is ``emb_dim``.

        Returns:
            Tensor of the same shape as ``x`` (the residual additions require
            each sub-layer to preserve the input shape).
        """
        # Attention sub-layer with residual (shortcut) connection.
        shortcut = x
        x = self.norm1(x)
        x = self.att(x)
        x = self.drop_shortcut(x)
        x = x + shortcut

        # Feed-forward sub-layer with residual (shortcut) connection.
        shortcut = x
        x = self.norm2(x)
        x = self.ff(x)
        x = self.drop_shortcut(x)
        x = x + shortcut
        return x



In [30]:
from attention import MultiHeadAttention
torch.manual_seed(123)

# Random stand-in for a batch that has already been through the token and
# positional embeddings: dimensions are (2, 4, 768).
x = torch.rand((2, 4, 768))
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

# The block should preserve the shape of its input.
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")


x.shape:  torch.Size([2, 4, 768])
Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [16]:
class GPTModel(nn.Module):
    """GPT-style model: embeddings, a stack of transformer blocks, output head.

    The pipeline is: token embedding + positional embedding -> dropout ->
    ``n_layers`` ``TransformerBlock`` instances -> ``LayerNorm`` ->
    bias-free linear projection to ``vocab_size`` logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_length``,
            ``emb_dim``, ``drop_rate`` and ``n_layers``, plus whatever keys
            ``TransformerBlock`` reads.
    """

    def __init__(self, cfg):
        super().__init__()
        vocab_size, emb_dim = cfg["vocab_size"], cfg["emb_dim"]

        # Embedding tables: one row per token ID / per position index.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg["context_length"], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = LayerNorm(emb_dim)
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map a 2-D tensor of token IDs to per-position vocabulary logits.

        Args:
            in_idx: 2-D integer tensor of token IDs; its second dimension is
                used as the sequence length.

        Returns:
            The output of ``out_head``: ``in_idx``'s shape with a trailing
            ``vocab_size`` dimension appended.
        """
        # Unpacking into two names also rejects inputs that are not 2-D.
        _, seq_len = in_idx.shape
        positions = torch.arange(seq_len, device=in_idx.device)

        # The positional embeddings broadcast across the leading dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)



In [18]:
# Seed first so weight initialisation and dropout are repeatable.
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

# Forward pass over the batch of token IDs built earlier in the notebook.
out = model(batch)
print(f"Input batch:\n {batch}")
print(f"\nOutput shape: {out.shape}")
print(out)


Input batch:
 tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


In [20]:
# Sum the element counts of every parameter tensor registered on the model.
total_params = sum(map(torch.Tensor.numel, model.parameters()))
print(f"Total number of parameters: {total_params:,}")


Total number of parameters: 163,009,536


In [31]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` tokens.

    At each step the model is run on (at most) the last ``context_size``
    tokens, and the highest-probability token at the final position is
    appended to the sequence.

    Args:
        model: Callable taking a 2-D tensor of token IDs and returning a
            3-D tensor of logits indexed as ``[batch, position, vocab]``.
        idx: 2-D tensor of token IDs; new tokens are concatenated along
            dimension 1.
        max_new_tokens: Number of tokens to append.
        context_size: Maximum number of trailing tokens fed to the model.

    Returns:
        ``idx`` with ``max_new_tokens`` extra columns. The input tensor is
        not modified in place.
    """
    tokens = idx
    for _ in range(max_new_tokens):
        # Keep only the most recent tokens that fit into the context window.
        window = tokens[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)

        # Only the prediction at the last position matters for the next token.
        last_logits = all_logits[:, -1, :]
        next_token = torch.argmax(
            torch.softmax(last_logits, dim=-1), dim=-1, keepdim=True
        )
        tokens = torch.cat((tokens, next_token), dim=1)
    return tokens


In [33]:
start_context = "Hello, I am"
encoded = tokenizer.encode(start_context)
print("encoded:", encoded)

# Wrap the ID list in an outer list to get a leading batch dimension of 1.
encoded_tensor = torch.tensor([encoded])
print("encoded_tensor.shape:", encoded_tensor.shape)

# Switch to evaluation mode so dropout is disabled during generation.
model.eval()
out = generate_text_simple(
    model=model,
    idx=encoded_tensor,
    max_new_tokens=6,
    context_size=GPT_CONFIG_124M["context_length"],
)
print("Output:", out)
print("Output length:", out.shape[1])

# Take the single row of the batch and turn the token IDs back into text.
decoded_text = tokenizer.decode(out[0].tolist())
print(decoded_text)

encoded: [15496, 11, 314, 716]
encoded_tensor.shape: torch.Size([1, 4])
Output: tensor([[15496,    11,   314,   716, 12170, 44251, 25952, 19382, 17367, 33448]])
Output length: 10
Hello, I am drone Omni SSLvoidDark 2021
