# GPT-2 Model


In [None]:
import sys
import os


# Pick the base directory. Referencing ``get_ipython`` raises NameError when
# that name is not defined (presumably any run outside IPython/Jupyter), in
# which case the directory of this file is used instead of the process cwd.
try:
    get_ipython
except NameError:
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    current_dir = os.getcwd()

# Temporary import-path expansion: make the parent directory importable
# (presumably so the ``src`` package imported below can be found).
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
import torch
import torch.nn as nn 
from src import tool ,model_wrapper

函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_loss 已跳过执行


### Config

In [None]:
# Flag handed to ``tool.skip_execution(skip=...)`` on the demo function in this
# notebook; the recorded outputs suggest decorated functions are skipped when
# it is True.
IS_SKIP_TEST = True

## Define GPT-2 Model

In [5]:
class GPTModel(nn.Module):
    """GPT-2 style model: embeddings, transformer blocks, vocabulary head.

    The forward pass sums token and learned positional embeddings, applies
    dropout, runs the result through ``n_layers`` instances of
    ``model_wrapper.TransformerBlock``, normalizes it with
    ``model_wrapper.LayerNorm`` and projects it to ``vocab_size`` logits.

    Note: ``tok_emb`` and ``out_head`` are created as separate modules; this
    class does not tie their weights.
    """

    def __init__(self, cfg):
        """Build the layers from a configuration mapping.

        Args:
            cfg: Mapping that provides ``vocab_size``, ``emb_dim``,
                ``context_len``, ``drop_rate`` and ``n_layers``. The whole
                mapping is also passed to ``model_wrapper.TransformerBlock``,
                which may read further keys.
        """
        super().__init__()
        # Construction order is kept as-is so seeded parameter initialization
        # stays reproducible.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_len"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[model_wrapper.TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = model_wrapper.LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Compute vocabulary logits for a batch of token indices.

        Args:
            in_idx: 2-D tensor of token indices (unpacked as batch, sequence)
                suitable for ``nn.Embedding`` lookup.

        Returns:
            The output of ``out_head``; its last dimension is ``vocab_size``.
            Presumably shaped (batch, seq_len, vocab_size), assuming the
            transformer blocks and final norm preserve shape.

        Raises:
            IndexError: If the sequence is longer than the positional
                embedding table (``context_len``).
            ValueError: If ``in_idx`` is not 2-D (shape unpacking fails).
        """
        # Unpacking into exactly two names also enforces a 2-D input.
        _, seq_len = in_idx.shape

        # Fail early with explicit lengths instead of an opaque out-of-range
        # error raised from inside the positional embedding lookup.
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            raise IndexError(
                f"sequence length {seq_len} exceeds the maximum "
                f"context length {max_len}"
            )

        tok_embeds = self.tok_emb(in_idx)
        # Positions are created on the input's device so both embeddings match.
        positions = torch.arange(seq_len, device=in_idx.device)
        pos_embeds = self.pos_emb(positions)

        x = self.drop_emb(tok_embeds + pos_embeds)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        return self.out_head(x)
        

### View the model structure

In [None]:
# GPT-2 Small: 12 Transformer decoder layers, hidden size 768, 12 attention heads, about 120 million parameters in total
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_GPT2_model():
    """Build a GPTModel from a GPT-2-small-sized config and print parameter counts.

    Prints the total parameter count and the count that remains after
    subtracting the output head's parameters.

    Returns:
        The constructed ``GPTModel`` instance.
    """
    settings = {
        "num_epochs": 1,
        "batch_size": 1,
        "vocab_size": 50257,
        "context_len": 512,
        "emb_dim": 768,
        # NOTE(review): the header comment describes GPT-2 Small with 12
        # attention heads, but 8 are configured here - confirm which is meant.
        "n_heads": 8,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": False,
    }
    model = GPTModel(settings)

    def count_params(module):
        # Number of scalar entries across all parameters of ``module``.
        return sum(param.numel() for param in module.parameters())

    # Author's note: parameter reduction from "multi attention_new"
    # = (304,556,544 - 163,008,000). Figures are as recorded by the original
    # author and have not been re-verified.
    n_total = count_params(model)
    print(f"Total number of parameters: {n_total:,}")  # recorded: 163,008,000

    # Weight tying: W_emb and W_out would point to the same memory, so training
    # updates a single matrix and avoids maintaining two independent ones.
    # NOTE(review): GPTModel creates tok_emb and out_head as separate modules,
    # so this is a what-if count obtained by subtracting the output head.
    n_tied = n_total - count_params(model.out_head)
    print(
        f"Number of trainable parameters "
        f"considering weight tying: {n_tied:,}"
    )  # recorded: 124,017,408
    return model


test_GPT2_model()

函数 test_GPT2_model 已跳过执行
