# GPT-2 model initialized with OpenAI's pretrained weights


In [None]:

import torch
import torch.nn as nn
torch.manual_seed(42)
import tool ,model_wrapper
from tqdm import tqdm
import tiktoken
from transformers import GPT2LMHeadModel

### Config

In [None]:
# Notebook-level switches (not referenced in the visible part of this notebook).
IS_SKIP_TEST = True
IS_EN = True
IS_TRAIN = True


# Hyper-parameters for the custom GPT model. The architecture values must match
# the pretrained GPT-2 124M checkpoint, otherwise the copied weights are
# interpreted with the wrong layout.
GPT_CONFIG = {
    "num_epochs": 10,
    "batch_size": 4,
    "vocab_size": 50257,  # vocabulary size
    # Context length. GPT-2 was trained with 1024 learned positions, so the
    # pretrained position table has to be truncated to this many rows when
    # it is copied into the model.
    "context_len": 512,
    "emb_dim": 768,       # embedding dimension
    "n_heads": 12,        # attention heads; GPT-2 124M uses 12 (was 8)
    "n_layers": 12,       # number of transformer layers
    "drop_rate": 0.1,     # dropout rate
    "qkv_bias": True,     # query/key/value bias; GPT-2's projections have one
}

TOKEN_TYPE = "gpt2"
# TOKEN_TYPE="cl100k_base"
LR = 1e-3
WEIGHT_DECAY = 0.1

### Select the device (CUDA if available, otherwise CPU)

In [None]:
# Run on the GPU when PyTorch reports CUDA as available; fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device  # bare expression: displayed as the cell output in the notebook

## Define GPT-2 Model

In [None]:
class GPTModel(nn.Module):
    """Decoder-style language model assembled from ``model_wrapper`` parts.

    The network is: token embedding + learned position embedding, dropout,
    ``cfg["n_layers"]`` ``model_wrapper.TransformerBlock`` modules applied in
    sequence, a final ``model_wrapper.LayerNorm`` and a bias-free linear head
    projecting to ``cfg["vocab_size"]`` logits.

    Args:
        cfg: Mapping providing ``vocab_size``, ``context_len``, ``emb_dim``,
            ``drop_rate`` and ``n_layers``; it is also handed unchanged to
            every ``TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        # Sub-modules are created in a fixed order so that seeded parameter
        # initialisation stays reproducible.
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_len"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = []
        for _ in range(cfg["n_layers"]):
            blocks.append(model_wrapper.TransformerBlock(cfg))
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = model_wrapper.LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"], bias=False)

    def forward(self, in_idx):
        """Return the logits produced for a 2-D tensor of token ids.

        Args:
            in_idx: Integer tensor with exactly two dimensions; the second
                one is used as the sequence length.

        Returns:
            The output of ``out_head`` applied to the normalised hidden
            states.
        """
        _, num_tokens = in_idx.shape
        # Position ids 0..num_tokens-1, created on the same device as the input.
        positions = torch.arange(num_tokens, device=in_idx.device)
        # The position embeddings broadcast over the leading (batch) dimension.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.drop_emb(hidden)
        hidden = self.final_norm(self.trf_blocks(hidden))
        return self.out_head(hidden)
        

## Load weights

In [None]:
def load_openai_weights(model, model_size="small"):
    """Copy OpenAI's pretrained GPT-2 weights into the custom ``model``.

    The checkpoint is obtained through Hugging Face's ``GPT2LMHeadModel`` and
    its tensors are written into ``model``'s state dict under the parameter
    names the custom model exposes.

    Args:
        model: A ``GPTModel`` instance; its parameters are overwritten.
        model_size: ``"small"``, ``"medium"``, ``"large"`` or ``"xl"``.

    Returns:
        The same ``model`` object with the pretrained weights loaded.

    Raises:
        ValueError: If ``model`` and the checkpoint differ in layer count.
        KeyError: If a mapped parameter name does not exist in ``model``.
    """
    # The 124M checkpoint is published as plain "gpt2"; only the larger
    # variants carry a suffix ("gpt2-medium", "gpt2-large", "gpt2-xl").
    checkpoint_id = "gpt2" if model_size == "small" else f"gpt2-{model_size}"

    # Load the official Hugging Face GPT-2 model (contains the OpenAI weights).
    official_model = GPT2LMHeadModel.from_pretrained(checkpoint_id)
    official_state_dict = official_model.state_dict()

    # State dict of the custom model; entries are replaced and loaded back.
    custom_state_dict = model.state_dict()

    # GPTModel has no ``config`` attribute, so the layer count is taken from
    # the checkpoint and checked against the blocks the model really has.
    n_layer = official_model.config.n_layer
    if len(model.trf_blocks) != n_layer:
        raise ValueError(
            f"model has {len(model.trf_blocks)} transformer blocks but "
            f"checkpoint '{checkpoint_id}' has {n_layer}"
        )

    # 1. Embeddings, final layer norm and output head
    #    (custom parameter name -> checkpoint parameter name).
    key_map = {
        "tok_emb.weight": "transformer.wte.weight",
        "pos_emb.weight": "transformer.wpe.weight",
        "final_norm.weight": "transformer.ln_f.weight",
        "final_norm.bias": "transformer.ln_f.bias",
        # GPT-2 ties lm_head to wte. GPTModel.out_head is a separate
        # nn.Linear, so it must receive the token-embedding matrix explicitly.
        "out_head.weight": "transformer.wte.weight",
    }

    # 2. Per-block parameters (custom suffix, checkpoint suffix).
    # NOTE(review): the names inside model_wrapper.TransformerBlock and
    # model_wrapper.LayerNorm are not visible from this file. The suffixes
    # below are the ones the original mapping used -- confirm them against
    # model_wrapper. The check further down reports any that do not exist.
    block_params = (
        # self-attention
        ("attn.c_attn.weight", "attn.c_attn.weight"),
        ("attn.c_attn.bias", "attn.c_attn.bias"),
        ("attn.c_proj.weight", "attn.c_proj.weight"),
        ("attn.c_proj.bias", "attn.c_proj.bias"),
        ("attn.ln_1.weight", "ln_1.weight"),
        ("attn.ln_1.bias", "ln_1.bias"),
        # feed-forward network
        ("mlp.c_fc.weight", "mlp.c_fc.weight"),
        ("mlp.c_fc.bias", "mlp.c_fc.bias"),
        ("mlp.c_proj.weight", "mlp.c_proj.weight"),
        ("mlp.c_proj.bias", "mlp.c_proj.bias"),
        ("mlp.ln_2.weight", "ln_2.weight"),
        ("mlp.ln_2.bias", "ln_2.bias"),
    )
    for i in range(n_layer):
        for custom_suffix, official_suffix in block_params:
            key_map[f"trf_blocks.{i}.{custom_suffix}"] = (
                f"transformer.h.{i}.{official_suffix}"
            )

    # Writing to a key the model does not own would only surface later as an
    # "unexpected key" error from load_state_dict; fail early and name them.
    unknown = sorted(name for name in key_map if name not in custom_state_dict)
    if unknown:
        raise KeyError(
            "parameter names not present in the custom model "
            "(adjust the mapping to match model_wrapper): " + ", ".join(unknown)
        )

    for custom_name, official_name in key_map.items():
        custom_state_dict[custom_name] = official_state_dict[official_name]

    # The checkpoint's position table may be longer than the model's context
    # window; keep only the rows for the positions the model can address.
    context_len = model.pos_emb.num_embeddings
    custom_state_dict["pos_emb.weight"] = (
        official_state_dict["transformer.wpe.weight"][:context_len]
    )

    # NOTE(review): Hugging Face stores c_attn / c_fc / c_proj as Conv1D with
    # weights shaped (in_features, out_features). If model_wrapper uses
    # nn.Linear these need transposing; square matrices would otherwise load
    # without any shape error -- verify against model_wrapper.

    # Load the remapped weights into the custom model.
    model.load_state_dict(custom_state_dict)
    return model

In [None]:
# Build the custom model, fill it with the pretrained weights, then move it to
# the selected device and switch it to evaluation mode.
model = load_openai_weights(GPTModel(GPT_CONFIG))
model = model.to(device).eval()

tokenizer = tiktoken.get_encoding(TOKEN_TYPE)
start_context = 'this is test'

# Generate and print five samples from the same prompt.
for _ in range(5):
    model_wrapper.generate_and_print(model, tokenizer, device, start_context, 100)

## Save and load the parameters of the model
### Save model

In [None]:
# modelpath ='../model/model_and_optimizer.pth'
# model_wrapper.savemodel(modelpath,model,optimizer)
