# GPT-2 Model with OpenAI model's weights


In [175]:

import torch
import torch.nn as nn
torch.manual_seed(42)
import model_wrapper,models

import tiktoken
from transformers import GPT2LMHeadModel,GPT2Tokenizer


### Config

In [176]:
GPT_CONFIG = {
    "num_epochs":10,
    "batch_size":4,
    "vocab_size": 50257,    
    "context_len": 1024,  
    "emb_dim": 768,          
    "n_heads": 8,        
    "n_layers": 12,        
    "drop_rate": 0.1,  
    "initializer_range":0.02,   
    "qkv_bias": True  #GPT2 为True
}

LR= 1e-3
WEIGHT_DECAY =0.1
prompt ="the weather is hot"
# prompt ="It's really hot today, so I don't feel like going out."
max_len = 50
temperature = 0.8
top_k = 50

### Set device to (type='cuda')

In [177]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## 对比tiktoken和GPT2Tokenizer

In [178]:

def diff_tokenizer(txt,model_name):
    gpt_tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    gpt_ids = gpt_tokenizer.encode(txt, return_tensors="pt")
    print(f"gpt_ids:{gpt_ids}")
    ti_tokenizer = tiktoken.get_encoding(model_name)
    ti_ids = ti_tokenizer.encode(txt, allowed_special={'<|endoftext|>'})
    print(f"gpt_ids:{ti_ids}")
    #交叉解码验证
    output_gpt = gpt_tokenizer.decode(ti_ids, skip_special_tokens=True)
    output_ti = ti_tokenizer.decode(gpt_ids[0].squeeze(0).tolist())
    print(output_gpt)
    print(output_ti)
    
diff_tokenizer(prompt,'gpt2')

gpt_ids:tensor([[1169, 6193,  318, 3024]])
gpt_ids:[1169, 6193, 318, 3024]
the weather is hot
the weather is hot


### 验证tokenizer方法和官方的区别

In [179]:
def generate_text(prompt, model_name="gpt2", max_length=100, temperature=0.7,top_k=50):
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    input_ids = tokenizer.encode(prompt, return_tensors="pt", add_special_tokens=True)
    
    eos_id = tokenizer.eos_token_id
    if input_ids[0, -1] != eos_id: #手动在末尾加eos
        input_ids = torch.cat([input_ids, torch.tensor([[eos_id]], dtype=input_ids.dtype)], dim=1)
    print(input_ids)
    
    ti_tokenizer = tiktoken.get_encoding(model_name)
    ti_ids = model_wrapper.texts_to_tokenIds(prompt,ti_tokenizer,max_length=None)
    print(ti_ids)
    
    print(f"encode is same:{input_ids==ti_ids}")
    
    model = GPT2LMHeadModel.from_pretrained(model_name)
    
    with torch.no_grad(): 
        output = model.generate(
            ti_ids,
            max_length=max_length,
            num_return_sequences=1,  
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            do_sample=True,
            temperature=temperature,  
            top_k=top_k,  
        )
    ti_gen_text = model_wrapper.tokenIds_to_texts(output[0],ti_tokenizer)
    gpt_gen_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(f"decode is same:{ti_gen_text==gpt_gen_text}")
    return gpt_gen_text,ti_gen_text

In [180]:

generated = generate_text(
        prompt=prompt,
        model_name="gpt2",  
        max_length=max_len,
        temperature=temperature,
        top_k =top_k
    )
generated

tensor([[ 1169,  6193,   318,  3024, 50256]])
tensor([[ 1169,  6193,   318,  3024, 50256]])
encode is same:tensor([[True, True, True, True, True]])
decode is same:True


("the weather is hotI've been in the business for about a year now, having spent time with people who are making music. I've watched people try to make music, and get paid to make it, which isn't an easy business to",
 "the weather is hotI've been in the business for about a year now, having spent time with people who are making music. I've watched people try to make music, and get paid to make it, which isn't an easy business to")

结论：tokenizer是一致的

## 查看官方GPT2模型架构

In [181]:
# 加载Hugging Face的官方GPT-2模型（包含OpenAI权重）
# https://huggingface.co/openai-community/gpt2
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py

gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2") 
official_state_dict = gpt2_model .state_dict()
# for name, param in official_state_dict.items():
#             print(f"{name}: {param.shape}")
gpt2_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Load weights

复用GPT2的预训练weights值

In [182]:

def load_openai_weights(model,GPT_CONFIG,official_state_dict):
    custom_state_dict = model.state_dict()
    
    # 1. 映射嵌入层权重
    custom_state_dict["tok_emb.weight"] =  official_state_dict["transformer.wte.weight"].clone()
    custom_state_dict["pos_emb.weight"] = official_state_dict["transformer.wpe.weight"].clone()
    
    # 2. 映射Transformer层权重（每个block的参数）
    for i in range(GPT_CONFIG['n_layers']):
        # layer norm 1
        custom_state_dict[f"trf_blocks.{i}.norm1.scale"] = official_state_dict[f"transformer.h.{i}.ln_1.weight"].clone()
        custom_state_dict[f"trf_blocks.{i}.norm1.shift"] = official_state_dict[f"transformer.h.{i}.ln_1.bias"].clone()
        
        #attention 
        #拆分c_attn为W_q、W_k、W_v（c_attn.weight形状：[768, 2304] = [768, 768*3]）
        c_attn_weight = official_state_dict[f"transformer.h.{i}.attn.c_attn.weight"] #[768, 2304]
        w_q_weight, w_k_weight, w_v_weight = torch.split(c_attn_weight, 768, dim=1)
        custom_state_dict[f"trf_blocks.{i}.att.W_q.weight"] = w_q_weight.T.clone()
        custom_state_dict[f"trf_blocks.{i}.att.W_k.weight"] = w_k_weight.T.clone()
        custom_state_dict[f"trf_blocks.{i}.att.W_v.weight"] = w_v_weight.T.clone()
        c_attn_bias = official_state_dict[f"transformer.h.{i}.attn.c_attn.bias"] #[2304]
        w_q_bias, w_k_bias, w_v_bias = torch.split(c_attn_bias, 768, dim=0) 
        custom_state_dict[f"trf_blocks.{i}.att.W_q.bias"] = w_q_bias. clone()
        custom_state_dict[f"trf_blocks.{i}.att.W_k.bias"] = w_k_bias.clone()
        custom_state_dict[f"trf_blocks.{i}.att.W_v.bias"] = w_v_bias.clone()
        # out_proj  融合多头
        custom_state_dict[f"trf_blocks.{i}.att.c_proj.weight"] = official_state_dict[f"transformer.h.{i}.attn.c_proj.weight"].T.clone()
        custom_state_dict[f"trf_blocks.{i}.att.c_proj.bias"] = official_state_dict[f"transformer.h.{i}.attn.c_proj.bias"].clone()
        
        # layer norm 2
        custom_state_dict[f"trf_blocks.{i}.norm2.scale"] = official_state_dict[f"transformer.h.{i}.ln_2.weight"].clone()
        custom_state_dict[f"trf_blocks.{i}.norm2.shift"] = official_state_dict[f"transformer.h.{i}.ln_2.bias"].clone()
        
        # print(official_state_dict[f"transformer.h.{i}.mlp.c_fc.weight"].shape)
        # FFN 
        custom_state_dict[f"trf_blocks.{i}.ff.c_fc.weight"] = official_state_dict[f"transformer.h.{i}.mlp.c_fc.weight"].T.clone()  #转置目的是解决不同框架（TensorFlow → PyTorch）间线性层权重维度的定义差异
        custom_state_dict[f"trf_blocks.{i}.ff.c_fc.bias"] = official_state_dict[f"transformer.h.{i}.mlp.c_fc.bias"].clone()

        custom_state_dict[f"trf_blocks.{i}.ff.c_proj.weight"] = official_state_dict[f"transformer.h.{i}.mlp.c_proj.weight"].T.clone() #同理
        custom_state_dict[f"trf_blocks.{i}.ff.c_proj.bias"] = official_state_dict[f"transformer.h.{i}.mlp.c_proj.bias"].clone()

    
    # 3. 映射最终层归一化和输出层
    custom_state_dict["final_norm.scale"] = official_state_dict["transformer.ln_f.weight"].clone()
    custom_state_dict["final_norm.shift"] = official_state_dict["transformer.ln_f.bias"].clone()
    
    
    # 4. (out_head)与tok_emb共享权重，理论上无需额外映射 (对应gpt2的lm_head.weight: torch.Size([50257, 768]))
    # but 测试发现在load_state_dict并没有参数共享，手动赋值吧
    custom_state_dict["out_head.weight"] = official_state_dict["lm_head.weight"].clone()
    
    model.load_state_dict(custom_state_dict)
    # 测试lm_head层的值
    # print( model.out_head.weight.clone() == official_state_dict["transformer.wte.weight"].clone())
    return model

In [183]:

model = models.GPTModel(GPT_CONFIG)
load_openai_weights(model,GPT_CONFIG,official_state_dict)


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttendtion_new(
        (W_q): Linear(in_features=768, out_features=768, bias=True)
        (W_k): Linear(in_features=768, out_features=768, bias=True)
        (W_v): Linear(in_features=768, out_features=768, bias=True)
        (c_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (norm2): LayerNorm()
      (ff): FeedForward(
        (c_fc): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU()
        (dropout): Dropout(p=0.1, inplace=False)
        (c_proj): Linear(in_features=3072, out_features=768, bias=True)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (norm1): LayerNorm()
      (att): MultiHeadAttendtion_new(
       

### 对比两个模型的文本输出

In [184]:
def gen_next_text_greedy(model,tokenizer,idxs,max_new_tokens,context_size,is_official=False):
    model.eval()
    logits_list =[]
    for i in range(max_new_tokens):
        idx_condition = idxs[:,-context_size:]
        with torch.no_grad():
            if is_official:
                official_outputs = model(idx_condition)
                logits = official_outputs.logits
            else:
                logits = model(idx_condition)
            
        #生成时：只需要最后一个位置的 logits
        logits = logits[:,-1,:]
        logits_list.append(logits)
        probas =torch.softmax(logits,dim=-1)
        idx_next = torch.argmax(probas,dim=-1,keepdim=True)
        idxs = torch.cat((idxs,idx_next),dim=1)
        print(f'{i}:{model_wrapper.tokenIds_to_texts(idxs,tokenizer)}')
    return  idxs,logits_list

def gen_next_word_greedy(idxs,logits):
    #生成时：只需要最后一个位置的 logits
    logits = logits[:,-1,:]
    # print(logits)
    probas =torch.softmax(logits,dim=-1)
    idx_next = torch.argmax(probas,dim=-1,keepdim=True)
    idxs = torch.cat((idxs,idx_next),dim=1)
    return idxs,logits

def diff_logits(o_logits,c_logits, atol=1e-3):
    for i  in range(len(o_logits)):
        if torch.allclose(o_logits[i], c_logits[i], atol=atol):
              print(f"✅ 模型输出在误差容限 {atol} 内完全一致")
        diff = torch.abs(o_logits[i] - c_logits[i])
        max_diff = diff.max().item()
        mean_diff = diff.mean().item()
        print(f"❌第{i}次")
        print(f"最大误差: {max_diff:.6f}")
        print(f"平均误差: {mean_diff:.6f}")
        
            
def compare_models(custom_model, official_model, tokenizer, prompt="Hello, world!", atol=1e-3):
    """
    对比自定义模型与官方GPT2模型的输出一致性
    """
    # 1. 准备输入
    input_ids = model_wrapper.texts_to_tokenIds(prompt,tokenizer,max_length=None)
    print(input_ids.shape)
    # 2. 统一设备（使用官方模型所在设备）
    device = next(official_model.parameters()).device
    input_ids = input_ids.to(device)
    custom_model = custom_model.to(device)  # 确保自定义模型在同一设备

    # 3. 切换为推理模式
    custom_model.eval()
    official_model.eval()
    
    # 4. 获取输出（关闭梯度计算）
    with torch.no_grad():
        # 自定义模型直接返回logits
        custom_logits = custom_model(input_ids)  
        # 官方模型从输出对象中提取logits
        official_outputs = official_model(input_ids)
        official_logits = official_outputs.logits
    official_out_ids,o_logits = gen_next_word_greedy(input_ids,official_logits)
    official_text = model_wrapper.tokenIds_to_texts(official_out_ids,tokenizer)
    custom_out_ids,c_logits = gen_next_word_greedy(input_ids,custom_logits)
    custom_out_text = model_wrapper.tokenIds_to_texts(custom_out_ids,tokenizer)
    
    print("text same:",official_text==custom_out_text)
    diff_logits(o_logits,c_logits)
    print(official_text,custom_out_text)
    
    official_out_ids,o_logits = gen_next_text_greedy(official_model,tokenizer,input_ids,20,256,is_official=True)
    custom_out_ids,c_logits = gen_next_text_greedy(custom_model,tokenizer,input_ids,20,256)
    
    diff_logits(o_logits,c_logits)

In [185]:
ti_tokenizer = tiktoken.get_encoding('gpt2')
compare_models(model,gpt2_model,ti_tokenizer,prompt=prompt)

torch.Size([1, 5])
text same: True
❌第0次
最大误差: 2.757355
平均误差: 1.830840
['the weather is hotThe'] ['the weather is hotThe']
0:['the weather is hotThe']
1:['the weather is hotThe first']
2:['the weather is hotThe first time']
3:['the weather is hotThe first time I']
4:['the weather is hotThe first time I saw']
5:['the weather is hotThe first time I saw the']
6:['the weather is hotThe first time I saw the new']
7:['the weather is hotThe first time I saw the new "']
8:['the weather is hotThe first time I saw the new "The']
9:['the weather is hotThe first time I saw the new "The Walking']
10:['the weather is hotThe first time I saw the new "The Walking Dead']
11:['the weather is hotThe first time I saw the new "The Walking Dead"']
12:['the weather is hotThe first time I saw the new "The Walking Dead" trailer']
13:['the weather is hotThe first time I saw the new "The Walking Dead" trailer,']
14:['the weather is hotThe first time I saw the new "The Walking Dead" trailer, I']
15:['the weather i

结论：load weights匹配模型和原有模型生成的logits会有误差。因为是自回归模型，logits的误差会累积，测试不同的prompt发现，某些prompt在两个模型下生成的文本一致，但是如果模型某一步导致误差较大，越往后生成，文本偏差越大

### Save model

In [186]:
modelpath ='../model/gpt2_weight.pt'
model_wrapper.savemodel(modelpath,model,None,GPT_CONFIG)
