# GPT-2 Model Initialized with OpenAI's Pretrained Weights


In [None]:

import torch
import torch.nn as nn
torch.manual_seed(42)
import model_wrapper,models

import tiktoken
from transformers import GPT2LMHeadModel


### Config

In [None]:
# Hyperparameters for the custom GPT model. The architecture entries must match
# the Hugging Face "gpt2" checkpoint that is loaded into the model further down,
# otherwise the pretrained tensors are either rejected or silently misused.
GPT_CONFIG = {
    "num_epochs": 10,
    "batch_size": 4,
    "vocab_size": 50257,   # rows of the checkpoint's token-embedding / lm_head matrix
    "context_len": 1024,
    "emb_dim": 768,
    # The "gpt2" checkpoint was trained with 12 attention heads (head dim 64).
    # Any other divisor of 768 still builds a model, but it slices the
    # pretrained Q/K/V projections into the wrong heads and produces garbage.
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": True,      # GPT-2 uses biases on the Q/K/V projections
}

LR = 1e-3
WEIGHT_DECAY = 0.1


### Select the compute device (CUDA if available, otherwise CPU)

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression: echoes the chosen device when run as a notebook cell.
device

In [None]:
# Load the official GPT-2 model from Hugging Face (it carries the OpenAI weights)
# https://huggingface.co/openai-community/gpt2
hf_gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")
official_state_dict = hf_gpt2.state_dict()

# List every checkpoint tensor with its shape, as a reference for the key
# mapping performed when the weights are copied into the custom model.
for key, tensor in official_state_dict.items():
    print(f"{key}: {tensor.shape}")

## Load weights

In [None]:


def load_openai_weights(model,GPT_CONFIG,official_state_dict):
    """Copy the tensors of a Hugging Face GPT-2 checkpoint into ``model``.

    The checkpoint keys (``transformer.wte``, ``transformer.h.{i}...``) are
    renamed to the custom model's keys (``tok_emb``, ``trf_blocks.{i}...``)
    and the result is loaded with ``model.load_state_dict``.

    Layout note: the Hugging Face GPT-2 implementation stores its projection
    weights (``c_attn``, ``c_proj``, ``c_fc``) as Conv1D weights shaped
    ``(in_features, out_features)``. Every such weight is transposed here so it
    fits a ``(out_features, in_features)`` ``.weight`` slot. This assumes the
    custom model's projections are ``nn.Linear``-style layers -- the model
    class is defined outside this file, so confirm against it.

    Args:
        model: Module whose ``state_dict()`` exposes the ``tok_emb``,
            ``pos_emb``, ``trf_blocks.{i}.*`` and ``final_norm`` keys written
            below.
        GPT_CONFIG: Mapping; only ``GPT_CONFIG['n_layers']`` is read.
        official_state_dict: State dict of the Hugging Face GPT-2 LM-head
            model.

    Returns:
        The same ``model`` instance, with the checkpoint weights loaded.

    Raises:
        KeyError: If an expected checkpoint key is missing.
        ValueError: If a fused ``c_attn`` weight cannot be split into three
            equal Q/K/V parts.
    """
    custom_state_dict = model.state_dict()

    def copied(key):
        # Clone so the custom state dict never aliases checkpoint storage.
        return official_state_dict[key].clone()

    def transposed(key):
        # Conv1D (in, out) -> Linear-style (out, in).
        return official_state_dict[key].T.clone()

    # 1. Embedding tables.
    custom_state_dict["tok_emb.weight"] = copied("transformer.wte.weight")
    custom_state_dict["pos_emb.weight"] = copied("transformer.wpe.weight")

    # 2. Per-block transformer parameters.
    for i in range(GPT_CONFIG['n_layers']):
        src = f"transformer.h.{i}"
        dst = f"trf_blocks.{i}"

        # Layer norm 1.
        custom_state_dict[f"{dst}.norm1.scale"] = copied(f"{src}.ln_1.weight")
        custom_state_dict[f"{dst}.norm1.shift"] = copied(f"{src}.ln_1.bias")

        # Attention: c_attn fuses Q, K and V along the output axis
        # (weight [emb_dim, 3 * emb_dim], bias [3 * emb_dim]).
        c_attn_weight = official_state_dict[f"{src}.attn.c_attn.weight"]
        c_attn_bias = official_state_dict[f"{src}.attn.c_attn.bias"]
        fused_width = c_attn_weight.shape[1]
        if fused_width % 3 != 0:
            raise ValueError(
                f"{src}.attn.c_attn.weight has {fused_width} output columns, "
                "which cannot be split evenly into Q, K and V"
            )
        # Derive the split width from the tensor instead of hard-coding 768,
        # so checkpoints with another embedding size work as well.
        part_width = fused_width // 3
        qkv_weights = torch.split(c_attn_weight, part_width, dim=1)
        qkv_biases = torch.split(c_attn_bias, part_width, dim=0)
        for proj, weight, bias in zip(("W_q", "W_k", "W_v"), qkv_weights, qkv_biases):
            # The slices are square, so a missing transpose raises no shape
            # error -- it silently loads the wrong projection.
            custom_state_dict[f"{dst}.att.{proj}.weight"] = weight.T.clone()
            custom_state_dict[f"{dst}.att.{proj}.bias"] = bias.clone()

        # Output projection that merges the heads (also a Conv1D weight).
        custom_state_dict[f"{dst}.att.out_proj.weight"] = transposed(f"{src}.attn.c_proj.weight")
        custom_state_dict[f"{dst}.att.out_proj.bias"] = copied(f"{src}.attn.c_proj.bias")

        # Layer norm 2.
        custom_state_dict[f"{dst}.norm2.scale"] = copied(f"{src}.ln_2.weight")
        custom_state_dict[f"{dst}.norm2.shift"] = copied(f"{src}.ln_2.bias")

        # Feed-forward network (entries 0 and 3 of ``ff.layers``).
        custom_state_dict[f"{dst}.ff.layers.0.weight"] = transposed(f"{src}.mlp.c_fc.weight")
        custom_state_dict[f"{dst}.ff.layers.0.bias"] = copied(f"{src}.mlp.c_fc.bias")
        custom_state_dict[f"{dst}.ff.layers.3.weight"] = transposed(f"{src}.mlp.c_proj.weight")
        custom_state_dict[f"{dst}.ff.layers.3.bias"] = copied(f"{src}.mlp.c_proj.bias")

    # 3. Final layer norm and output head.
    custom_state_dict["final_norm.scale"] = copied("transformer.ln_f.weight")
    custom_state_dict["final_norm.shift"] = copied("transformer.ln_f.bias")

    # GPT-2 ties its output head to the token embedding (lm_head.weight has the
    # same [vocab, emb_dim] shape as wte.weight). If the custom model exposes an
    # ``out_head.weight`` entry, fill it from wte too: harmless when the model
    # already shares the parameter, and it keeps an untied head from staying at
    # its random initialization.
    if "out_head.weight" in custom_state_dict:
        custom_state_dict["out_head.weight"] = copied("transformer.wte.weight")

    model.load_state_dict(custom_state_dict)
    return model

In [None]:
# Build the custom GPT model and fill it with the pretrained checkpoint tensors
# (load_openai_weights returns the same model instance it was given).
model = load_openai_weights(models.GPTModel(GPT_CONFIG), GPT_CONFIG, official_state_dict)
model.to(device)
model.eval()

tokenizer = tiktoken.get_encoding('gpt2')
start_context = "I turned to Mrs"

# Trailing positional arguments for model_wrapper.generate_and_print.
# NOTE(review): these look like max new tokens, temperature, top-k, top-p and
# the end-of-text token id -- confirm against the helper's signature.
sampling_args = (100, 0.8, 50, 0.95, 50256)

# Print five independent continuations of the same prompt.
for _ in range(5):
    model_wrapper.generate_and_print(model, tokenizer, device, start_context, *sampling_args)

### Save model

In [None]:
# Write the weight-loaded model and its config to disk via the project helper.
# NOTE(review): the third argument is passed as None; it is presumably an
# optional optimizer slot -- confirm against model_wrapper.savemodel.
modelpath = '../model/gpt2_weight.pt'
model_wrapper.savemodel(
    modelpath,
    model,
    None,
    GPT_CONFIG,
)
