# GPT-2 Model


In [1]:

import torch
import torch.nn as nn
from torch.testing import assert_close
torch.manual_seed(42)
import tool, loaddata ,model_wrapper
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_loss 已跳过执行


### Config

In [2]:
# --- Run switches ---------------------------------------------------------
IS_SKIP_TEST = True   # handed to tool.skip_execution(skip=...) on the smoke-test helpers
IS_EN = True          # True -> loaddata.load_data_en, False -> loaddata.load_data_cn
IS_TRAIN = True       # handed to tool.train_execution(...) on the training function

# --- Model / training hyper-parameters -------------------------------------
GPT_CONFIG = {
    "num_epochs": 10,
    "batch_size": 4,
    "vocab_size": 50257,   # vocabulary size
    "context_len": 256,    # context length (rows of the positional embedding table)
    "emb_dim": 768,        # embedding dimension
    "n_heads": 8,          # number of attention heads
    "n_layers": 12,        # number of transformer blocks
    "drop_rate": 0.1,      # dropout rate
    "qkv_bias": False,     # bias on the query/key/value projections
}

# Encoding name passed to tiktoken.get_encoding and to the data loaders.
TOKEN_TYPE = "gpt2"
# TOKEN_TYPE = "cl100k_base"

# --- Optimizer settings (AdamW) --------------------------------------------
LR = 1e-3
WEIGHT_DECAY = 0.1

# --- Evaluation cadence ----------------------------------------------------
EVAL_FREQ = 5   # evaluate every EVAL_FREQ optimizer steps
EVAL_ITER = 5   # forwarded to model_wrapper.evaluate_model as eval_iter


In [3]:
! nvidia-smi

Wed Aug 13 14:16:28 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 960       WDDM  |   00000000:01:00.0  On |                  N/A |
| 30%   51C    P8             18W /  180W |    1209MiB /   4096MiB |     17%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Select the device (CUDA if available, otherwise CPU)

In [4]:
# Use the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

### Tensorboard Log

In [5]:
# Create the TensorBoard log writer for this experiment.
import time

starttime = time.strftime("%Y-%m-%d_%H-%M-%S")
print("Start experiment:", starttime)

# The log directory is named after the start time. starttime[:13] keeps
# "YYYY-MM-DD_HH", i.e. hour resolution, which is usually enough to tell
# experiments apart; change the slice if finer resolution is wanted.
log_writer = SummaryWriter(
    log_dir=f"../log/{starttime[:13]}",
    comment=starttime[:13],
    flush_secs=60,
)


Start experiment: 2025-08-13_14-16-28


## Define GPT-2 Model

In [6]:
class GPTModel(nn.Module):
    """GPT-style language model.

    Pipeline: token embedding + positional embedding -> dropout -> stack of
    ``model_wrapper.TransformerBlock`` -> final ``model_wrapper.LayerNorm``
    -> bias-free linear projection onto the vocabulary.
    """

    def __init__(self, cfg):
        """Build the layers from a config mapping.

        Keys read directly here: ``vocab_size``, ``emb_dim``, ``context_len``,
        ``drop_rate`` and ``n_layers``. The whole ``cfg`` is also passed to
        every ``model_wrapper.TransformerBlock``.
        """
        super().__init__()
        vocab_size = cfg['vocab_size']
        emb_dim = cfg['emb_dim']

        # Sub-modules are created in the same order as before so that
        # seeded parameter initialisation is unaffected.
        self.tok_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_emb = nn.Embedding(cfg['context_len'], emb_dim)
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        blocks = [model_wrapper.TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm = model_wrapper.LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(emb_dim, vocab_size, bias=False)

    def forward(self, in_idx):
        """Map token ids to next-token logits.

        Args:
            in_idx: 2-D integer tensor of token ids, (batch_size, seq_len).

        Returns:
            Logits produced by ``out_head``; last dimension is ``vocab_size``.
        """
        _, seq_len = in_idx.shape
        # Positions 0 .. seq_len-1, created on the same device as the input.
        positions = torch.arange(seq_len, device=in_idx.device)
        # (batch_size, seq_len, emb_dim) + (seq_len, emb_dim), broadcast over the batch.
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)
        hidden = self.trf_blocks(self.drop_emb(hidden))
        return self.out_head(self.final_norm(hidden))
        

### View the structure of the model

In [7]:
# Reference: GPT-2 small has 12 transformer decoder layers, hidden size 768,
# 12 attention heads and roughly 120M parameters.
# NOTE(review): CONFIG below uses 8 heads rather than 12 - confirm this is intended.
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_GPT2_model():
    """Build a GPTModel from a local config and print its parameter counts.

    Returns:
        The constructed model, already moved to the module-level ``device``.
    """
    CONFIG = {
        "num_epochs": 1,
        "batch_size": 1,
        "vocab_size": 50257,
        "context_len": 512,
        "emb_dim": 768,
        "n_heads": 8,
        "n_layers": 12,
        "drop_rate": 0.1,
        "qkv_bias": False,
    }
    # nn.Module.to returns the module itself, so this can be chained.
    model = GPTModel(CONFIG).to(device)

    # Original author's note: parameter reduction with attention_new =
    # (304,556,544 - 163,008,000). Figures not re-verified here.
    total_params = sum(param.numel() for param in model.parameters())
    print(f"Total number of parameters: {total_params:,}") # author recorded 163,008,000

    # With weight tying, the embedding matrix and the output projection would
    # share one tensor, so only one matrix is stored and updated. This model
    # does NOT tie them; subtracting out_head's parameters only reports what
    # the count would be if it did.
    out_head_params = sum(param.numel() for param in model.out_head.parameters())
    total_params_gpt2 = total_params - out_head_params

    print(f"Number of trainable parameters "
        f"considering weight tying: {total_params_gpt2:,}") # author recorded 124,017,408
    return model

test_GPT2_model()

函数 test_GPT2_model 已跳过执行


## Load Data

### Select the language of the text to load

In [8]:
# Choose the corpus by language flag; both loaders return a (train, valid) pair.
if not IS_EN:
    # Meaning of (True, 0.8) is defined by loaddata.load_data_cn - not visible here.
    train_data, valid_data = loaddata.load_data_cn(True, 0.8)
else:
    file_path = "../datasets/the-verdict.txt"
    train_data, valid_data = loaddata.load_data_en(file_path)

len(train_data)

total char: 20479
train char: 16383
valid char: 4096 



1

## GPTDataLoader

In [9]:

# Keyword arguments that are identical for the training and validation loaders.
# (`num_works` is the keyword name this project's GPTDataloader is called with.)
_shared_loader_kwargs = dict(
    batch_size=GPT_CONFIG['batch_size'],
    max_len=GPT_CONFIG["context_len"],
    num_works=0,
)

# --- Training loader -------------------------------------------------------
train_loader = model_wrapper.GPTDataloader(
    train_data,
    TOKEN_TYPE,
    stride=GPT_CONFIG["context_len"] // 2,  # moderate overlap: stride = max_len // 2
    drop_last=True,
    shuffle=True,  # shuffle during training to help generalisation
    **_shared_loader_kwargs,
)

print(F'共{len(train_loader)}个批次，'
      f'每批{train_loader.batch_size}个样本，'
      f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

print("Train loader:")
# Only peek at a batch when the loader is small; on large datasets this is slow.
if len(train_loader) < 10:
    x, y = next(iter(train_loader))
    print(x.shape, y.shape)

# --- Validation loader -----------------------------------------------------
valid_loader = model_wrapper.GPTDataloader(
    valid_data,
    TOKEN_TYPE,
    stride=GPT_CONFIG["context_len"],
    drop_last=False,  # evaluation should cover every sample, so keep the last partial batch
    shuffle=False,    # fixed order keeps evaluation reproducible and comparable across models
    **_shared_loader_kwargs,
)

print("Validation loader:")
if len(valid_loader) < 10:
    x, y = next(iter(valid_loader))
    print(x.shape, y.shape)


Process text: 100%|██████████| 1/1 [00:00<00:00, 111.10it/s]


Total samples: 31
共7个批次，每批4个样本，每个样本是长度为 256 的 token 序列
Train loader:
torch.Size([4, 256]) torch.Size([4, 256])


Process text: 100%|██████████| 1/1 [00:00<00:00, 500.51it/s]

Total samples: 4
Validation loader:
torch.Size([4, 256]) torch.Size([4, 256])





## Training model
### Training function

In [10]:
@tool.train_execution(IS_TRAIN)
def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every ``eval_freq`` optimizer steps (step 0 included) the function calls
    ``model_wrapper.evaluate_model``, records the losses, writes them to the
    module-level ``log_writer`` and prints a short generated sample.

    Args:
        model: Module to optimise; switched to train mode at the start of
            each epoch and after each periodic evaluation.
        train_loader: Iterable yielding ``(input_batch, target_batch)`` pairs.
        valid_loader: Loader forwarded to ``model_wrapper.evaluate_model``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Forwarded to the ``model_wrapper`` helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``model_wrapper.evaluate_model``.
        start_context: Prompt text forwarded to ``generate_and_print``.
        tokenizer: Tokenizer forwarded to ``generate_and_print``.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three lists with
        one entry per evaluation. ``track_tokens_seen`` holds the cumulative
        number of input tokens processed at each evaluation.

    Note:
        The function is wrapped by ``tool.train_execution(IS_TRAIN)``. What
        the wrapper returns when training is disabled is not visible here, so
        callers should check the result before unpacking it.
    """
    train_losses,val_losses= [],[]
    track_tokens_seen =[]
    # global_step starts at -1 so the first batch is step 0 and is evaluated.
    tokens_seen,global_step = 0, -1
    for epoch in  tqdm(range(num_epochs), desc="training"):
        model.train()
        for input_batch,target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            loss = model_wrapper.calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step +=1

            if global_step % eval_freq == 0:
                train_loss, val_loss = model_wrapper.evaluate_model(model,train_loader,valid_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                # Fix: record the running token count. The original appended
                # the list to itself, producing a self-referencing list.
                track_tokens_seen.append(tokens_seen)
                # The print below anticipates None losses, so skip logging
                # them rather than letting add_scalar fail on None.
                if train_loss is not None:
                    log_writer.add_scalar('Training/Train_Loss', train_loss, global_step)
                if val_loss is not None:
                    log_writer.add_scalar('Training/Val_Loss', val_loss, global_step)
                print(f'EP: {epoch+1} STEP: {global_step} '
                    f'{f"T_LOSS: {train_loss:.3f}" if train_loss is not None else "T_LOSS: None "} '
                    f'{f"V_LOSS: {val_loss:.3f}" if val_loss is not None else "V_LOSS: None"}'
                    )
                model_wrapper.generate_and_print(model,tokenizer,device,start_context,20)
                # The helpers above may leave the model in eval mode (their
                # bodies are not visible here); re-enable train mode so the
                # rest of the epoch still uses dropout. No-op if already set.
                model.train()
    return train_losses,val_losses,track_tokens_seen

In [11]:
# ! tensorboard --logdir=..\log

In [12]:
# Build a fresh model on the target device, then its optimizer and tokenizer.
model = GPTModel(GPT_CONFIG)
model.to(device)
start_context = "I turned to Mrs"
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LR,
    weight_decay=WEIGHT_DECAY,
)
tokenizer = tiktoken.get_encoding(TOKEN_TYPE)

result = train_model_process(
    model,
    train_loader,
    valid_loader,
    optimizer,
    device,
    num_epochs=GPT_CONFIG['num_epochs'],
    eval_freq=EVAL_FREQ,
    eval_iter=EVAL_ITER,
    start_context=start_context,
    tokenizer=tokenizer,
)

# The training function is wrapped by tool.train_execution, so only unpack
# when it actually returned something truthy.
if result:
    train_losses, val_losses, track_tokens_seen = result

training:   0%|          | 0/10 [00:00<?, ?it/s]

EP: 1 STEP: 0 T_LOSS: 9.441 V_LOSS: 9.808




I turned to Mrs,,,,,,,,,,,,,,,,,, the,




EP: 1 STEP: 5 T_LOSS: 6.566 V_LOSS: 7.109




I turned to Mrs                    


training:  10%|█         | 1/10 [00:26<03:55, 26.15s/it]

EP: 2 STEP: 10 T_LOSS: 5.890 V_LOSS: 7.156




I turned to Mrs.                   


training:  20%|██        | 2/10 [00:54<03:38, 27.32s/it]

EP: 3 STEP: 15 T_LOSS: 5.849 V_LOSS: 7.241




I turned to Mrs." the the the the the the the-- the the the the the the the the the the




EP: 3 STEP: 20 T_LOSS: 5.743 V_LOSS: 7.275


training:  30%|███       | 3/10 [01:26<03:27, 29.70s/it]

I turned to Mrs.                   




EP: 4 STEP: 25 T_LOSS: 5.641 V_LOSS: 7.253




I turned to Mrs. G, the, and, and, and, and, and, and, and, and


training:  40%|████      | 4/10 [01:55<02:56, 29.49s/it]

EP: 5 STEP: 30 T_LOSS: 5.655 V_LOSS: 7.591




I turned to Mrs the, the the                


training:  50%|█████     | 5/10 [02:24<02:25, 29.04s/it]

EP: 6 STEP: 35 T_LOSS: 5.495 V_LOSS: 7.422




I turned to Mrs the the the the the the the, and the the.        




EP: 6 STEP: 40 T_LOSS: 5.007 V_LOSS: 7.250




I turned to Mrs.  the a the picture, and he was a         


training:  60%|██████    | 6/10 [02:56<01:59, 29.97s/it]

EP: 7 STEP: 45 T_LOSS: 5.242 V_LOSS: 7.417




I turned to Mrs the a a a to the a to the a a to the a little the a--and the


training:  70%|███████   | 7/10 [03:22<01:26, 28.72s/it]

EP: 8 STEP: 50 T_LOSS: 4.615 V_LOSS: 7.212




I turned to Mrs. "--the. "--the. ". "--the. 




EP: 8 STEP: 55 T_LOSS: 3.959 V_LOSS: 6.939


training:  80%|████████  | 8/10 [03:53<00:59, 29.61s/it]

I turned to Mrs. "Oh, and he had been the fact and I had been the picture. Gis




EP: 9 STEP: 60 T_LOSS: 3.186 V_LOSS: 6.899




I turned to Mrs. I felt to see the house in the house, and I had been the house in the house


training:  90%|█████████ | 9/10 [04:19<00:28, 28.52s/it]

EP: 10 STEP: 65 T_LOSS: 2.624 V_LOSS: 6.927




I turned to Mrs. Gisburn, so often, in him, so that, with him, I was not


training: 100%|██████████| 10/10 [04:48<00:00, 28.86s/it]


In [13]:
# Training is finished: close the TensorBoard writer created earlier in this notebook.
log_writer.close()

## Save and load the model state
### Save model

In [14]:
# Checkpoint location. Forward slashes are used because the previous
# '..\model\...' literal relied on the invalid escape sequence '\m' and only
# resolved to a directory path on Windows. This form works on Windows and
# POSIX alike and matches the "../log/" and "../datasets/" paths used elsewhere.
modelpath = "../model/model_and_optimizer.pth"
# Persist the model and optimizer through the project helper.
model_wrapper.savemodel(modelpath, model, optimizer)


### Load model

In [15]:
# Restore model and optimizer from the checkpoint written by the save cell.
checkpoint = torch.load(modelpath, map_location=device)

model = GPTModel(GPT_CONFIG)
model.load_state_dict(checkpoint["model_state_dict"])
# Move the model BEFORE building the optimizer and loading its state, so the
# restored optimizer state lives on the same device as the parameters.
# Previously the state was loaded while the parameters were still on the CPU
# and the model was moved afterwards, which can cause a device mismatch if
# training is resumed.
model.to(device)

# The hyper-parameters passed here are placeholders: load_state_dict replaces
# the param-group settings with the ones stored in the checkpoint. The config
# constants are used so the cell does not suggest a different learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])

# Switch to inference mode (disables dropout); also displays the module tree.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttendtion_new(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttendtion_new

### Test the generation capability of the saved model

In [16]:
# Print five continuations of start_context (100 is forwarded to
# generate_and_print, as 20 was during training).
# NOTE(review): the recorded outputs are identical, so generation appears to
# be deterministic with the model in eval mode - confirm whether repeats are useful.
for _sample_idx in range(5):
    model_wrapper.generate_and_print(model, tokenizer, device, start_context, 100)

I turned to Mrs. Gisburn's the picture--and he had been the fact with a deprecating a failure, and str? No--for, and threw back his wife, and he had again, and left behind.                                                        
I turned to Mrs. Gisburn's the picture--and he had been the fact with a deprecating a failure, and str? No--for, and threw back his wife, and he had again, and left behind.                                                        
I turned to Mrs. Gisburn's the picture--and he had been the fact with a deprecating a failure, and str? No--for, and threw back his wife, and he had again, and left behind.                                                        
I turned to Mrs. Gisburn's the picture--and he had been the fact with a deprecating a failure, and str? No--for, and threw back his wife, and he had again, and left behind.                                                        
I turned to Mrs. Gisburn's the picture--and he had been the fact with a deprecating 