# GPT-2 Model


In [16]:

import torch
import torch.nn as nn
from torch.testing import assert_close
torch.manual_seed(42)
import tool, loaddata ,model_wrapper
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import tiktoken

### Config

In [17]:
# Notebook switches.
IS_SKIP_TEST = True   # passed to tool.skip_execution to skip the model-structure check
IS_EN = True          # True: load the English text file; False: load the Chinese data
IS_TRAIN = True       # passed to tool.train_execution on the loader/training cells

# Model and training hyper-parameters.
GPT_CONFIG = dict(
    num_epochs=10,
    batch_size=4,
    vocab_size=50257,   # vocabulary size
    context_len=256,    # context length
    emb_dim=768,        # embedding dimension
    n_heads=8,          # number of attention heads
    n_layers=12,        # number of layers
    drop_rate=0.1,      # dropout rate
    qkv_bias=False,     # query-key-value bias
)

# Name of the tiktoken encoding.
# NOTE(review): "cl100k_base" has a larger vocabulary than 50257, so vocab_size
# would presumably have to change with it — confirm before switching.
TOKEN_TYPE = "gpt2"
# TOKEN_TYPE = "cl100k_base"

# AdamW settings.
LR = 1e-3
WEIGHT_DECAY = 0.1

# Evaluation cadence (in optimizer steps) and the eval_iter value handed to the evaluation helper.
EVAL_FREQ = 5
EVAL_ITER = 5


In [18]:
! nvidia-smi

Wed Aug 13 18:20:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 960       WDDM  |   00000000:01:00.0  On |                  N/A |
| 30%   47C    P8             18W /  180W |    2796MiB /   4096MiB |     16%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Select the compute device (CUDA when available, otherwise CPU)

In [19]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

### Tensorboard Log

In [20]:
# Create the TensorBoard log writer; the run directory is named after the start time.
import time

logpath = "../log/"
starttime = time.strftime("%Y-%m-%d_%H-%M-%S")
print("Start experiment:", starttime)

# Only the first 16 characters of the timestamp (date, hour, minute) name the run.
# NOTE(review): per the SummaryWriter docs, `comment` has no effect once `log_dir`
# is given; it is kept here unchanged.
log_writer = SummaryWriter(
    log_dir=logpath + starttime[:16],
    comment=starttime[:16],
    flush_secs=60,
)


Start experiment: 2025-08-13_18-20-30


## Define GPT-2 Model

In [21]:
class GPTModel(nn.Module):
    """GPT-style model: token and positional embeddings, a stack of transformer
    blocks, a final normalization layer and a linear output projection.

    Args:
        cfg: Configuration mapping. This class reads 'vocab_size', 'context_len',
            'emb_dim', 'drop_rate' and 'n_layers'; the whole mapping is also
            passed to every ``model_wrapper.TransformerBlock``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg['vocab_size'], cfg['emb_dim'])
        # One learned position vector per position, up to cfg['context_len'].
        self.pos_emb = nn.Embedding(cfg['context_len'], cfg['emb_dim'])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[model_wrapper.TransformerBlock(cfg) for _ in range(cfg["n_layers"])]
        )
        self.final_norm = model_wrapper.LayerNorm(cfg["emb_dim"])
        # Separate output projection; its weight is not shared with tok_emb.
        self.out_head = nn.Linear(
            cfg['emb_dim'], cfg['vocab_size'], bias=False
        )

    def forward(self, in_idx):
        """Map a 2-D tensor of token ids to next-token logits.

        Args:
            in_idx: Integer tensor with two dimensions (batch, sequence).

        Returns:
            The output of ``out_head`` applied to the normalized block output.

        Raises:
            IndexError: If the sequence is longer than the positional
                embedding table (the configured context length).
        """
        _, seq_len = in_idx.shape
        max_len = self.pos_emb.num_embeddings
        if seq_len > max_len:
            # Fail early with a readable message; an out-of-range embedding lookup
            # is cryptic on CPU and surfaces as a device-side assert on CUDA.
            raise IndexError(
                f"Input sequence length {seq_len} exceeds the model "
                f"context length {max_len}"
            )
        tok_embeds = self.tok_emb(in_idx)
        # Positions 0..seq_len-1, created on the same device as the input ids.
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits
        

### Inspect the model structure and parameter count

In [22]:
# GPT-2 small reference: 12 transformer decoder layers, hidden size 768,
# 12 attention heads, roughly 120M parameters in total.
# NOTE(review): the CONFIG below uses 8 heads, not 12 — confirm this is intended.
@tool.skip_execution(skip=IS_SKIP_TEST)
def test_GPT2_model():
    """Build a GPTModel from a local config, print its parameter counts and return it."""
    CONFIG = dict(
        num_epochs=1,
        batch_size=1,
        vocab_size=50257,
        context_len=512,
        emb_dim=768,
        n_heads=8,
        n_layers=12,
        drop_rate=0.1,
        qkv_bias=False,
    )
    model = GPTModel(CONFIG)
    model.to(device)

    # Author's note: parameter reduction from attention_new = (304,556,544 - 163,008,000).
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()

    # Author's recorded value: 163,008,000 (not re-verified for this config).
    print(f"Total number of parameters: {total_params:,}")

    # With weight tying the embedding and output matrices would be one shared tensor,
    # so only one of them would count. GPTModel keeps a separate out_head, so this
    # only reports the tied figure by subtracting out_head's parameters.
    out_head_params = sum(param.numel() for param in model.out_head.parameters())
    total_params_gpt2 = total_params - out_head_params

    # Author's recorded value: 124,017,408 (not re-verified for this config).
    print(f"Number of trainable parameters "
        f"considering weight tying: {total_params_gpt2:,}")
    return model


test_GPT2_model()

函数 test_GPT2_model 已跳过执行


## Load Data

### Select the language of the text to load

In [23]:
# Pick the corpus according to the language switch, then display len(train_data).
if not IS_EN:
    train_data, valid_data = loaddata.load_data_cn(True, 0.8)
else:
    file_path = "../datasets/the-verdict.txt"
    train_data, valid_data = loaddata.load_data_en(file_path)

len(train_data)

total char: 20479
train char: 16383
valid char: 4096 



1

## GPTDataLoader

In [24]:
@tool.train_execution(IS_TRAIN)
def Dataloadering():
    """Build the training and validation loaders from the module-level datasets.

    Reads ``train_data``, ``valid_data``, ``TOKEN_TYPE`` and ``GPT_CONFIG`` and
    prints a short summary of each loader.

    Returns:
        A ``(train_loader, valid_loader)`` tuple.
    """

    def _show_first_batch(loader):
        # Only peek at small loaders: fetching a batch from a large dataset is slow.
        if len(loader) < 10:
            inputs, targets = next(iter(loader))
            print(inputs.shape, targets.shape)

    context_len = GPT_CONFIG["context_len"]
    per_batch = GPT_CONFIG['batch_size']

    train_loader = model_wrapper.GPTDataloader(
        train_data,
        TOKEN_TYPE,
        batch_size=per_batch,
        max_len=context_len,
        stride=context_len // 2,  # moderate overlap: stride is half of max_len
        drop_last=True,
        shuffle=True,  # shuffle while training to help generalization
        num_works=0,
    )

    print(f'共{len(train_loader)}个批次，'
        f'每批{train_loader.batch_size}个样本，'
        f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

    print("Train loader:")
    _show_first_batch(train_loader)

    valid_loader = model_wrapper.GPTDataloader(
        valid_data,
        TOKEN_TYPE,
        batch_size=per_batch,
        max_len=context_len,
        stride=context_len,
        drop_last=False,  # validation should evaluate every sample, so keep the last batch
        shuffle=False,  # fixed order keeps validation results reproducible and comparable
        num_works=0,
    )

    print("Validation loader:")
    _show_first_batch(valid_loader)

    return train_loader, valid_loader

## Training model
### Training process function

In [25]:

def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer):
    """Run the training loop with periodic evaluation, logging and sample generation.

    Every ``eval_freq`` optimizer steps (including step 0) the train/validation
    losses are computed with ``model_wrapper.evaluate_model``, written to the
    module-level ``log_writer``, printed, and a short sample is generated from
    ``start_context``.

    Args:
        model: Model to train.
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        valid_loader: Loader passed to the evaluation helper.
        optimizer: Optimizer stepping the model parameters.
        device: Device passed to the loss, evaluation and generation helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``model_wrapper.evaluate_model``.
        start_context: Prompt used for the generated sample.
        tokenizer: Tokenizer passed to the generation helper.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: one entry per evaluation.
    """
    train_losses, val_losses = [], []
    track_tokens_seen = []
    # global_step starts at -1 so that the first completed step is step 0.
    tokens_seen, global_step = 0, -1
    for epoch in tqdm(range(num_epochs), desc="training"):
        model.train()  # training mode (enables dropout)
        for input_batch, target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            loss = model_wrapper.calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = model_wrapper.evaluate_model(
                model, train_loader, valid_loader, device, eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)
            # The print below tolerates missing losses, so logging must as well:
            # add_scalar cannot take None.
            if train_loss is not None:
                log_writer.add_scalar('Training/Train_Loss', train_loss, global_step)
            if val_loss is not None:
                log_writer.add_scalar('Training/Val_Loss', val_loss, global_step)
            print(f'EP: {epoch+1} STEP: {global_step} '
                f'{f"T_LOSS: {train_loss:.3f}" if train_loss is not None else "T_LOSS: None "} '
                f'{f"V_LOSS: {val_loss:.3f}" if val_loss is not None else "V_LOSS: None"}'
                )
            model_wrapper.generate_and_print(model, tokenizer, device, start_context, 20)
            # The evaluation/generation helpers may switch the model to eval mode;
            # restore training mode so the remaining steps still use dropout.
            model.train()
    return train_losses, val_losses, track_tokens_seen

In [26]:
# ! tensorboard --logdir=../log  --port=6006 --host=0.0.0.0  
# ssh -L 8888:localhost:6006 zzz@172.23.207.112    

In [27]:
# Tokenizer, the prompt used for sample generation, and the checkpoint location.
tokenizer = tiktoken.get_encoding(TOKEN_TYPE)
start_context = "I turned to Mrs"
modelpath = '../model/model_and_optimizer.pth'


@tool.train_execution(IS_TRAIN)
def training(savemodel=False):
    """Train a fresh GPTModel and optionally save it together with its optimizer.

    Args:
        savemodel: When true, call ``model_wrapper.savemodel`` with ``modelpath``
            after training.

    Returns:
        Whatever ``train_model_process`` returns (train losses, validation
        losses, tokens seen).
    """
    gpt = GPTModel(GPT_CONFIG)
    gpt.to(device)
    adamw = torch.optim.AdamW(
        gpt.parameters(), lr=LR, weight_decay=WEIGHT_DECAY
    )
    loaders = Dataloadering()
    train_loader, valid_loader = loaders

    result = train_model_process(
        gpt,
        train_loader,
        valid_loader,
        adamw,
        device,
        num_epochs=GPT_CONFIG['num_epochs'],
        eval_freq=EVAL_FREQ,
        eval_iter=EVAL_ITER,
        start_context=start_context,
        tokenizer=tokenizer,
    )

    if not savemodel:
        return result

    model_wrapper.savemodel(modelpath, gpt, adamw)
    return result


training(True)

Process text: 100%|██████████| 1/1 [00:00<00:00, 142.93it/s]


Total samples: 31
共7个批次，每批4个样本，每个样本是长度为 256 的 token 序列
Train loader:
torch.Size([4, 256]) torch.Size([4, 256])


Process text: 100%|██████████| 1/1 [00:00<00:00, 1000.07it/s]


Total samples: 4
Validation loader:
torch.Size([4, 256]) torch.Size([4, 256])


training:   0%|          | 0/10 [00:00<?, ?it/s]

EP: 1 STEP: 0 T_LOSS: 9.528 V_LOSS: 9.845




I turned to Mrs,,,,,,,,,,,,,,,,,,,,




EP: 1 STEP: 5 T_LOSS: 6.593 V_LOSS: 7.131




I turned to Mrs                    


training:  10%|█         | 1/10 [00:58<08:48, 58.78s/it]

EP: 2 STEP: 10 T_LOSS: 5.860 V_LOSS: 7.168




I turned to Mrs.                   


training:  20%|██        | 2/10 [01:52<07:28, 56.01s/it]

EP: 3 STEP: 15 T_LOSS: 5.914 V_LOSS: 7.279




I turned to Mrs the, the, the, the, the, the, the, the, the, the the




EP: 3 STEP: 20 T_LOSS: 5.782 V_LOSS: 7.477


training:  30%|███       | 3/10 [02:53<06:46, 58.08s/it]

I turned to Mrs.                   




EP: 4 STEP: 25 T_LOSS: 5.665 V_LOSS: 7.482




I turned to Mrs.                   


training:  40%|████      | 4/10 [03:48<05:40, 56.81s/it]

EP: 5 STEP: 30 T_LOSS: 5.444 V_LOSS: 7.475




I turned to Mrs.                   


training:  50%|█████     | 5/10 [04:41<04:36, 55.39s/it]

EP: 6 STEP: 35 T_LOSS: 5.534 V_LOSS: 7.535




I turned to Mrs.                   




EP: 6 STEP: 40 T_LOSS: 5.244 V_LOSS: 7.311




I turned to Mrs.  ".               


training:  60%|██████    | 6/10 [05:39<03:45, 56.34s/it]

EP: 7 STEP: 45 T_LOSS: 5.233 V_LOSS: 7.372




I turned to Mrs. " to the picture a of the picture a- the picture of the picture to the picture


training:  70%|███████   | 7/10 [06:34<02:47, 55.93s/it]

EP: 8 STEP: 50 T_LOSS: 4.812 V_LOSS: 7.358




I turned to Mrs.  ".               




EP: 8 STEP: 55 T_LOSS: 4.201 V_LOSS: 7.031


training:  80%|████████  | 8/10 [07:34<01:54, 57.30s/it]

I turned to Mrs.  ".               




EP: 9 STEP: 60 T_LOSS: 3.508 V_LOSS: 6.948




I turned to Mrs.                   


training:  90%|█████████ | 9/10 [08:29<00:56, 56.44s/it]

EP: 10 STEP: 65 T_LOSS: 3.000 V_LOSS: 6.942




I turned to Mrs.                   


training: 100%|██████████| 10/10 [09:23<00:00, 56.36s/it]


([9.527825546264648,
  6.592583560943604,
  5.8596649169921875,
  5.9144796371459964,
  5.782247638702392,
  5.6645583152771,
  5.443800163269043,
  5.533572578430176,
  5.244353771209717,
  5.233304786682129,
  4.812401390075683,
  4.201342678070068,
  3.5084914684295656,
  3.0002678871154784],
 [9.844965934753418,
  7.13070821762085,
  7.168105602264404,
  7.279229640960693,
  7.4770307540893555,
  7.48203182220459,
  7.475358963012695,
  7.535090923309326,
  7.310979843139648,
  7.371764659881592,
  7.358284950256348,
  7.0310258865356445,
  6.948094844818115,
  6.941988468170166],
 [1024,
  6144,
  11264,
  16384,
  21504,
  26624,
  31744,
  36864,
  41984,
  47104,
  52224,
  57344,
  62464,
  67584])

In [28]:
# Close the TensorBoard writer created in the logging cell now that training is done.
log_writer.close()

## Load the parameters of the model

### Load model

In [29]:
# Restore the model and optimizer from the checkpoint written by the training cell.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (weights_only=True may be usable if the file holds nothing but
# state dicts — confirm against model_wrapper.savemodel).
checkpoint = torch.load(modelpath, map_location=device)
model = GPTModel(GPT_CONFIG)
model.load_state_dict(checkpoint["model_state_dict"])
# Move the model before building the optimizer: the restored optimizer state is
# placed on the device the parameters are on at load time, so loading it while the
# parameters are still on the CPU would leave it on the wrong device for resumed training.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# Inference mode (disables dropout); also the cell's displayed value.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttendtion_new(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttendtion_new

### Test the generation capability of the saved model

In [30]:
# Generate five samples from the restored model, all starting from the same prompt.
# NOTE(review): the trailing positional arguments (100, 0.8, 50, None) presumably are
# max new tokens, temperature, top-k and an end-of-sequence id — confirm against
# model_wrapper.generate_and_print.
for _ in range(5):
    model_wrapper.generate_and_print(model,tokenizer,device,start_context,100,0.8,50,None)

I turned to Mrs. And his painting.      "How," she said. To the complex after to me out-rooms, and in the demandham found a hood. "Yes, he was dead out of loing he had not," she said.  "--had he liked that it. "I said. I turned"What.       He stood tell me--c. "Ah had not the picture dep in a
I turned to Mrs. Stroud! She sent for you say. "Ah, I couldn. "I could. Gisburn's that, passing Jack's only irre placed"id had begun to me, my picture.  "By of the ax't say on my companion.  "You the only in a trace of saying.    "Oh, the only between the fact, one might the Riv my.  " ( of the dining't say.  
I turned to Mrs. Mrs. I must he never, as a good he had forgotten againwas it was to put the last, my host to see it was _rose looking-c the man of the first-roomsisburn, poor. I don't say it to see it. "Why it. "Ah, who get to display of the dining. I had myself to see it.   "I remember-century old the fact. It was his pictures.  
I turned to Mrs.  "Well, and I had always their stood I felt