# GPT-2 Model


In [1]:
import sys
import os


# Work out the directory this code runs from. Evaluating the bare name
# `get_ipython` raises NameError when it is not defined, which selects the
# `__file__`-based fallback; otherwise the current working directory is used.
try:
    get_ipython
    current_dir = os.getcwd()
except NameError:
    current_dir = os.path.dirname(os.path.abspath(__file__))
# Temporarily extend sys.path with the parent directory
# (presumably where the project modules imported below live — TODO confirm).
parent_dir = os.path.abspath(os.path.join(current_dir, '..'))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
import torch
torch.manual_seed(42)
import tool, loaddata ,model_wrapper,models
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


load local data:
dict_keys(['train'])
                                          completion            source
0  昭通机场（ZPZT）是位于中国云南昭通的民用机场，始建于1935年，1960年3月开通往返航...  wikipedia.zh2307
1  我的英雄学院：英雄新世纪\n《我的英雄学院剧场版：英雄新世纪》（仆のヒーローアカデミア TH...  wikipedia.zh2307
2  黄大仙文化公园（Wong Tai Sin Culture Park）是香港一个公园，位于九龙...  wikipedia.zh2307
3  佐洛奇夫（Zolochiv），或按俄语译为佐洛乔夫（Золочев），是乌克兰西部利沃夫州佐...  wikipedia.zh2307
4  陈准，字道基，颍川郡许昌（今河南许昌）人。西晋官员。官至太尉。出身颍川陈氏，青州刺史陈佐之子...  wikipedia.zh2307
254547
函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_tokenizer_padding 已跳过执行
函数 test_loss 已跳过执行
函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_tokenizer_padding 已跳过执行
函数 test_loss 已跳过执行
Total number of parameters: 163,037,184
Number of trainable parameters considering weight tying: 124,439,808


### Config

In [2]:

IS_SKIP_TEST =True  # toggle for the test code

IS_EN =True  # switch between the English and Chinese datasets

IS_TRAIN=True  # pre-training switch (passed to tool.train_execution)

IS_MP =True  # whether to use mixed precision

# Hyper-parameters: the whole dict is passed to models.GPTModel_MOE_KVCache;
# "num_epochs", "batch_size" and "context_len" are also read by the training
# call and the data loaders in this notebook.
GPT_CONFIG = {
    "num_epochs":4,
    "batch_size":4,
    "vocab_size": 50257,     
    "context_len": 256,  
    "emb_dim": 768,          
    "n_heads": 8,          
    "n_layers": 12,         
    "drop_rate": 0.1,      
    "initializer_range":0.02,  
    "qkv_bias": False , 
    "num_experts":4,
    "expert_top_k":2,    
}

# Name of the tiktoken encoding used for the tokenizer and the data loaders.
TOKEN_TYPE="gpt2"
# TOKEN_TYPE="cl100k_base"

# AdamW optimizer settings.
LR= 1e-3
WEIGHT_DECAY =0.1

# Evaluation frequency: evaluate whenever global_step is a multiple of EVAL_FREQ.
EVAL_FREQ = 5
# Passed to model_wrapper.evaluate_model as eval_iter
# (presumably the number of batches evaluated — TODO confirm).
EVAL_ITER = 5


In [3]:
# Shell escape: print the GPU / driver / memory status reported by nvidia-smi.
! nvidia-smi

Thu Aug 28 13:07:42 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 960       WDDM  |   00000000:01:00.0  On |                  N/A |
| 30%   43C    P8             18W /  180W |     593MiB /   4096MiB |     18%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Select the compute device (CUDA if available, otherwise CPU)

In [4]:
# Use the CUDA GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

### Tensorboard Log

In [5]:
# Create the TensorBoard log writer for this run.
import time
starttime = time.strftime("%Y-%m-%d_%H-%M-%S")
print("Start experiment:", starttime)
logpath ="../log/"
# The run directory is named after the experiment start time, truncated to its
# first 16 characters (YYYY-MM-DD_HH-MM, i.e. minute resolution).
# The former `comment=` argument was removed: SummaryWriter only uses `comment`
# as a suffix for its default directory and ignores it when `log_dir` is given.
log_writer = SummaryWriter(log_dir=logpath+starttime[:16],flush_secs=60)


Start experiment: 2025-08-28_13-07-43


## Load Data

### Select the language of the text to load

In [6]:
# Pick the corpus: the Chinese loader when IS_EN is off, otherwise the
# English text file.
if not IS_EN:
    train_data, valid_data = loaddata.load_data_cn(True, 0.8)
else:
    file_path = "../datasets/the-verdict.txt"
    train_data, valid_data = loaddata.load_data_en(file_path)

len(train_data)

total char: 20479
train char: 16383
valid char: 4096 



1

## GPTDataLoader

In [7]:
def _peek_first_batch(loader):
    """Print the input/target tensor shapes of the first batch of a small loader."""
    # Only small loaders are sampled: pulling a batch from a large dataset just
    # to look at it takes a long time. Empty loaders are skipped as well,
    # because next() on an empty iterator raises StopIteration.
    if 0 < len(loader) < 10:
        x, y = next(iter(loader))
        print(x.shape, y.shape)


@tool.train_execution(IS_TRAIN)
def Dataloadering():
    """Build the training and validation loaders from the notebook-level data.

    Reads the globals train_data, valid_data, TOKEN_TYPE and GPT_CONFIG and
    prints a short summary of each loader.

    Returns:
        The tuple (train_loader, valid_loader) from the undecorated function.
    """
    train_loader = model_wrapper.GPTDataloader(
        train_data,
        TOKEN_TYPE,
        batch_size = GPT_CONFIG['batch_size'],
        max_len = GPT_CONFIG["context_len"],
        stride = GPT_CONFIG["context_len"] // 2,  # moderate overlap (stride = max_len // 2)
        drop_last=True,
        shuffle= True,  # shuffle during training to improve generalization
        num_works=0
        )

    print(F'共{len(train_loader)}个批次，'
        f'每批{train_loader.batch_size}个样本，'
        f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

    print("Train loader:")
    _peek_first_batch(train_loader)

    valid_loader = model_wrapper.GPTDataloader(
        valid_data,
        TOKEN_TYPE,
        batch_size = GPT_CONFIG['batch_size'],
        max_len = GPT_CONFIG["context_len"],
        stride = GPT_CONFIG["context_len"] ,  # no overlap between validation windows
        drop_last=False,  # validation must cover every sample, so keep the last partial batch
        shuffle= False,  # fixed order keeps validation results reproducible and comparable
        num_works=0
        )

    print("Validation loader:")
    _peek_first_batch(valid_loader)

    return train_loader,valid_loader

## Mixed Precision
混合精度加速计算过程
autocast：

* 低精度计算：大部分模型操作（如线性层、卷积层、矩阵乘法、激活函数等）会自动使用 FP16（半精度浮点），利用其计算速度快、显存占用低的特点加速训练。
* 高精度保障：
    * 模型参数（权重、偏置）始终以 FP32（单精度浮点） 存储，避免精度损失累积。
    * 对精度敏感的操作（如 BatchNorm 层的均值 / 方差计算、softmax 归一化、损失函数计算等）会强制使用 FP32，防止数值不稳定（如下溢、精度丢失）

In [8]:
from torch.amp import GradScaler, autocast

# Mixed precision is only used on CUDA; fall back to full precision elsewhere.
if device.type !="cuda":
    IS_MP = False

# Gradient scaler (prevents FP16 gradient underflow). It is bound to the active
# device and disabled when mixed precision is off, so a CPU run gets an inert
# scaler instead of one configured for the default "cuda" device.
scaler = GradScaler(device.type, enabled=IS_MP)

## Training model
### Training process function

In [9]:

def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer,checkpoint_path=None):
    """Train `model`, with periodic evaluation, logging, sampling and checkpoints.

    Besides its arguments, this function reads the notebook-level globals
    `IS_MP` (mixed-precision switch), `scaler` (GradScaler) and `log_writer`
    (TensorBoard SummaryWriter).

    Args:
        model: Model to optimise; switched to training mode every epoch.
        train_loader: Iterable yielding (input_batch, target_batch) pairs.
        valid_loader: Loader forwarded to model_wrapper.evaluate_model.
        optimizer: Optimizer stepped once per batch.
        device: Device forwarded to the loss/evaluation helpers; its `.type`
            selects the autocast backend.
        num_epochs: Epoch index at which training stops (exclusive).
        eval_freq: Evaluate whenever global_step is a multiple of this value.
        eval_iter: Forwarded to model_wrapper.evaluate_model.
        start_context: Prompt forwarded to model_wrapper.generate_and_print.
        tokenizer: Tokenizer forwarded to model_wrapper.generate_and_print.
        checkpoint_path: Optional path; when it exists, training state is
            restored from it via model_wrapper.load_checkpoint.

    Returns:
        (train_losses, val_losses, track_tokens_seen): the values recorded at
        each evaluation step.
    """
    # Initialise the training state (restored from the checkpoint when one exists).
    if checkpoint_path and os.path.exists(checkpoint_path):
        print(f"Resuming from checkpoint: {checkpoint_path}")
        state = model_wrapper.load_checkpoint(model, optimizer, checkpoint_path)
        start_epoch = state['epoch']
        global_step = state['global_step']
        train_losses = state['train_losses']
        val_losses = state['val_losses']
        track_tokens_seen = state['track_tokens_seen']
        tokens_seen = track_tokens_seen[-1] if track_tokens_seen else 0
    else:
        # Train from scratch; global_step starts at -1 so the first batch is step 0.
        start_epoch = 0
        global_step = -1
        train_losses, val_losses = [], []
        track_tokens_seen = []
        tokens_seen = 0

    for epoch in tqdm(range(start_epoch, num_epochs), desc="training"):
        model.train()  # training mode: enables dropout and batch-norm updates
        # Checkpoint interval: every 100 steps, or once per len(train_loader)
        # steps when the loader is shorter than that.
        freq = min(100, len(train_loader))
        print (freq)
        for input_batch,target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            if IS_MP:
                # Forward pass under FP16 autocast; the scaler scales the loss
                # before backward and drives the optimizer step.
                with autocast(device.type,dtype=torch.float16):
                    loss = model_wrapper.calc_loss_batch(input_batch,target_batch,model,device)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                loss = model_wrapper.calc_loss_batch(input_batch,target_batch,model,device)
                loss.backward()
                optimizer.step()

            tokens_seen += input_batch.numel()
            global_step +=1

            if global_step % eval_freq == 0:
                train_loss, val_loss = model_wrapper.evaluate_model(model,train_loader,valid_loader,device,eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                # The summary line below already allows for a None loss, so
                # guard the TensorBoard calls the same way: add_scalar cannot
                # log None.
                if train_loss is not None:
                    log_writer.add_scalar('Training/Train_Loss', train_loss, global_step)
                if val_loss is not None:
                    log_writer.add_scalar('Training/Val_Loss', val_loss, global_step)
                print(f'EP: {epoch+1} STEP: {global_step} '
                    f'{f"T_LOSS: {train_loss:.3f}" if train_loss is not None else "T_LOSS: None "} '
                    f'{f"V_LOSS: {val_loss:.3f}" if val_loss is not None else "V_LOSS: None"}'
                    )
                model_wrapper.generate_and_print(model,tokenizer,device,start_context,20)

            # Save a checkpoint periodically (never at step 0).
            if global_step % freq == 0 and global_step != 0:
                model_wrapper.save_checkpoint(
                    model, optimizer, epoch, global_step,
                    train_losses, val_losses, track_tokens_seen,
                    f"../model/checkpoints/epoch_{epoch}_step_{global_step}.pt"
                )
    return train_losses,val_losses,track_tokens_seen

In [10]:
# TensorBoard launch commands
# ! tensorboard --logdir=../log  --port=6006 --host=0.0.0.0  
# ssh -L 8888:localhost:6006 zzz@172.23.207.112    

In [11]:
modelpath ='../model/gpt2.pt'  # where the final model is written when savemodel is set
start_context = "I turned to Mrs"  # prompt used for the sample generations
tokenizer = tiktoken.get_encoding(TOKEN_TYPE)



@tool.train_execution(IS_TRAIN)
def training(savemodel=False, checkpoint_path='../model/checkpoints/0'):
    """Build the model, optimizer and loaders, then run the training loop.

    Args:
        savemodel: When true, save the model through model_wrapper.savemodel
            to `modelpath` after training finishes.
        checkpoint_path: Checkpoint to resume from; forwarded to
            train_model_process, which trains from scratch when the path does
            not exist. Defaults to the previously hard-coded location.

    Returns:
        The value returned by train_model_process.
    """
    model = models.GPTModel_MOE_KVCache(GPT_CONFIG)
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(),lr=LR,weight_decay=WEIGHT_DECAY)
    train_loader,valid_loader = Dataloadering()
    try:
        result=train_model_process(model,train_loader,valid_loader,
                                   optimizer,device,num_epochs=GPT_CONFIG['num_epochs'],
                                   eval_freq=EVAL_FREQ,eval_iter=EVAL_ITER,
                                   start_context=start_context,
                                   tokenizer=tokenizer,
                                   checkpoint_path=checkpoint_path
                                   )
    finally:
        # Push buffered TensorBoard events to disk even when training aborts
        # (for example on a CUDA runtime error), so the partial run is kept.
        log_writer.flush()
    if savemodel:
        model_wrapper.savemodel(modelpath,model,optimizer,GPT_CONFIG)

    return result

training(True)



Process text: 100%|██████████| 1/1 [00:00<00:00, 130.91it/s]


Total samples: 31
共7个批次，每批4个样本，每个样本是长度为 256 的 token 序列
Train loader:
torch.Size([4, 256]) torch.Size([4, 256])


Process text: 100%|██████████| 1/1 [00:00<00:00, 570.58it/s]


Total samples: 4
Validation loader:
torch.Size([4, 256]) torch.Size([4, 256])


training:   0%|          | 0/4 [00:00<?, ?it/s]

7




EP: 1 STEP: 0 T_LOSS: 10.007 V_LOSS: 10.080




I turned to Mrskids conditional, Honest theValippliter Somali gearingomet��極◼, the GTAbs visitedCos Converted


training:   0%|          | 0/4 [01:01<?, ?it/s]


RuntimeError: CUDA error: the launch timed out and was terminated
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Close the TensorBoard writer created for this experiment.
log_writer.close()

## Load the parameters of the model

Load the saved model parameters to inspect and verify the saved model.

In [None]:
# Restore the model and optimizer from the file written to `modelpath`.
checkpoint = torch.load(modelpath, map_location=device)
config = checkpoint["config"]
print(config==GPT_CONFIG)  # True when the saved config equals the current GPT_CONFIG
model = models.GPTModel_MOE_KVCache(config)
# List every saved tensor with its shape.
for name, param in checkpoint["model_state_dict"].items():
    print(f"{name}: {param.shape}")
model.load_state_dict(checkpoint["model_state_dict"])
# Move the parameters to the target device BEFORE building the optimizer and
# loading its state: Optimizer.load_state_dict places the restored state on the
# device of the parameters it finds, so loading while the model is still on the
# CPU would leave the optimizer state behind once the model moves to the GPU.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model  # keep the module summary as the cell output, as model.to(device) did before


### Test the generation capability of the saved model

In [None]:
import importlib
# Re-execute the model_wrapper module so that edits made to it on disk are
# picked up without restarting the kernel.
importlib.reload(model_wrapper)

# Print five generations from the same prompt with the restored model.
# NOTE(review): the trailing positional arguments (100, 0.8, 50, 1, None, True)
# are defined by model_wrapper.generate_and_print, which is not visible here;
# passing them by keyword would make this call self-explanatory — TODO confirm names.
for _ in range(5):
    model_wrapper.generate_and_print(model,tokenizer,device,start_context,100,0.8,50,1,None,True)