# GPT-2 Model


In [1]:
import sys
import os

# Work out the base directory depending on how this code is being run.
try:
    # The name `get_ipython` only exists inside IPython/Jupyter;
    # referencing it anywhere else raises NameError.
    get_ipython
except NameError:
    # Plain Python: anchor on the directory that contains this file.
    current_dir = os.path.dirname(os.path.abspath(__file__))
else:
    # Jupyter: anchor on the kernel's current working directory.
    current_dir = os.getcwd()

# Put the parent directory on the import path (adjust to your directory layout).
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
import torch
import torch.nn as nn
from torch.testing import assert_close
torch.manual_seed(42)
import tool, loaddata ,model_wrapper,models
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


dict_keys(['train'])
                                          completion            source
0  昭通机场（ZPZT）是位于中国云南昭通的民用机场，始建于1935年，1960年3月开通往返航...  wikipedia.zh2307
1  我的英雄学院：英雄新世纪\n《我的英雄学院剧场版：英雄新世纪》（仆のヒーローアカデミア TH...  wikipedia.zh2307
2  黄大仙文化公园（Wong Tai Sin Culture Park）是香港一个公园，位于九龙...  wikipedia.zh2307
3  佐洛奇夫（Zolochiv），或按俄语译为佐洛乔夫（Золочев），是乌克兰西部利沃夫州佐...  wikipedia.zh2307
4  陈准，字道基，颍川郡许昌（今河南许昌）人。西晋官员。官至太尉。出身颍川陈氏，青州刺史陈佐之子...  wikipedia.zh2307
254547
load data
函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_loss 已跳过执行
函数 test_dummyModel 已跳过执行
函数 test_layer_norm 已跳过执行
函数 test_gelu 已跳过执行
函数 test_tokenizer 已跳过执行
函数 test_loss 已跳过执行
函数 test_GPT2_model 已跳过执行


### Config

In [None]:
# --- Run switches ---
IS_SKIP_TEST = True   # NOTE(review): not referenced in the visible cells; presumably read by the test helpers — confirm
IS_EN = True          # True: load the English text file; False: load the Chinese dataset
IS_TRAIN = False      # passed to tool.train_execution on the data-loading and training functions

# --- Model / training hyper-parameters ---
GPT_CONFIG = dict(
    num_epochs=2,
    batch_size=4,
    vocab_size=50257,    # vocabulary size
    context_len=256,     # context length
    emb_dim=768,         # embedding dimension
    n_heads=8,           # number of attention heads
    n_layers=12,         # number of layers
    drop_rate=0.1,       # dropout rate
    qkv_bias=False,      # query-key-value bias
)

# --- Tokenizer and optimizer settings ---
TOKEN_TYPE = "gpt2"
# TOKEN_TYPE = "cl100k_base"
LR = 1e-3
WEIGHT_DECAY = 0.1

# --- Evaluation cadence (passed to the training loop as eval_freq / eval_iter) ---
EVAL_FREQ = 5
EVAL_ITER = 5


In [3]:
# Shell escape: print the GPU / driver / memory status reported by nvidia-smi.
! nvidia-smi

Thu Aug 14 12:35:23 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce GTX 960       WDDM  |   00000000:01:00.0  On |                  N/A |
| 30%   49C    P8             18W /  180W |    1134MiB /   4096MiB |     18%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### Select the compute device (CUDA if available, otherwise CPU)

In [4]:
# Use the GPU when PyTorch reports CUDA as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

### Tensorboard Log

In [5]:
# Create the TensorBoard log writer for this run.
import time

starttime = time.strftime("%Y-%m-%d_%H-%M-%S")
print("Start experiment:", starttime)
logpath ="../log/"
# The run directory is named after the start time truncated to the minute
# (first 16 characters, i.e. YYYY-mm-dd_HH-MM), so runs started within the
# same minute share one directory.
run_name = starttime[:16]
# `comment` is intentionally not passed: SummaryWriter only uses it to build
# the default log_dir and ignores it whenever log_dir is given explicitly.
log_writer = SummaryWriter(log_dir=logpath + run_name, flush_secs=60)



Start experiment: 2025-08-14_12-35-23


## Load Data

### Select the language of the text to load

In [6]:
# Load the train/validation split for the language selected by IS_EN.
if not IS_EN:
    # Chinese dataset; the arguments are forwarded unchanged to the project loader.
    train_data, valid_data = loaddata.load_data_cn(True, 0.8)
else:
    # English text file.
    file_path = "../datasets/the-verdict.txt"
    train_data, valid_data = loaddata.load_data_en(file_path)

len(train_data)

total char: 20479
train char: 16383
valid char: 4096 



1

## GPTDataLoader

In [7]:
@tool.train_execution(IS_TRAIN)
def Dataloadering():
    """Build the training and validation data loaders.

    Reads the module-level ``train_data``, ``valid_data``, ``TOKEN_TYPE`` and
    ``GPT_CONFIG`` and prints a short summary of each loader.

    Returns:
        tuple: ``(train_loader, valid_loader)``.
    """
    batch_size = GPT_CONFIG['batch_size']
    context_len = GPT_CONFIG["context_len"]

    def show_first_batch_shapes(loader):
        # Only peek at a batch when the loader is small (fewer than 10 batches);
        # on large datasets this inspection takes a long time.
        if len(loader) < 10:
            inputs, targets = next(iter(loader))
            print(inputs.shape, targets.shape)

    # Training loader: windows overlap by half (stride = max_len // 2) and the
    # data is shuffled, which is meant to help generalization.
    train_loader = model_wrapper.GPTDataloader(
        train_data,
        TOKEN_TYPE,
        batch_size=batch_size,
        max_len=context_len,
        stride=context_len // 2,
        drop_last=True,
        shuffle=True,
        num_works=0,
    )

    print(F'共{len(train_loader)}个批次，'
        f'每批{train_loader.batch_size}个样本，'
        f'每个样本是长度为 {train_loader.dataset.max_len} 的 token 序列')

    print("Train loader:")
    show_first_batch_shapes(train_loader)

    # Validation loader: non-overlapping windows, nothing dropped so every
    # sample is evaluated, and no shuffling so results stay reproducible and
    # comparable between models.
    valid_loader = model_wrapper.GPTDataloader(
        valid_data,
        TOKEN_TYPE,
        batch_size=batch_size,
        max_len=context_len,
        stride=context_len,
        drop_last=False,
        shuffle=False,
        num_works=0,
    )

    print("Validation loader:")
    show_first_batch_shapes(valid_loader)

    return train_loader, valid_loader

## Training model
### Training process function

In [8]:

def train_model_process(model,train_loader,valid_loader,
                        optimizer,device,num_epochs,
                        eval_freq,eval_iter,
                        start_context,tokenizer):
    """Train ``model`` and periodically evaluate it, log the losses and print a sample.

    Every ``eval_freq`` optimizer steps (starting with step 0) the function
    calls ``model_wrapper.evaluate_model``, records the returned losses, writes
    them to the module-level ``log_writer`` and prints a generated continuation
    of ``start_context``.

    Args:
        model: Model to train; must provide ``train()``.
        train_loader: Iterable of ``(input_batch, target_batch)`` pairs.
        valid_loader: Loader forwarded to ``model_wrapper.evaluate_model``.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the ``model_wrapper`` helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever ``global_step % eval_freq == 0``.
        eval_iter: Forwarded to ``model_wrapper.evaluate_model``.
        start_context: Prompt forwarded to ``model_wrapper.generate_and_print``.
        tokenizer: Tokenizer forwarded to ``model_wrapper.generate_and_print``.

    Returns:
        tuple: ``(train_losses, val_losses, track_tokens_seen)``, three lists
        with one entry per evaluation point. A loss entry is ``None`` when the
        evaluation helper returned ``None`` for it.
    """
    train_losses,val_losses= [],[]
    track_tokens_seen =[]
    # global_step starts at -1 so that the first optimizer step is step 0.
    tokens_seen,global_step = 0, -1
    for epoch in  tqdm(range(num_epochs), desc="training"):
        model.train()  # training mode (enables dropout / batch-norm updates)
        for input_batch,target_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            optimizer.zero_grad()
            loss = model_wrapper.calc_loss_batch(input_batch,target_batch,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += input_batch.numel()
            global_step +=1

            if global_step % eval_freq != 0:
                continue

            train_loss, val_loss = model_wrapper.evaluate_model(model,train_loader,valid_loader,device,eval_iter)
            train_losses.append(train_loss)
            val_losses.append(val_loss)
            track_tokens_seen.append(tokens_seen)

            # The progress line below already tolerates a missing loss; apply the
            # same guard here so a None value is never handed to the log writer.
            if train_loss is not None:
                log_writer.add_scalar('Training/Train_Loss', train_loss, global_step)
            if val_loss is not None:
                log_writer.add_scalar('Training/Val_Loss', val_loss, global_step)

            train_text = f"T_LOSS: {train_loss:.3f}" if train_loss is not None else "T_LOSS: None "
            val_text = f"V_LOSS: {val_loss:.3f}" if val_loss is not None else "V_LOSS: None"
            print(f'EP: {epoch+1} STEP: {global_step} {train_text} {val_text}')

            model_wrapper.generate_and_print(model,tokenizer,device,start_context,20)
    return train_losses,val_losses,track_tokens_seen

In [9]:
# ! tensorboard --logdir=../log  --port=6006 --host=0.0.0.0  
# ssh -L 8888:localhost:6006 zzz@172.23.207.112    

In [10]:
modelpath = '../model/gpt2.pt'          # where the checkpoint is written
start_context = "I turned to Mrs"       # prompt used for the sample generations
tokenizer = tiktoken.get_encoding(TOKEN_TYPE)


@tool.train_execution(IS_TRAIN)
def training(savemodel=False):
    """Build a fresh model, train it, and optionally save a checkpoint.

    Uses the module-level configuration (``GPT_CONFIG``, ``LR``,
    ``WEIGHT_DECAY``, ``EVAL_FREQ``, ``EVAL_ITER``), ``device``,
    ``start_context`` and ``tokenizer``.

    Args:
        savemodel: When true, call ``model_wrapper.savemodel`` with
            ``modelpath``, the model, the optimizer and ``GPT_CONFIG`` after
            training.

    Returns:
        The value returned by ``train_model_process``.
    """
    model = models.GPTModel(GPT_CONFIG)
    model.to(device)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY
    )
    train_loader, valid_loader = Dataloadering()

    loop_settings = dict(
        num_epochs=GPT_CONFIG['num_epochs'],
        eval_freq=EVAL_FREQ,
        eval_iter=EVAL_ITER,
        start_context=start_context,
        tokenizer=tokenizer,
    )
    history = train_model_process(
        model, train_loader, valid_loader, optimizer, device, **loop_settings
    )

    if savemodel:
        model_wrapper.savemodel(modelpath, model, optimizer, GPT_CONFIG)

    return history

training(True)



Process text: 100%|██████████| 1/1 [00:00<00:00, 111.15it/s]


Total samples: 31
共7个批次，每批4个样本，每个样本是长度为 256 的 token 序列
Train loader:
torch.Size([4, 256]) torch.Size([4, 256])


Process text: 100%|██████████| 1/1 [00:00<00:00, 500.22it/s]


Total samples: 4
Validation loader:
torch.Size([4, 256]) torch.Size([4, 256])


training:   0%|          | 0/2 [00:00<?, ?it/s]

EP: 1 STEP: 0 T_LOSS: 9.528 V_LOSS: 9.845




I turned to Mrs,,,,,,,,,,,,,,,,,,,,




EP: 1 STEP: 5 T_LOSS: 6.593 V_LOSS: 7.131




I turned to Mrs                    


training:  50%|█████     | 1/2 [00:33<00:33, 33.12s/it]

EP: 2 STEP: 10 T_LOSS: 5.860 V_LOSS: 7.168




I turned to Mrs.                   


training: 100%|██████████| 2/2 [01:04<00:00, 32.03s/it]


([9.527825546264648, 6.592583560943604, 5.8596649169921875],
 [9.844965934753418, 7.13070821762085, 7.168105602264404],
 [1024, 6144, 11264])

In [11]:
# Close the TensorBoard writer now that training has finished.
log_writer.close()

## Load the parameters of the model

Load the saved checkpoint (model and optimizer state) to check the trained model.

In [12]:
# Restore the model and optimizer from the saved checkpoint.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(modelpath, map_location=device)
config = checkpoint["config"]
print(config==GPT_CONFIG)  # sanity check: the saved config matches the current one
model = models.GPTModel(config)
model.load_state_dict(checkpoint["model_state_dict"])
# Move the model to the target device BEFORE creating the optimizer and loading
# its state, so the restored optimizer state lives on the same device as the
# parameters it belongs to.
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
# The loaded model is used for generation next, so switch to evaluation mode
# (disables dropout). eval() returns the model, so the cell still displays it.
model.eval()


True


GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttendtion_new(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Dropout(p=0.1, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttendtion_new

### Test the generation capability of the saved model

In [13]:
# Print five generations from the loaded model for the same prompt.
# NOTE(review): the trailing positional values (100, 0.8, 50, 1, 50256) are
# passed straight to model_wrapper.generate_and_print; their meaning is defined
# by that helper and is not visible here — confirm against its signature.
generation_args = (model, tokenizer, device, start_context, 100, 0.8, 50, 1, 50256)
for _ in range(5):
    model_wrapper.generate_and_print(*generation_args)

I turned to Mrs he him painting----'t the it by his of a as was his was I as of the the for the been the on the's been,."'s him the to you him you up-- her--. him the." on you for me the ofroud was't been and him it about he you a." his in his theis had the my the it not have I toburn had to as with _ all " the-- _-'t a-- his about." G as on in
I turned to Mrs he androud it by his Jack--I- the-- was the that. his me had the.I in was it the be's been of by--is at my the,"," his--."I _ and his-- Iburn that to on his his I the her," me. G that the up of the. G," beenis that, itis to toburn-- of in foris as the-, and she the to of a of he-- all was a be't
I turned to Mrs me's to you and so with he of it-- so to had with, it been her. " that for up---- his myis. in, I me in ais you it an with of the to, and to-- that, but ofburn. you the my of.  such the on-'s, on herburn so." .-- him so," the! the for an the my of _ painting his, so in Iroud about a his I-- been a
I turned to Mrs be-- a he