# 彻底搞懂GPT是如何训练的

### 1. 加载数据集

In [2]:
from torch.utils.data import Dataset

dataset_path = "/raid/gfc/llm/datasets/ChinesePoems/poems.txt"
# 制作 Dataset
class MyDataset(Dataset):
    def __init__(self, dataset_path):
        with open(dataset_path, 'r', encoding='utf-8') as f:
            self.data = f.readlines()
        self.data = [line.strip() for line in self.data]
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# 加载数据集
dataset = MyDataset(dataset_path)
print(f"数据集大小: {len(dataset)}")

数据集大小: 217561


### 2. 加载模型和分词器

In [3]:
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
import torch

device = torch.device("cuda:7" if torch.cuda.is_available() else "cpu")

cache_dir = "/raid/gfc/llm/models"
tokenizer = BertTokenizer.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall", cache_dir=cache_dir)
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall", cache_dir=cache_dir).to(device)

print(tokenizer)
print('--' * 40)
print(model)

  from .autonotebook import tqdm as notebook_tqdm


BertTokenizer(name_or_path='uer/gpt2-distil-chinese-cluecorpussmall', vocab_size=21128, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
--------------------------------------------------------------------------------
GPT2LMHeadModel(
  (transf

### 3. 定义dataloader

In [4]:
def collate_fn(batch):
    data = tokenizer.batch_encode_plus(batch, 
                                       padding=True, 
                                       truncation=True, 
                                       max_length=512, 
                                       return_tensors="pt")
    data['labels'] = data['input_ids'].clone()
    return data

dataloader = torch.utils.data.DataLoader(
    dataset=dataset, 
    batch_size=16, shuffle=True, 
    drop_last=True, 
    collate_fn=collate_fn
    )
print(len(dataloader))

13597


### 4. 定义训练函数

In [11]:
import time

Epochs = 1

# 定义训练函数
def train():
    model.train()

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=Epochs)
    # 梯度裁剪
    max_grad_norm = 1.0

    for epoch in range(Epochs):
        total_loss = 0
        num_batches = 0
        for i, batch in enumerate(dataloader):
            # 1. 对于GPT这类自回归（Causal Language Model, CLM）模型，labels和input_ids是一样的
            input_ids = batch['input_ids'].to(device) # [batch_size, seq_len]
            labels = batch['labels'].to(device)
            # 2. outputs包含loss,logits和past_key_values的字典
            outputs = model(input_ids, labels=labels)
            # print(outputs.logits.shape) # [batch_size, seq_len, vocab_size]
            loss = outputs.loss
            loss.backward()
            
            # 3. 梯度裁剪
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
              
            optimizer.step()
            optimizer.zero_grad()
            
            if i % 100 == 0:
                with torch.no_grad():
                    # 1. 切换到评估模式（关闭dropout等）
                    model.eval()
                    # 2. 取模型输出的预测，每个位置选概率最大的token
                    #    去掉最后一个token，因为最后一个token不需要再做预测
                    out = outputs.logits.argmax(dim=2)[:, :-1] # [batch_size, seq_len-1]

                    # 3. 取labels去掉第一个token，因为第一个token不会成为被预测的目标
                    labels = batch['labels'][:, 1:].to(device) # [batch_size, seq_len-1]

                    # 4. 只保留非padding部分（labels!=0），避免padding影响准确率
                    select = labels != 0
                    out = out[select]
                    labels = labels[select]

                    # 5. 计算准确率（预测正确的token数 / 有效token总数）
                    acc = (labels == out).sum().item() / labels.numel()
                    # 6. 获取当前学习率，打印日志
                    lr = optimizer.param_groups[0]['lr']
                    print(f"Epoch {epoch}, Step {i}, Lr {lr:.5e}, Loss {loss:.5f}, Acc {acc:.2%}")

                    # 7. 切回训练模式
                    model.train()
                    # 8. 删除变量释放显存
                    del select
                    
            total_loss += loss.item()
            num_batches += 1

        # 每个epoch结束后的操作
        avg_loss = total_loss / num_batches
        print(f"Epoch {epoch} completed. Average Loss: {avg_loss:.5f}")

        # 调整学习率
        scheduler.step()   
            

In [None]:
train()

Epoch 0, Step 0, Lr 2.00000e-05, Loss 9.62540, Acc 13.86%
Epoch 0, Step 100, Lr 2.00000e-05, Loss 3.33211, Acc 17.90%
Epoch 0, Step 200, Lr 2.00000e-05, Loss 2.25275, Acc 19.11%
