## GRPO探索

### 准备环境与辅助函数

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType

# Demo environment configuration: run on the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# GRPO hyperparameters.
GROUP_SIZE = 4   # completions sampled per prompt (kept small for the demo; 8 is suggested in practice)
CLIP_EPS = 0.2   # PPO clipping threshold
BETA = 0.04      # KL-divergence penalty coefficient

def get_log_probs(logits, labels):
    """Return the log-probability assigned to each label token.

    Args:
        logits: Unnormalized scores whose last axis is the vocabulary,
            e.g. ``[Batch, Seq_Len, Vocab]``.
        labels: Integer token ids shaped like ``logits`` without the
            vocabulary axis, e.g. ``[Batch, Seq_Len]``.

    Returns:
        A tensor shaped like ``labels`` holding
        ``log_softmax(logits)[..., labels]`` for every position.
    """
    # gather() needs the index to have the same rank as the source, so the
    # labels get a trailing singleton axis that is squeezed away afterwards.
    token_index = labels.unsqueeze(-1)
    normalized = F.log_softmax(logits, dim=-1)
    picked = normalized.gather(-1, token_index)
    return picked.squeeze(-1)

[cell output, stderr — tail of a tqdm warning]   from .autonotebook import tqdm as notebook_tqdm


### 核心 GRPO Loss 实现 (包含魔法细节)

In [2]:
class GRPOLoss(nn.Module):
    """GRPO objective: PPO-style clipped surrogate plus a KL penalty.

    Advantages are computed relative to the other completions sampled for the
    same prompt (group-relative), so no value network is involved.

    Args:
        clip_eps: Half-width of the clipping interval applied to the
            importance ratio (``[1 - clip_eps, 1 + clip_eps]``).
        beta: Weight of the KL penalty in the total loss.
    """

    def __init__(self, clip_eps=0.2, beta=0.04):
        super().__init__()
        self.clip_eps = clip_eps
        self.beta = beta

    @staticmethod
    def _masked_mean(values, mask):
        """Average ``values`` over the positions selected by ``mask``."""
        # A fully masked batch would otherwise be 0 / 0 = NaN; the tiny floor
        # leaves every non-empty 0/1 mask untouched and yields 0 when empty.
        denom = mask.sum().clamp_min(1e-8)
        return (values * mask).sum() / denom

    def forward(self,
                old_log_probs,    # log-probs recorded at sampling time (constant)
                new_log_probs,    # log-probs of the model being trained (carries grad)
                ref_log_probs,    # log-probs of the reference/base model (for KL)
                rewards,          # reward of every completion, [Batch, Group]
                mask              # 1 on completion tokens, 0 elsewhere, [Batch*Group, Seq_Len]
                ):
        """Compute the GRPO loss.

        Args:
            old_log_probs: Per-token log-probs under the sampling policy,
                ``[Batch*Group, Seq_Len]``. Treated as a constant.
            new_log_probs: Per-token log-probs under the current policy,
                same shape; gradients flow through this tensor only.
            ref_log_probs: Per-token log-probs under the reference model,
                same shape. Treated as a constant.
            rewards: Scalar reward per completion, ``[Batch, Group]``.
            mask: Weights selecting the tokens that contribute to the loss
                (prompt and padding positions should be 0).

        Returns:
            ``(total_loss, policy_loss, kl_loss)`` as scalar tensors, where
            ``total_loss = policy_loss + beta * kl_loss``.
        """
        # The sampling-time and reference log-probs are fixed targets; detach
        # them so a caller that forgot ``no_grad`` cannot leak gradients.
        old_log_probs = old_log_probs.detach()
        ref_log_probs = ref_log_probs.detach()
        if not rewards.is_floating_point():
            rewards = rewards.float()
        if not mask.is_floating_point():
            mask = mask.float()

        # --------------------------------------------------------------
        # 1. Group-relative advantage
        # --------------------------------------------------------------
        mean_rewards = rewards.mean(dim=1, keepdim=True)
        if rewards.size(1) > 1:
            std_rewards = rewards.std(dim=1, keepdim=True)
        else:
            # The Bessel-corrected std of a single sample is NaN; a lone
            # completion has nothing to be compared against, so use 0 spread
            # (which gives a zero advantage below).
            std_rewards = torch.zeros_like(mean_rewards)

        # Standardize inside each group; the epsilon avoids division by zero
        # when every completion in a group received the same reward.
        advantages = (rewards - mean_rewards) / (std_rewards + 1e-8)

        # [Batch, Group] -> [Batch * Group, 1] so it broadcasts over tokens.
        advantages = advantages.view(-1, 1)

        # --------------------------------------------------------------
        # 2. Token-level importance ratio p_new / p_old
        # --------------------------------------------------------------
        ratio = torch.exp(new_log_probs - old_log_probs)

        # --------------------------------------------------------------
        # 3. Clipped surrogate (policy loss)
        # --------------------------------------------------------------
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1.0 - self.clip_eps, 1.0 + self.clip_eps) * advantages

        # The surrogate is maximized, hence the minus sign for a loss.
        policy_loss = self._masked_mean(-torch.min(surr1, surr2), mask)

        # --------------------------------------------------------------
        # 4. KL penalty towards the reference model
        # --------------------------------------------------------------
        # Estimator exp(d) - d - 1 with d = log p_ref - log p_new. It is
        # non-negative for every token, unlike the plain difference
        # (log p_new - log p_ref).
        log_ratio_ref = ref_log_probs - new_log_probs
        kl_div = torch.exp(log_ratio_ref) - log_ratio_ref - 1
        kl_loss = self._masked_mean(kl_div, mask)

        # --------------------------------------------------------------
        # 5. Total loss
        # --------------------------------------------------------------
        total_loss = policy_loss + self.beta * kl_loss

        return total_loss, policy_loss, kl_loss

### 完整的训练循环 (Training Loop)

In [None]:
# ==========================================
# 1. Model initialization (LoRA mode)
# ==========================================
model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # example checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map=device
)

# LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(base_model, peft_config)  # `model` is now the actor (policy)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
grpo_loss_fn = GRPOLoss(clip_eps=CLIP_EPS, beta=BETA)

# ==========================================
# 2. Simulate one batch of prompts
# ==========================================
prompts = ["计算 123 + 456", "查询北京天气"]
# Real usage: inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(device)
# To keep the demo short, a single prompt (batch_size=1) is used instead.
input_text = "User: Calculate 3+3. Assistant: <think>"
inputs = tokenizer(input_text, return_tensors="pt").to(device)
prompt_len = inputs.input_ids.shape[1]

# ==========================================
# 3. Training step
# ==========================================
optimizer.zero_grad()

# --- Step A: Rollout (sampling) ---
# No gradients are needed here; this step only produces training data.
# Switch to eval mode first: the LoRA layers are configured with
# lora_dropout=0.1, and in train mode that dropout would perturb both the
# sampled completions and the recorded old_log_probs.
model.eval()
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        do_sample=True,          # sampling must be enabled
        temperature=0.9,         # increase entropy
        top_p=0.95,
        max_new_tokens=50,
        num_return_sequences=GROUP_SIZE  # key point: G completions per prompt
    )
    # outputs shape: [Group_Size, Seq_Len]

    # old_log_probs: probabilities under the policy that produced the samples.
    # The model has not been updated yet, so that policy is `model` itself.
    logits = model(outputs).logits
    # Shift for next-token prediction: the logits at position t score the
    # token at position t + 1.
    logits = logits[:, :-1, :]
    labels = outputs[:, 1:]

    old_log_probs = get_log_probs(logits, labels)

    # ref_log_probs: probabilities under the base model.
    # Trick: inside disable_adapter() a PeftModel behaves like the base model.
    with model.disable_adapter():
        ref_logits = model(outputs).logits[:, :-1, :]
        ref_log_probs = get_log_probs(ref_logits, labels)

# --- Step B: Reward calculation ---
# Plug the real reward function in here.
generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
rewards = []
for text in generated_texts:
    # Toy reward: 1 point if the text contains "6", otherwise 0.
    score = 1.0 if "6" in text else 0.0
    rewards.append(score)

# Convert to a tensor of shape [1, Group_Size]
rewards_tensor = torch.tensor(rewards).to(device).unsqueeze(0)

# --- Step C: Forward pass with the current policy ---
# This pass needs gradients, so return to train mode and feed the generated
# sequences through the model once more.
model.train()
outputs_inputs = outputs  # [Group_Size, Seq_Len]

# These logits carry gradients.
new_logits = model(outputs_inputs).logits[:, :-1, :]
new_labels = outputs_inputs[:, 1:]
new_log_probs = get_log_probs(new_logits, new_labels)

# --- Step D: Masking (loss on the generated part only) ---
# The mask has the same shape as the shifted labels.
mask = torch.zeros_like(new_labels, dtype=torch.float32)
# Prompt positions stay 0, generated positions become 1. generate() returns
# the prompt too, and the labels are shifted by one, so the first generated
# token sits at label index prompt_len - 1.
mask[:, prompt_len-1:] = 1.0
# Padding tokens are masked out as well.
mask[new_labels == tokenizer.pad_token_id] = 0.0

# --- Step E: Loss ---
loss, policy_loss, kl_loss = grpo_loss_fn(
    old_log_probs=old_log_probs,  # computed under no_grad
    new_log_probs=new_log_probs,  # has grad_fn
    ref_log_probs=ref_log_probs,  # computed under no_grad
    rewards=rewards_tensor,
    mask=mask
)

print(f"Total Loss: {loss.item():.4f}, Policy Loss: {policy_loss.item():.4f}, KL: {kl_loss.item():.4f}")

# --- Step F: Backward pass ---
loss.backward()

# Gradient clipping (guards against exploding gradients)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

optimizer.step()

print("Step Finished. Parameters updated.")