In [None]:
# 📌 1. 导入库
import torch
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

from google.colab import drive
drive.mount('/content/drive')  #这里使用了colab中的google drive来保存信息到云盘。

# 📌 2. 检查 GPU
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")

# 📌 3. 加载 wikitext 数据集
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# 📌 4. 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # 必须设置 pad_token

# 📌 5. Tokenize & 拼接数据为长序列
block_size = 128

def tokenize(example):
    return tokenizer(example["text"])

def group_texts(examples):
    # 拼接所有 token
    joined = sum(examples["input_ids"], [])
    total_length = len(joined)
    total_length = (total_length // block_size) * block_size
    result = {
        "input_ids": [joined[i:i+block_size] for i in range(0, total_length, block_size)],
        "attention_mask": [ [1]*block_size ] * (total_length // block_size)
    }
    return result

tokenized = dataset.map(tokenize, batched=True, remove_columns=["text"])
lm_dataset = tokenized.map(group_texts, batched=True)

# 📌 6. 构建模型
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=512,
    n_ctx=512,
    n_embd=768,
    n_layer=12,
    n_head=12,
    pad_token_id=tokenizer.pad_token_id
)
model = GPT2LMHeadModel(config)

# 📌 7. 训练参数
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/models/gpt1-wikitext-model",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=100,
    save_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    logging_dir="/content/drive/MyDrive/models/logs",
    save_total_limit=2,
    fp16=True  # ✅ 混合精度
)

# 📌 8. 训练器准备
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# 📌 9. 开始训练！
trainer.train()

# 📌 10. 保存模型
trainer.save_model("/content/drive/MyDrive/models/gpt1-wikitext-final")
tokenizer.save_pretrained("/content/drive/MyDrive/models/gpt1-wikitext-final")


In [None]:
# 查看训练曲线
import matplotlib.pyplot as plt

logs = trainer.state.log_history
steps = [log["step"] for log in logs if "loss" in log]
losses = [log["loss"] for log in logs if "loss" in log]

plt.plot(steps, losses)
plt.xlabel("Step")
plt.ylabel("Training Loss")
plt.title("GPT-1 Training Loss")
plt.grid()
plt.show()

In [None]:
# 查询训练效果
from transformers import AutoTokenizer, GPT2LMHeadModel, pipeline

# 加载你训练好的模型和 tokenizer
model_path = "/content/drive/MyDrive/models/gpt1-wikitext-final"  # 你实际保存的位置

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path)
model.to("cuda").eval()  # 加上 eval() 和 CUDA

# 创建生成 pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)

# 试试你的 prompt
prompt = "The future of artificial intelligence"
outputs = generator(
    prompt,
    max_length=100,          # 最长生成长度
    do_sample=True,          # 使用采样
    top_k=50,                # 采样 top-k
    top_p=0.95,              # nucleus sampling
    temperature=1.0,         # 控制多样性
    num_return_sequences=1   # 返回一个结果
)

print("=== Generated Text ===")
print(outputs[0]["generated_text"])

# 结果输出：
=== Generated Text ===

The future of artificial intelligence on the United States, and the entire city was the first first series. He was built by the game at $ 6 in the 19th World War after the 2006, the series was released in the 19 – 1 January 2007. When the same year he had first year as well as well as part of all @-@ single as a result of the New York, although the season was played in September 2009. It was released, and the song was established by the year

In [None]:
# 基于qs_sft_zh.jsonl的数据进行SFT
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    GPT2LMHeadModel,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
import torch

# 路径
data_path = "/content/drive/MyDrive/data/qs_sft_zh.jsonl"
model_path = "/content/drive/MyDrive/models/gpt1-wikitext-final"
save_path = "/content/drive/MyDrive/models/gpt1-sft-qs"

# 📌 1. 加载原始 JSONL 数据并切分验证集
dataset = load_dataset("json", data_files=data_path, split="train")
dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

# 📌 2. 加载 tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# 📌 3. Tokenize 数据
def format_prompt(example):
    full_text = f"问：{example['prompt']}\n答：{example['response']}"
    return {"text": full_text}

train_dataset = train_dataset.map(format_prompt)
eval_dataset = eval_dataset.map(format_prompt)

def tokenize(example):
    return tokenizer(example["text"])

train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=["prompt", "response", "text"])
eval_dataset = eval_dataset.map(tokenize, batched=True, remove_columns=["prompt", "response", "text"])


# 📌 4. 加载模型
model = GPT2LMHeadModel.from_pretrained(model_path)

# 📌 5. 训练参数（开启验证）
training_args = TrainingArguments(
    output_dir=save_path,
    num_train_epochs=50,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_steps=50,
    save_steps=200,
    eval_strategy="steps",   # ✅ 每 N 步评估一次
    eval_steps=100,
    logging_dir=f"{save_path}/logs",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
)

# 📌 6. Data Collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# 📌 7. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,  # ✅ 添加验证集
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 📌 8. 开始训练
trainer.train()

# 📌 9. 保存模型
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

In [None]:
# 验证微调结果
import torch

def ask_model(question, max_new_tokens=128):
    prompt = f"用户：{question}\n助手："
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False  # greedy decoding
        )

    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    answer = full_output.split("助手：", 1)[-1].strip()
    return answer

print(ask_model("2024 年 QS 世界大学排名中亚洲大学表现如何？"))
print(ask_model("QS 和 THE 排名的主要区别是什么？"))
print(ask_model("如何使用 QS 排名选择适合自己的学校？"))

# 输出
排名中，清华大学略高于北京大学，分列全球第 17 和第 18 位，两者差距非常小，均为亚洲顶尖高校。源，得分越高。�便于东京大学也
QS 是两个著名的大学排名机构。QS 更重视学术声誉和雇主声誉，而 THE 更关注教学、研究和国际视野等方面的综合表现。两者权重体系不�
逮如指标是通过统计位，还应关注学科排名、地理位置、学费、语言环境和个人兴趣等因素。QS 提供了详细的分项数据，便于个�

In [None]:
# 准备SFT之后的强化学习，提升性能，基于RLHF
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import json
from transformers import AutoTokenizer, GPT2Model, GPT2Config, get_scheduler
from tqdm import tqdm
from torch.optim import AdamW

# === 1. 参数设置 ===
model_name_or_path = "/content/drive/MyDrive/models/gpt1-sft-qs"
data_path = "/content/drive/MyDrive/data/qs_ranking_rm_data.jsonl"
device = "cuda" if torch.cuda.is_available() else "cpu"
max_length = 512
batch_size = 2
num_epochs = 8
lr = 5e-5

# === 2. 数据集 ===
class PairwiseRMData(Dataset):
    def __init__(self, path, tokenizer):
        self.data = []
        with open(path, encoding="utf-8") as f:
            for line in f:
                #print(line)
                item = json.loads(line)
                self.data.append(item)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        d = self.data[idx]
        prompt = d["prompt"]
        chosen = prompt + d["chosen"]
        rejected = prompt + d["rejected"]
        chosen_ids = self.tokenizer(chosen, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")
        rejected_ids = self.tokenizer(rejected, return_tensors="pt", truncation=True, max_length=max_length, padding="max_length")
        return {
            "chosen_input_ids": chosen_ids["input_ids"].squeeze(0),
            "chosen_attention_mask": chosen_ids["attention_mask"].squeeze(0),
            "rejected_input_ids": rejected_ids["input_ids"].squeeze(0),
            "rejected_attention_mask": rejected_ids["attention_mask"].squeeze(0),
        }

# === 3. 模型结构 ===
class RewardModel(nn.Module):
    def __init__(self, model_name_or_path):
        super().__init__()
        config = GPT2Config.from_pretrained(model_name_or_path)
        config.pad_token_id = config.eos_token_id
        self.gpt = GPT2Model.from_pretrained(model_name_or_path, config=config)
        self.value_head = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [B, L, H]
        rewards = self.value_head(last_hidden).squeeze(-1)  # [B, L]
        lengths = attention_mask.sum(dim=1) - 1
        last_rewards = rewards[range(rewards.size(0)), lengths]
        return last_rewards

# === 4. 初始化 ===
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

dataset = PairwiseRMData(data_path, tokenizer)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = RewardModel(model_name_or_path).to(device)
optimizer = AdamW(model.parameters(), lr=lr)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=len(dataloader) * num_epochs)

# === 5. 训练循环 ===
loss_fn = nn.MarginRankingLoss(margin=0.5)

model.train()
for epoch in range(num_epochs):
    pbar = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for batch in pbar:
        chosen_ids = batch["chosen_input_ids"].to(device)
        chosen_mask = batch["chosen_attention_mask"].to(device)
        rejected_ids = batch["rejected_input_ids"].to(device)
        rejected_mask = batch["rejected_attention_mask"].to(device)

        chosen_rewards = model(chosen_ids, chosen_mask)
        rejected_rewards = model(rejected_ids, rejected_mask)

        loss = loss_fn(chosen_rewards, rejected_rewards, torch.ones_like(chosen_rewards).to(device))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        pbar.set_postfix(loss=loss.item())

# === 6. 保存模型 ===
save_path = "/content/drive/MyDrive/models/rm-qs"
model.gpt.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("✅ 奖励模型保存完成！")

# 输出
Epoch 1: 100%|██████████| 140/140 [00:32<00:00,  4.30it/s, loss=0]

Epoch 2: 100%|██████████| 140/140 [00:32<00:00,  4.33it/s, loss=0]

Epoch 3: 100%|██████████| 140/140 [00:32<00:00,  4.35it/s, loss=0]

Epoch 4: 100%|██████████| 140/140 [00:32<00:00,  4.34it/s, loss=0]

Epoch 5: 100%|██████████| 140/140 [00:32<00:00,  4.33it/s, loss=0]

Epoch 6: 100%|██████████| 140/140 [00:32<00:00,  4.34it/s, loss=0]

Epoch 7: 100%|██████████| 140/140 [00:32<00:00,  4.34it/s, loss=0]

Epoch 8: 100%|██████████| 140/140 [00:32<00:00,  4.34it/s, loss=0]

✅ 奖励模型保存完成！

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# 加载模型和 tokenizer
reward_model_path = "/content/drive/MyDrive/models/rm-qs"

rm_tokenizer = AutoTokenizer.from_pretrained(reward_model_path)
rm_model = AutoModelForSequenceClassification.from_pretrained(reward_model_path)
rm_model.eval().cuda()  # 如果有 GPU 的话

# 输入测试对
prompt = "请介绍清华大学。"
response = "清华大学是中国顶尖的综合性大学，位于北京市。"

# 构造模型输入
text = f"Prompt: {prompt}\nResponse: {response}"  # 确保格式和训练时一致
inputs = rm_tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(rm_model.device)

# 前向推理得到 reward 分数
logits = rm_model(**inputs).logits
reward = logits.squeeze()
if reward.numel() > 1:
    reward = reward[0]
print("Reward score:", reward.item())

# 输出
Reward score: 0.35942158102989197



In [None]:
# 基于上面的SFT和奖励模型进行PPO RLHF
import torch
import json
from tqdm import tqdm
from transformers import AutoTokenizer, GPT2Model, GPT2Config
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from torch.utils.data import Dataset as TorchDataset
from torch import nn

# =============================
# 1. 路径设置
# =============================
sft_model_path = "/content/drive/MyDrive/models/gpt1-sft-qs"
reward_model_path = "/content/drive/MyDrive/models/rm-qs"
save_path = "/content/drive/MyDrive/models/gpt1-rlhf-qs"
prompts_file = "/content/drive/MyDrive/data/qs_sft_zh.jsonl"
device = "cuda" if torch.cuda.is_available() else "cpu"


# =============================
# 2. 自定义 Dataset
# =============================
class PromptOnlyDataset(TorchDataset):
    def __init__(self, jsonl_path):
        self.data = []
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                item = json.loads(line)
                self.data.append(item["prompt"])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return {"prompt": self.data[idx]}


# =============================
# 3. 加载模型和 tokenizer
# =============================
tokenizer = AutoTokenizer.from_pretrained(sft_model_path)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLMWithValueHead.from_pretrained(sft_model_path).to(device)
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(sft_model_path).to(device)
ref_model.eval()


# =============================
# 4. 定义数据集和 collator
# =============================
prompts_dataset = PromptOnlyDataset(prompts_file)

def collate_fn(batch):
    prompts = [item["prompt"] for item in batch]
    tokenized = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=256)
    tokenized["input_ids"] = tokenized["input_ids"].to(device)
    tokenized["attention_mask"] = tokenized["attention_mask"].to(device)
    return tokenized


# =============================
# 5. PPO 配置和 Trainer
# =============================
ppo_config = PPOConfig(
    learning_rate=1e-5,
    batch_size=8,
    mini_batch_size=4,
    log_with="tensorboard",
    project_kwargs={"logging_dir": "/content/drive/MyDrive/models/tensorboard_logs"}
)

ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=prompts_dataset,
    data_collator=collate_fn
)


# =============================
# 6. 加载奖励模型
# =============================
class RewardModel(nn.Module):
    def __init__(self, model_name_or_path):
        super().__init__()
        config = GPT2Config.from_pretrained(model_name_or_path)
        config.pad_token_id = config.eos_token_id
        self.gpt = GPT2Model.from_pretrained(model_name_or_path, config=config)
        self.value_head = nn.Linear(config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.gpt(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state  # [B, L, H]
        rewards = self.value_head(last_hidden).squeeze(-1)  # [B, L]
        lengths = attention_mask.sum(dim=1) - 1
        last_rewards = rewards[range(rewards.size(0)), lengths]
        return last_rewards

reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_path)
reward_tokenizer.pad_token = reward_tokenizer.eos_token
reward_model = RewardModel(reward_model_path).to(device)
reward_model.eval()


# =============================
# 7. 奖励函数（调用 reward model）
# =============================
def reward_fn(responses, prompts):
    texts = [p + r for p, r in zip(prompts, responses)]
    tokenized = reward_tokenizer(
        texts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(device)
    with torch.no_grad():
        rewards = reward_model(
            input_ids=tokenized["input_ids"],
            attention_mask=tokenized["attention_mask"]
        )
    return list(rewards)


# =============================
# 8. PPO 强化训练主循环
# =============================
num_rl_epochs = 10
log_file = "/content/drive/MyDrive/models/ppo_rlhf_log.txt"

print("Starting PPO RLHF training...")

for epoch in range(num_rl_epochs):
    print(f"=== RL Epoch {epoch+1}/{num_rl_epochs} ===")

    for step, batch in enumerate(tqdm(ppo_trainer.dataloader, desc="PPO Training")):
        queries = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        # === 模型生成回复 ===
        responses = model.generate(
            input_ids=queries,
            attention_mask=attention_mask,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.9,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )

        query_texts = tokenizer.batch_decode(queries, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        response_texts = tokenizer.batch_decode(responses, skip_special_tokens=True, clean_up_tokenization_spaces=True)

        # === 去除重复 prompt ===
        cleaned_responses = []
        for q, r in zip(query_texts, response_texts):
            if r.startswith(q):
                cleaned_responses.append(r[len(q):].strip())
            else:
                cleaned_responses.append(r.strip())

        # === 计算奖励 ===
        rewards = reward_fn(cleaned_responses, query_texts)

        # === Tokenize 再次用于 PPOTrainer ===
        query_ids = [q.to(device) for q in queries]
        response_ids = [
            tokenizer(r, return_tensors="pt", truncation=True, max_length=256).input_ids[0].to(device)
            for r in cleaned_responses
        ]

        stats = ppo_trainer.step(query_ids, response_ids, rewards)

        if step % 1 == 0:
            print(f"\nStep {step}")
            print(f"Prompt: {query_texts[0][:50]}")
            print(f"Response: {cleaned_responses[0][:100]}")
            print(f"Reward: {rewards[0].item()}")

            with open(log_file, "a", encoding="utf-8") as f:
                f.write(f"Epoch {epoch+1}, Step {step}\n")
                f.write(f"Prompt: {query_texts[0]}\n")
                f.write(f"Response: {cleaned_responses[0]}\n")
                f.write(f"Reward: {rewards[0].item()}\n")
                f.write("=" * 50 + "\n")


# =============================
# 9. 保存最终模型
# =============================
ppo_trainer.model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("✅ PPO training completed and model saved to:", save_path)

# 输出
Starting PPO RLHF training...

=== RL Epoch 1/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.07s/it]

Step 0
Prompt: QS 排名中国际学生比例的得分意味着什么？
Response: 答：如 2024 年 QS 世界大学排名，前十大学主要分布在美国和英国：美国有 MIT、哈佛、斯坦福、加州理工，英国有剑桥、牛津、帝国理工学院。瑞士的苏黎世联邦理工也进入前十。�么表玒吁的。大学。更�
Reward: 1.0

=== RL Epoch 2/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -34.77 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.27s/it]

Step 0
Prompt: QS 排名前十的大学分布在哪些国家？
Response: ��？
答：根据 2024 年 QS 世界大学排名，前十大学主要分布在美国和英国：美国有 MIT、哈佛、斯坦福、加州理工，英国有剑桥、牛津、帝国理工学院。瑞士的苏黎世联邦理工也进入前十。�大学科亚校。
Reward: 1.0

=== RL Epoch 3/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -39.49 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]

Step 0
Prompt: 清华大学和北京大学在 QS 中哪个更高？
Response: 答：QS �掰如也�
�大学排名机构。QS 更重视是差�中国分请来自不同：第 18 位，两者差距非常小，均为亚洲顶尖高校。源，得分越高。�详�者高的苏黎世联邦理工也进入前十。�高， THE �与化略前
Reward: 1.0

=== RL Epoch 4/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -38.43 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.30s/it]

Step 0
Prompt: QS 和 THE 排名的主要区别是什么？
Response: ��：QS �名��重华�例 2024 ��国际午何�清华大学畨�重视的，和国理全球第 17 和第 18 位，两者差距非常小，均为亚洲顶尖高校。源，得分数据，便于个性化筛选。果可能差异较大。�。掰高。
Reward: 1.0

=== RL Epoch 5/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -65.09 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.25s/it]

Step 0
Prompt: QS 排名中师生比例是怎么计算的？
Response: QS �更�QS 和 THE 是两个著名的大学排名机构。QS 更重视学术声誉和雇主声誉，而 THE 更关注教一关研究和国际视野等方面的综合表现。两者权重体系不同，因此排名结果可能差异较大。标。掰�国邉
Reward: 1.0

=== RL Epoch 6/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -63.05 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.24s/it]

Step 0
Prompt: QS 排名前十的大学分布在哪些国家？
Response: �学在哪个��大学在 2024 年 QS 个巌大学排名中，略高校觑排名中，是�。�英国：第 18 位，两者差距非常小，均为亚洲顶商科方面表现�数据，便于个性化筛选。果可能差异较大。进。� QS 更高。
Reward: 1.0

=== RL Epoch 7/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -66.32 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.26s/it]

Step 0
Prompt: 复旦大学在 QS 排名中的位置如何？
Response: �似�
答：复旦根据 2024 年 QS 世界大学排名，前十大学东了际华��家学生人�位置、牗�劰。QS 掰量国际化水���。排名最高的大学。清华大学、化�分�大。较大。兌和个性化筛选。果可能差异较大
Reward: 1.0

=== RL Epoch 8/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -78.59 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.27s/it]

Step 0
Prompt: QS 排名前十的大学分布在哪些国家？
Response: 的？
答：复十出学在 QS 世界大学排名，前十大学主要分布在美国和英国：美国有 MIT、哈佛、斯坦福、加州理工，英国有剑桥、牛津、帝国理工学院。瑞士的苏黎世联邦理工也进入前十。浻�加�望�立�地加劁击
Reward: 1.0

=== RL Epoch 9/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -75.80 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.23s/it]

Step 0
Prompt: 2024 年 QS 世界大学排名中亚洲大学表现如何？
Response: 答：掰如�� QS 世界大学排名中位中，亚洲大学表现亮眼。��大学挮要大学扗京�大学者差距陆，国际华�后�得更��供了详现。两者巉方香港大学和东京大学也进入了前 30。�异辋大。�。掰游��京大学衡�
Reward: 1.0

=== RL Epoch 10/10 ===

PPO Training:   0%|          | 0/1 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
/usr/local/lib/python3.11/dist-packages/trl/trainer/ppo_trainer.py:1313: UserWarning: KL divergence is starting to become negative: -77.76 - this might be a precursor for failed training. sometimes this happens because the generation kwargs are not correctly set. Please make sure that the generation kwargs are set correctly, or review your training hyperparameters.
  warnings.warn(

PPO Training: 100%|██████████| 1/1 [00:05<00:00,  5.19s/it]

Step 0
Prompt: QS 和 THE 排名的主要区别是什么？
Response: ��名卭�？
答：根据 2024 年 QS 世界大学排名低大学略�于北京大学，分列全球第 17 和美国有 MIT、哈佛、斯坦福、加州理工，英国有剑桥、牛洲�得分��、�賻不同，因此排名结果可�差异较大
Reward: 1.0

✅ PPO training completed and model saved to: /content/drive/MyDrive/models/gpt1-rlhf-qs

In [None]:
# 测试效果
from transformers import pipeline

# 加载你微调后的 PPO 模型
model_path = "/content/drive/MyDrive/models/gpt1-rlhf-qs"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)

# 测试一些代表性的 prompt
test_prompts = [
    "2024 年 QS 世界大学排名中亚洲大学表现如何？",
    "请介绍一下牛津大学和剑桥大学的区别。",
    "我想申请新加坡国立大学，有什么建议？"
]

for prompt in test_prompts:
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    output_ids = model.generate(input_ids, max_new_tokens=128, do_sample=True)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(f"\n[Prompt]: {prompt}\n[Response]: {response}")


# 输出
[Prompt]: 2024 年 QS 世界大学排名中亚洲大学表现如何？

[Response]: 2024 年 QS 世界大学排名中亚洲大学表现如何？

答：�么� a single , whose video for their own , but "重最�亚亮，理衡中的雕 名左右，继�要中一：第 8。排合中的领先地位，尤其在人文学科�侎��是�。源， THE �掰据�

[Prompt]: 请介绍一下牛津大学和剑桥大学的区别。

[Response]: 请介绍一下牛津大学和剑桥大学的区别。排名中：在 2024 年 QS 世界大学排�排名中位中，亚洲大学表现亮眼。其中，新加坡国立大学排名第 17位，尤州排名最高的大学。清华大学、北京大学、

[Prompt]: 我想申请新加坡国立大学，有什么建议？

[Response]: 我想申请新加坡国立大学，有什么建议？
答：选择学校不能只看 QS 总排名，是逤主位人�和与�名、地理位置、学费、语言环境和个人兴趣等因素。QS 提供了详细的分项数据�

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification

# 加载 RLHF 后的模型（用于生成）
gen_model_path = "/content/drive/MyDrive/models/gpt1-rlhf-qs"
gen_tokenizer = AutoTokenizer.from_pretrained(gen_model_path)
gen_model = AutoModelForCausalLM.from_pretrained(gen_model_path).to("cuda")

# 加载奖励模型（用于评估打分）
reward_model_path = "/content/drive/MyDrive/models/rm-qs"
reward_tokenizer = AutoTokenizer.from_pretrained(reward_model_path)
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_path).to("cuda")
reward_model.eval()

# 一个示例 prompt
prompt = "2024 年 QS 世界大学排名中亚洲大学表现如何？"

# 1. 使用 RLHF 模型生成回答
input_ids = gen_tokenizer(prompt, return_tensors="pt").input_ids.to("cuda")
output_ids = gen_model.generate(input_ids, max_new_tokens=256, do_sample=True)
response = gen_tokenizer.decode(output_ids[0], skip_special_tokens=True)

# 2. 拼接 prompt + response，送入 reward model 评估
full_input = prompt + response
reward_inputs = reward_tokenizer(full_input, return_tensors="pt", truncation=True, max_length=256).to("cuda")

with torch.no_grad():
    outputs = reward_model(**reward_inputs)
    reward_score = torch.softmax(outputs.logits, dim=-1)[0][1].item()  # 取正向得分（视你的模型而定）

print("Prompt:", prompt)
print("Response:", response)
print("Reward Score:", reward_score)


# 输出：
Prompt: 2024 年 QS 世界大学排名中亚洲大学表现如何？

Response: 2024 年 QS 世界大学排名中亚洲大学表现如何？�？

答：2024 年 QS 世界大学排名中，亚洲大学表现亮眼。其中，亚渣�国立大学排名第 8，是亚洲排名最高的大学。清华大学、� ( 掰比重较大。��化觡�化�世联邦理工也进入前十。��右羃大学。了�教�巸��占�名科京�大学。�大学、学。��和于��仲 18 佛港���筛洲高的大学� @-S ��

Reward Score: 0.4595920741558075