# 构造 `VeRL` 可用的 RL 数据

RL 训练时用到的 3 个 Dataset：

- `SidDataset`
- `RLTitle2SidDataset`
- `RLSeqTitle2SidDataset`

MiniOneRec 这些 Dataset 本来返回的是 `input_ids/labels`。
现在改成通过 `get_alpaca()` 返回 Alpaca 风格的文本样本（`instruction` / `input` / `output`），再据此构造 RL 训练数据。

目标输出：
- `train.parquet`
- `val.parquet`

每条样本（row）建议字段（schema）如下：
- `data_source`: string（子任务名，如 `sid` / `title2sid` / `seq_title2sid`）  
- `prompt`: list[dict]（HF chat template 消息列表：`[{role, content}, ...]`）  
- `ability`: string（任务大类，如 `rec`）  
- `reward_model`: dict（至少包含 `ground_truth`；也可带 `style` 等）  
- `extra_info`: dict（你想透传到 reward function 的额外信息）

> 在 veRL 的自定义 reward 中，会拿到：`data_source, solution_str, ground_truth, extra_info`，因此务必把 reward 所需信息放进 `reward_model.ground_truth` 或 `extra_info`。


In [None]:
import os, sys, json, random
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch

In [None]:
from data import (
    SidDataset, 
    RLTitle2SidDataset, 
    RLSeqTitle2SidDataset
)

In [None]:
# ---- Run configuration -------------------------------------------------
# NOTE(review): base_model, BSZ, MICRO_BSZ, seed and cutoff_len are not
# referenced by the cells visible here; confirm where they are consumed.
base_model = "../llms/Qwen/Qwen3-0.6B"
SFT_DATA_ROOT = "./data/Amazon2018"
BSZ = 32
MICRO_BSZ = 4

seed = 42
cutoff_len = 612
# Forwarded as the `sample=` argument of the dataset constructors.
# NOTE(review): -1 presumably means "use every row" -- confirm in `data`.
sample = -1

# Category key used in the on-disk layout (alternative: "Office_Products").
category = "Industrial_and_Scientific"
CATEGORY = category

# Category key -> natural-language description handed to the datasets.
category_dict = {
    "Industrial_and_Scientific": "industrial and scientific items",
    "Office_Products": "office products",
    "Toys_and_Games": "toys and games",
    "Sports": "sports and outdoors",
    "Books": "books",
}

# Per-split interaction CSVs share one naming pattern; only the split differs.
train_file, eval_file, test_file = (
    f"{SFT_DATA_ROOT}/{CATEGORY}/{split}/{CATEGORY}_convert.csv"
    for split in ("train", "valid", "test")
)
info_file = f"{SFT_DATA_ROOT}/{CATEGORY}/info/{CATEGORY}_convert.txt"

# Item-level resources stored directly under the category directory.
sid_index_path = f"{SFT_DATA_ROOT}/{CATEGORY}/{CATEGORY}.index.json"
item_meta_path = f"{SFT_DATA_ROOT}/{CATEGORY}/{CATEGORY}.item.json"

In [None]:
# Build the three training datasets and the evaluation dataset. Every one
# receives the natural-language description of the selected category.
# Construction order is kept as-is in case the constructors draw samples.

# SidDataset over the training CSV.
train_data1 = SidDataset(
    train_file,
    category=category_dict[category],
    sample=sample,
)

# RLTitle2SidDataset over the item metadata and the SID index.
train_data2 = RLTitle2SidDataset(
    item_file=item_meta_path,
    index_file=sid_index_path,
    category=category_dict[category],
    sample=sample,
)

# RLSeqTitle2SidDataset over the training CSV; unlike the others it is
# given a fixed sample=10000 instead of the global `sample` setting.
train_data3 = RLSeqTitle2SidDataset(
    train_file,
    category=category_dict[category],
    sample=10000,
)

# The training datasets stay separate; merging them was left disabled:
# train_data = ConcatDataset(train_datasets)

# Evaluation uses SidDataset only, over the validation CSV.
eval_data = SidDataset(
    eval_file,
    category=category_dict[category],
    sample=sample,
)

In [None]:
# Sanity check: show what `pre_alpaca(1)` yields for every dataset, in the
# order train sid, train title2sid, train seq-title2sid, eval.
# NOTE(review): the argument is presumably an example index -- confirm.
for ds in (train_data1, train_data2, train_data3, eval_data):
    print(ds.pre_alpaca(1))

In [None]:
# Materialize each dataset through `get_alpaca()` and print how many
# examples it produced. The export cell below reads the `instruction`,
# `input` and `output` keys of every example.
# Calls and prints stay interleaved so each count appears right after
# its dataset has been generated.

# Training examples from SidDataset.
train_rl_sid = train_data1.get_alpaca()
print(len(train_rl_sid))

# Training examples from RLTitle2SidDataset.
train_rl_title2sid = train_data2.get_alpaca()
print(len(train_rl_title2sid))

# Training examples from RLSeqTitle2SidDataset.
train_rl_seqtitle2sid = train_data3.get_alpaca()
print(len(train_rl_seqtitle2sid))

# Evaluation examples from SidDataset over the validation CSV.
eval_rl_sid = eval_data.get_alpaca()
print(len(eval_rl_sid))

In [None]:
# plug datasets here
DATASETS_TRAIN = {
    "train_rl_sid": train_rl_sid,
    "train_rl_title2sid": train_rl_title2sid,
    "train_rl_seqtitle2sid": train_rl_seqtitle2sid,
}

DATASETS_EVAL = {
    "eval_rl_sid": eval_rl_sid,
}

LIMIT_PER_DATASET = None  # e.g. 20000

In [None]:
# Export every registered dataset to `<OUT_DIR>/<name>.jsonl`, one JSON
# object per line, and remember where each file was written.
OUT_DIR = Path("./rl_data")   # <<< change the output directory here
OUT_DIR.mkdir(parents=True, exist_ok=True)
saved_paths = {}  # dataset name -> Path of the JSONL file written for it


def _export_jsonl(name, ds, out_dir):
    """Write the examples of ``ds`` to ``out_dir / f"{name}.jsonl"``.

    Args:
        name: Dataset name; used as the stem of the output file.
        ds: Iterable of dict examples, each holding string values under
            ``instruction``, ``input`` and ``output``.
        out_dir: Existing directory (``pathlib.Path``) to write into.

    Returns:
        The path of the file that was written.

    Raises:
        KeyError: If an example lacks one of the three expected keys.

    The three text fields are stripped of surrounding whitespace in place,
    so the in-memory examples end up normalized as well. Any other keys of
    an example are serialized unchanged.
    """
    out_path = out_dir / f"{name}.jsonl"
    with open(out_path, "w", encoding="utf-8") as f:
        for ex in ds:
            for field in ("instruction", "input", "output"):
                ex[field] = ex[field].strip()
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            f.write(json.dumps(ex, ensure_ascii=False) + "\n")
    return out_path


# Train datasets first, then eval, using one shared code path.
for _datasets in (DATASETS_TRAIN, DATASETS_EVAL):
    for name, ds in _datasets.items():
        out_path = _export_jsonl(name, ds, OUT_DIR)
        saved_paths[name] = out_path
        # Report success only once the file has actually been written.
        print(f"[OK] {name}: -> {out_path}")