# 构造 `LlamaFactory` 可用的 SFT 数据（Alpaca JSONL）

训练时用到的 5 个 Dataset：

- `SidSFTDataset`
- `SidItemFeatDataset`
- `FusionSeqRecDataset`
- `SFTData`
- `TitleHistory2SidSFTDataset`

MiniOneRec 的这些 Dataset 原本返回的是 `input_ids` / `labels`。
这里改为导出 Alpaca 格式的文本样本，每条样本是如下结构的 JSON 对象：


```json
{
    "instruction": "...", 
    "input": "...", 
    "output": "..."
}
```


In [1]:
import os, sys, json, random
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from data import (
    SFTData,
    SidSFTDataset,
    SidItemFeatDataset,
    FusionSeqRecDataset,
    TitleHistory2SidSFTDataset,
)

from sft import TokenExtender


In [3]:
# --- Model / data roots ------------------------------------------------------
base_model = "../llms/Qwen/Qwen3-0.6B"
SFT_DATA_ROOT = "./data/Amazon2018"

# --- Training-side settings --------------------------------------------------
# NOTE(review): BSZ, MICRO_BSZ, WANDB_PROJ, OUTPUT_DIR and SID_METHOD are not
# referenced by the cells visible in this notebook; presumably carried over
# from the training script -- confirm before relying on them.
BSZ = 32
MICRO_BSZ = 4
WANDB_PROJ = "minionerec"
OUTPUT_DIR = "output_sft/sft_base"
SID_METHOD = "rqvae"

# --- Dataset construction knobs ----------------------------------------------
seed = 42
cutoff_len = 612  # forwarded to the datasets as `max_len`
sample = -1       # presumably "no subsampling" -- TODO confirm in data.py

# --- Category selection (alternative value: "Office_Products") ---------------
category = "Industrial_and_Scientific"
CATEGORY = category


def _split_csv(split):
    """Return the `<root>/<category>/<split>/<category>_convert.csv` path."""
    return f"{SFT_DATA_ROOT}/{CATEGORY}/{split}/{CATEGORY}_convert.csv"


# Paths are kept as plain strings so that they print exactly as configured.
train_file = _split_csv("train")
eval_file = _split_csv("valid")
test_file = _split_csv("test")
info_file = f"{SFT_DATA_ROOT}/{CATEGORY}/info/{CATEGORY}_convert.txt"

sid_index_path = f"{SFT_DATA_ROOT}/{CATEGORY}/{CATEGORY}.index.json"
item_meta_path = f"{SFT_DATA_ROOT}/{CATEGORY}/{CATEGORY}.item.json"

In [4]:
# Tokenizer: reuse the EOS token as the padding token and pad on the left.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"

# Base model in bfloat16. In the visible cells the model is only used to
# resize its embedding matrix after the SID tokens are added below.
model = AutoModelForCausalLM.from_pretrained(base_model, dtype=torch.bfloat16)

# Extend the vocabulary with the tokens derived from the SID index file,
# but only when that file is configured and actually present on disk.
if sid_index_path and os.path.exists(sid_index_path):
    print(f"Loading index from {sid_index_path}")

    # "<dir>/<dataset>.index.json" -> data_path="<dir>", dataset="<dataset>"
    index_dir, index_filename = os.path.split(sid_index_path)
    token_extender = TokenExtender(
        data_path=index_dir,
        dataset=index_filename.split('.')[0],
    )

    new_tokens = token_extender.get_new_tokens()
    if new_tokens:
        print(f"Adding {len(new_tokens)} new tokens to tokenizer")
        tokenizer.add_tokens(new_tokens)
        # Keep the embedding matrix in sync with the enlarged vocabulary.
        model.resize_token_embeddings(len(tokenizer))

Loading index from ./data/Amazon2018/Industrial_and_Scientific/Industrial_and_Scientific.index.json
Adding 590 new tokens to tokenizer


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [5]:
# Keyword arguments shared by every dataset constructor in this cell.
dataset_kwargs = dict(
    tokenizer=tokenizer,
    max_len=cutoff_len,
    sample=sample,
    seed=seed,
    category=category,
)
# Extra arguments for the datasets that also read item metadata and the SID index.
item_index_kwargs = dict(item_file=item_meta_path, index_file=sid_index_path)

# The five training datasets.
train_data1 = SidSFTDataset(train_file=train_file, **dataset_kwargs)
train_data2 = SidItemFeatDataset(**item_index_kwargs, **dataset_kwargs)
train_data3 = FusionSeqRecDataset(train_file=train_file, **item_index_kwargs, **dataset_kwargs)
train_data4 = SFTData(train_file=train_file, **dataset_kwargs)
train_data5 = TitleHistory2SidSFTDataset(train_file=train_file, **item_index_kwargs, **dataset_kwargs)

# Validation uses the SID-only dataset class on the validation CSV.
val_data = SidSFTDataset(train_file=eval_file, **dataset_kwargs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_sid'] = eval(row['history_item_sid'])
100%|██████████| 33185/33185 [00:16<00:00, 1986.62it/s]
100%|██████████| 6531/6531 [00:01<00:00, 3389.03it/s]
100%|██████████| 33185/33185 [00:15<00:00, 2077.17it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_title'] = eval(row['history_item_title'])
 26%|██▋       | 8720/33185 [00:06<00:17, 1367.06it/s]

641


 55%|█████▍    | 18236/33185 [00:14<00:11, 1300.55it/s]

612


 73%|███████▎  | 24323/33185 [00:18<00:07, 1258.84it/s]

629


100%|██████████| 33185/33185 [00:26<00:00, 1262.27it/s]


620


100%|██████████| 33185/33185 [00:22<00:00, 1483.71it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_sid'] = eval(row['history_item_sid'])
100%|██████████| 4148/4148 [00:02<00:00, 1853.09it/s]


In [6]:
# Preview the Alpaca-style record produced by `pre_alpaca(1)` for each dataset,
# in the same order as they were constructed (training datasets, then validation).
for preview_dataset in (
    train_data1,
    train_data2,
    train_data3,
    train_data4,
    train_data5,
    val_data,
):
    print(preview_dataset.pre_alpaca(1))

{'instruction': 'Can you predict the next possible item that the user may expect?', 'input': 'The user has interacted with items <a_40><b_116><c_254>, <a_120><b_233><c_163> in chronological order. Can you predict the next possible item that the user may expect?', 'output': '<a_120><b_135><c_223>'}
{'instruction': 'Answer the question about item identification.', 'input': 'What is the title of item "<a_191><b_110><c_145>"?', 'output': 'Stanley TRA708T Sharpshooter 1/2-Inch Leg Length Staples, Steel (1000 Count)'}
{'instruction': 'Can you recommend the next item for the user based on their interaction history?', 'input': 'The user has sequentially interacted with items <a_40><b_116><c_254>, <a_120><b_233><c_163>. Can you recommend the next item for him? Tell me the title of the item', 'output': 'Elenco  Resistor Substitution Box - RS-400'}
{'instruction': "Write a response that appropriately completes the request. \nIn relation to the user's recent entertainment with a given Industrial_a

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_title'] = eval(row['history_item_title'])


# NOTE(review): these calls repeat, argument for argument, the dataset
# construction from the earlier cell (five training datasets + validation).
# Re-running them rebuilds every dataset from scratch; confirm whether the
# rebuild is intentional (e.g. to reset state before export) or a leftover copy.
train_data1 = SidSFTDataset(train_file=train_file, tokenizer=tokenizer, max_len=cutoff_len,  sample=sample, seed=seed, category=category)

train_data2 = SidItemFeatDataset(item_file=item_meta_path, index_file=sid_index_path, tokenizer=tokenizer, max_len=cutoff_len,  sample=sample, seed=seed, category=category)

train_data3 = FusionSeqRecDataset(train_file=train_file, item_file=item_meta_path, index_file=sid_index_path, tokenizer=tokenizer, max_len=cutoff_len, sample=sample, seed=seed, category=category)

train_data4 = SFTData(train_file=train_file, tokenizer=tokenizer, max_len=cutoff_len,  sample=sample, seed=seed, category=category)

train_data5 = TitleHistory2SidSFTDataset(train_file=train_file, item_file=item_meta_path, index_file=sid_index_path, tokenizer=tokenizer, max_len=cutoff_len, sample=sample, seed=seed, category=category)

val_data = SidSFTDataset(train_file=eval_file, tokenizer=tokenizer, max_len=cutoff_len,  sample=sample, seed=seed, category=category)

In [None]:
def _collect_alpaca(dataset):
    """Fetch `dataset.get_alpaca()`, print how many records it holds, and return it."""
    records = dataset.get_alpaca()
    print(len(records))
    return records


# Each dataset is converted and its size reported before moving on to the next,
# so progress bars and counts appear interleaved per dataset.
train_sid_sft = _collect_alpaca(train_data1)
train_sid_item_feat = _collect_alpaca(train_data2)
train_fusion_seq_rec = _collect_alpaca(train_data3)
train_sft = _collect_alpaca(train_data4)
train_title_history2sid_sft = _collect_alpaca(train_data5)

# 139271 training records in total in the recorded run (4 * 33185 + 6531).

val_sid_sft = _collect_alpaca(val_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_sid'] = eval(row['history_item_sid'])
100%|██████████| 33185/33185 [00:03<00:00, 9301.96it/s]


33185


100%|██████████| 6531/6531 [00:00<00:00, 1006244.70it/s]


6531


100%|██████████| 33185/33185 [00:01<00:00, 16694.77it/s]


33185


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  row['history_item_title'] = eval(row['history_item_title'])
100%|██████████| 33185/33185 [00:04<00:00, 7919.64it/s]


33185


100%|██████████| 33185/33185 [00:02<00:00, 15940.51it/s]


33185


100%|██████████| 4148/4148 [00:00<00:00, 8924.78it/s]

4148





In [11]:
# Registry of the record collections to export; each key becomes the
# output file stem (`<key>.jsonl`). Plug your own datasets in here.
DATASETS_TRAIN = dict(
    train_sid_sft=train_sid_sft,
    train_sid_item_feat=train_sid_item_feat,
    train_fusion_seq_rec=train_fusion_seq_rec,
    train_sft=train_sft,
    train_title_history2sid_sft=train_title_history2sid_sft,
)

DATASETS_EVAL = dict(
    val_sid_sft=val_sid_sft,
)

# Optional per-dataset cap on exported records, e.g. 20000; None disables it.
LIMIT_PER_DATASET = None

In [12]:
OUT_DIR = Path("./sft_data")   # <<< change this to your output directory
OUT_DIR.mkdir(parents=True, exist_ok=True)
saved_paths = {}  # dataset name -> path of the JSONL file written for it


def _write_alpaca_jsonl(name, ds, out_dir, limit=None):
    """Write the records of ``ds`` to ``<out_dir>/<name>.jsonl``, one JSON object per line.

    Args:
        name: Output file stem.
        ds: Iterable of dict records that contain at least the string fields
            "instruction", "input" and "output".
        out_dir: ``pathlib.Path`` of the target directory (must already exist).
        limit: Maximum number of records to write; ``None`` writes all of them.

    Returns:
        The path of the file that was written.

    The three text fields are stripped of surrounding whitespace in the written
    copy only; the records in ``ds`` are left untouched, so re-running the cell
    always starts from the same in-memory data.
    """
    out_path = out_dir / f"{name}.jsonl"
    with open(out_path, "w", encoding="utf-8") as f:
        for idx, ex in enumerate(ds):
            if limit is not None and idx >= limit:
                break
            # Overriding keys in place keeps the original key order and any
            # extra fields, so each line matches what a plain dump would emit.
            record = {
                **ex,
                "instruction": ex["instruction"].strip(),
                "input": ex["input"].strip(),
                "output": ex["output"].strip(),
            }
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    return out_path


# Train datasets first, then eval; both go through the same writer.
for split_datasets in (DATASETS_TRAIN, DATASETS_EVAL):
    for name, ds in split_datasets.items():
        saved_paths[name] = _write_alpaca_jsonl(
            name, ds, OUT_DIR, limit=LIMIT_PER_DATASET
        )
        # Report success only after the file has actually been written.
        print(f"[OK] {name}: -> {saved_paths[name]}")

[OK] train_sid_sft: -> sft_data/train_sid_sft.jsonl
[OK] train_sid_item_feat: -> sft_data/train_sid_item_feat.jsonl
[OK] train_fusion_seq_rec: -> sft_data/train_fusion_seq_rec.jsonl
[OK] train_sft: -> sft_data/train_sft.jsonl
[OK] train_title_history2sid_sft: -> sft_data/train_title_history2sid_sft.jsonl
[OK] val_sid_sft: -> sft_data/val_sid_sft.jsonl
