## 下载数据

In [None]:
import subprocess
import os

# Source /etc/network_turbo in a bash subprocess and copy every environment
# line containing "proxy" into this process's environment (presumably so the
# Hugging Face downloads below go through the host's proxy -- confirm).
# bash is invoked directly with an argument list; wrapping `bash -c "..."`
# in shell=True would only add a redundant /bin/sh layer and a second level
# of quoting.
try:
    result = subprocess.run(
        ["bash", "-c", "source /etc/network_turbo && env | grep proxy"],
        capture_output=True,
        text=True,
    )
except FileNotFoundError:
    # No bash on PATH: keep this step best-effort and change nothing.
    output = ""
else:
    # If the script is missing or exports no proxy variables, stdout is empty.
    output = result.stdout

for line in output.splitlines():
    # Split on the first '=' only, so values that contain '=' stay intact.
    var, sep, value = line.partition('=')
    if sep:
        os.environ[var] = value

from huggingface_hub import notebook_login

# Prompt for Hugging Face credentials inside the notebook.
notebook_login()

In [None]:
from datasets import load_dataset

# Reasoning data: the "cot" split, cached under ./data/reason.
reasoning_dataset = load_dataset(
    "unsloth/OpenMathReasoning-mini",
    split="cot",
    cache_dir='./data/reason',
)

# Non-reasoning data: the "train" split, cached under ./data/no_reason.
non_reasoning_dataset = load_dataset(
    "mlabonne/FineTome-100k",
    split="train",
    cache_dir='./data/no_reason',
)

## 处理成LLaMA-Factory需要的格式

In [None]:
import random
from datasets import Dataset
import json

# Process the reasoning dataset (rows that carry a reasoning chain).
def process_reason_data(ds, num_samples, start_idx=0):
    """Convert a slice of a reasoning dataset into instruction records.

    Args:
        ds: Dataset-like object exposing ``select(indices)``; iterating the
            selection must yield mappings with the keys ``"input"``,
            ``"chain_of_thought"`` and ``"output"``.
        num_samples: Number of consecutive rows to convert.
        start_idx: Index of the first row to convert.

    Returns:
        A list of dicts with the keys ``"instruction"`` (the row's
        ``"input"``), ``"input"`` (always the empty string) and ``"output"``
        (the reasoning chain wrapped in ``<think>...</think>``, a newline,
        then the row's ``"output"``).

    NOTE(review): the column names read here are assumptions about the
    dataset schema -- confirm them against the dataset actually passed in.
    """
    samples = ds.select(range(start_idx, start_idx + num_samples))
    processed = []
    for item in samples:
        cot = item["chain_of_thought"]
        # A chain stored as one plain string is used as-is: str.join over a
        # string would insert a newline between every single character.
        if not isinstance(cot, str):
            cot = "\n".join(cot)
        # Wrap the chain in <think> tags and append the final answer.
        output = f"<think>{cot}</think>\n{item['output']}"
        processed.append({
            "instruction": item["input"],
            "input": "",
            "output": output
        })
    return processed

# Process the non-reasoning dataset.
def process_no_reason_data(ds, num_samples, start_idx=0):
    """Convert a slice of a non-reasoning dataset into instruction records.

    Args:
        ds: Dataset-like object exposing ``select(indices)``; iterating the
            selection must yield mappings with ``"question"`` and
            ``"answer"`` keys.
        num_samples: Number of consecutive rows to convert.
        start_idx: Index of the first row to convert.

    Returns:
        A list of dicts with the keys ``"instruction"`` (the question with
        the literal ``<think>`` appended), ``"input"`` (always the empty
        string) and ``"output"`` (the answer).

    NOTE(review): the column names read here are assumptions about the
    dataset schema -- confirm them against the dataset actually passed in.
    """
    stop_idx = start_idx + num_samples
    return [
        {
            "instruction": row["question"] + '<think>',
            "input": "",
            "output": row["answer"],
        }
        for row in ds.select(range(start_idx, stop_idx))
    ]

# Bind the short names used below to the datasets loaded earlier in this
# notebook; without these bindings the calls below fail with NameError.
ds_reason = reasoning_dataset
ds_no_reason = non_reasoning_dataset

# Reasoning training set (first 5000 rows).
train_reason = process_reason_data(ds_reason, 5000)
reason_train_dataset = Dataset.from_list(train_reason)

# Non-reasoning training set (first 1000 rows).
train_no_reason = process_no_reason_data(ds_no_reason, 1000)
no_reason_train_dataset = Dataset.from_list(train_no_reason)

# Test set, taken from rows that do not overlap the training slices.
test_reason = process_reason_data(ds_reason, 30, 5000)  # rows 5000-5029
test_no_reason = process_no_reason_data(ds_no_reason, 20, 1000)  # rows 1000-1019
test_data = test_reason + test_no_reason
random.shuffle(test_data)  # mix reasoning and non-reasoning samples
test_dataset = Dataset.from_list(test_data)

# Save as three separate JSON files.
def save_to_json(dataset, filename):
    """Write ``dataset.to_list()`` to ``filename`` as indented UTF-8 JSON.

    Args:
        dataset: Object exposing ``to_list()`` that returns JSON-serializable
            data.
        filename: Path of the file to create or overwrite.

    The JSON uses a 2-space indent and keeps non-ASCII characters unescaped.
    """
    with open(filename, 'w', encoding='utf-8') as out_file:
        records = dataset.to_list()
        out_file.write(json.dumps(records, indent=2, ensure_ascii=False))

# Write each split to its own JSON file.
for split_dataset, json_name in (
    (reason_train_dataset, "reason_train.json"),
    (no_reason_train_dataset, "no_reason_train.json"),
    (test_dataset, "mixed_test.json"),
):
    save_to_json(split_dataset, json_name)

# Report what was written and how many records each file holds.
print("文件保存完成！")
print(f"推理训练集: reason_train.json ({len(reason_train_dataset)}条)")
print(f"非推理训练集: no_reason_train.json ({len(no_reason_train_dataset)}条)")
print(f"混合测试集: mixed_test.json ({len(test_dataset)}条)")

## 登记数据集

在dataset_info.json中添加

"reason_train": {
  "file_name": "reason_train.json"
},
"no_reason_train": {
  "file_name": "no_reason_train.json"
},
"mixed_test": {
  "file_name": "mixed_test.json"
},

## 打开llama factory判断是否可以识别到数据

{   
    "predict_bleu-4": 59.40273148148148,
    "predict_model_preparation_time": 0.0063,
    "predict_rouge-1": 63.98625370370372,
    "predict_rouge-2": 45.503085185185185,
    "predict_rouge-l": 50.883192592592586,
    "predict_runtime": 274.848,
    "predict_samples_per_second": 0.182,
    "predict_steps_per_second": 0.033
}

{
    "predict_bleu-4": 32.704962962962966,
    "predict_model_preparation_time": 0.0066,
    "predict_rouge-1": 41.14385000000001,
    "predict_rouge-2": 17.662503703703706,
    "predict_rouge-l": 21.229255555555554,
    "predict_runtime": 304.883,
    "predict_samples_per_second": 0.164,
    "predict_steps_per_second": 0.03
}