In [None]:
# 导入必要的库
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer
from peft import LoraConfig, TaskType, get_peft_model

In [None]:
# Read the QA records from JSON and wrap them in a Hugging Face Dataset
df = pd.read_json('./metro_qa_data.json')
ds = Dataset.from_pandas(df)

# Load the tokenizer from the local checkpoint directory.
# The add_bos_token/add_eos_token flags are switched off and '<eod>' is declared as the EOS token.
path = './IEITYuan/Yuan2-2B-Mars-hf'
tokenizer = AutoTokenizer.from_pretrained(
    path,
    add_bos_token=False,
    add_eos_token=False,
    eos_token='<eod>',
)

# Register the extra special tokens (order is kept exactly as-is, since ids are assigned in order)
tokenizer.add_tokens(
    [
        '<sep>', '<pad>', '<mask>', '<predict>',
        '<FIM_SUFFIX>', '<FIM_PREFIX>', '<FIM_MIDDLE>',
        '<commit_before>', '<commit_msg>', '<commit_after>',
        '<jupyter_start>', '<jupyter_text>', '<jupyter_code>', '<jupyter_output>',
        '<empty_output>',
    ],
    special_tokens=True,
)

# Reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token
# Data preprocessing function
def process_func(example, max_length=384):
    """Tokenize one QA record into causal-LM training features.

    The prompt is ``"{question}<sep>"`` and the target is ``"{answer}<eod>"``.
    Both are tokenized separately with the module-level ``tokenizer`` and then
    concatenated. Prompt positions are labelled ``-100`` so that the question
    part is excluded from the loss.

    Args:
        example: Mapping with ``"question"`` and ``"answer"`` entries.
        max_length: Maximum number of tokens kept per sample (default 384).
            Longer sequences are cut from the right; note that this can drop
            the trailing ``<eod>`` token and, for a very long question, the
            whole answer (leaving only ``-100`` labels).

    Returns:
        dict with three equally long lists: ``input_ids``, ``attention_mask``
        (all ones) and ``labels``.

    Raises:
        ValueError: If ``max_length`` is not a positive integer.
    """
    if max_length < 1:
        # A zero/negative bound would silently slice from the wrong end.
        raise ValueError(f"max_length must be positive, got {max_length}")

    question_ids = tokenizer(f"{example['question']}<sep>")["input_ids"]
    answer_ids = tokenizer(f"{example['answer']}<eod>")["input_ids"]

    # Slicing is a no-op for sequences that already fit, so no length check is needed.
    input_ids = (question_ids + answer_ids)[:max_length]
    labels = ([-100] * len(question_ids) + answer_ids)[:max_length]

    return {
        "input_ids": input_ids,
        "attention_mask": [1] * len(input_ids),
        "labels": labels,
    }

# Tokenize every record; the original columns are removed so only the features returned by process_func remain
tokenized_id = ds.map(
    process_func,
    remove_columns=ds.column_names,
)

# Sanity check on the first sample: decode the full sequence, then only the label positions that are not masked with -100
print(tokenizer.decode(tokenized_id[0]["input_ids"]))
print(tokenizer.decode([tok for tok in tokenized_id[0]["labels"] if tok != -100]))

In [None]:
# Load the base causal-LM checkpoint in bfloat16 and prepare it for LoRA
model = AutoModelForCausalLM.from_pretrained(
    path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# NOTE(review): presumably needed because gradient checkpointing is enabled in the training arguments below - confirm
model.enable_input_require_grads()

# LoRA hyper-parameters: adapters are attached to the listed projection modules
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

# Wrap the base model as a PeftModel and report how many parameters are trainable
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Training hyper-parameters
args = TrainingArguments(
    output_dir="./output/Yuan2.0-2B_lora_bf16",
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=12,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    bf16=True,
    logging_steps=1,
    save_strategy="epoch",
    save_on_each_node=True,
)

# Build the Trainer; the collator pads each batch dynamically
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_id,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
    ),
)

# Run fine-tuning
trainer.train()

In [None]:
# Inference helper
def generate(question, max_length=256):
    """Greedy-decode an answer for ``question`` and print it.

    The prompt ``"{question}<sep>"`` is tokenized with the module-level
    ``tokenizer`` and passed to the module-level ``model``. The decoded text
    after the last ``"<sep>"`` is printed; nothing is returned.

    Args:
        question: The user question, as plain text.
        max_length: Value forwarded to ``model.generate(max_length=...)``
            (default 256).
    """
    prompt = f"{question}<sep>"
    # Follow the model's own device instead of assuming a CUDA device exists.
    input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)

    # The model may still be in training mode (e.g. right after trainer.train()),
    # in which case dropout would stay active during decoding. Switch to eval
    # mode for the call and restore the previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(input_ids, do_sample=False, max_length=max_length)
    finally:
        if was_training:
            model.train()

    output = tokenizer.decode(outputs[0])
    print(output.split("<sep>")[-1])

# Smoke-test generation with a sample question
test_question = "如何查询地铁票价？"
generate(question=test_question)
