In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,7"
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForSeq2Seq,
)
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# -------------------------
# 配置
# -------------------------
MODEL_NAME = "../models/Qwen2.5-1.5B-Instruct"
DATA_PATH = "../datasets/PDTB2_level1_split/train.jsonl"
DEV_PATH = "../datasets/PDTB2_level1_split/dev.jsonl"
TEST_PATH = "../datasets/PDTB2_level1_split/test.jsonl"
OUTPUT_DIR = "../checkpoints/qwen2.5b_lora_pdtb2"
BATCH_SIZE = 1
EPOCHS = 4
LR = 3e-5
MAX_LEN = 512

# -------------------------
# 加载 tokenizer 和 base 模型
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto"  # 自动分配到多卡
)

# -------------------------
# LoRA 配置
# -------------------------
lora_config = LoraConfig(
    r=8,                        # LoRA rank（8是性价比最优）
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],  # 对注意力权重做低秩适配
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",      # 自回归语言模型
)

# 准备模型
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # 打印可训练参数比例

# -------------------------
# 数据集处理
# -------------------------
def preprocess(example):
    prompt = f"文本1: {example['arg1']}\n关系: {example['relation']}\n文本2: {example['arg2']}\n输出:"
    tokenized = tokenizer(prompt, truncation=True, max_length=MAX_LEN)
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

dataset = load_dataset("json", data_files={
    "train": DATA_PATH,
    "validation": DEV_PATH,
    "test": TEST_PATH
})
tokenized_datasets = dataset.map(preprocess, batched=False)
data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt")

# -------------------------
# 评估函数
# -------------------------
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    mask = labels != -100
    preds = preds[mask]
    labels = labels[mask]
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    return {"accuracy": acc, "f1": f1}

# -------------------------
# 训练参数
# -------------------------
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    bf16=True,                      # 使用 bf16 精度
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    save_total_limit=3,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    report_to="none",
)

# -------------------------
# Trainer
# -------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# -------------------------
# 开始训练
# -------------------------
trainer.train()

# -------------------------
# 使用最佳模型在测试集上评估
# -------------------------
print("\nEvaluating best LoRA model on test set...")
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print(test_metrics)

# 保存 LoRA adapter
model.save_pretrained(os.path.join(OUTPUT_DIR, "best_lora_adapter"))


In [2]:
# -------------------------
# 开始训练
# -------------------------
trainer.train()
# -------------------------
# 使用最佳模型在测试集上评估
# -------------------------
print("\nEvaluating best model on test set...")
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print(test_metrics)

# 保存最佳模型
trainer.save_model(os.path.join(OUTPUT_DIR, "best_model"))

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1!

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score, classification_report
from tqdm import tqdm

# -------------------------
# 配置
# -------------------------
MODEL_CHECKPOINT = "../checkpoints/qwen2.5b_sft_pdtb2/checkpoint-4115"  # checkpoint 路径
DATA_PATH = "../datasets/PDTB2_level1_split/test.jsonl"
MAX_NEW_TOKENS = 20  # 生成预测文本长度
MAX_LEN = 512

# -------------------------
# 加载模型和 tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_CHECKPOINT,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
).to("cuda").eval()

# -------------------------
# 加载测试集
# -------------------------
dataset = load_dataset("json", data_files={"test": DATA_PATH})["test"]

# -------------------------
# 推理函数
# -------------------------
def predict_relation(arg1, arg2):
    prompt = f"文本1: {arg1}\n文本2: {arg2}\n请判断两句话之间的语篇关系类别（Expansion, Contingency, Comparison, Temporal）:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_LEN).to("cuda")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # 只提取类别关键词
    for label in ["Expansion", "Contingency", "Comparison", "Temporal"]:
        if label.lower() in prediction.lower():
            return label
    return "Unknown"

# -------------------------
# 计算预测
# -------------------------
y_true, y_pred = [], []
for sample in tqdm(dataset, desc="Evaluating"):
    pred = predict_relation(sample["arg1"], sample["arg2"])
    y_pred.append(pred)
    y_true.append(sample["relation"])

# -------------------------
# 计算指标
# -------------------------
# 过滤 Unknown（可选）
valid_idx = [i for i, p in enumerate(y_pred) if p != "Unknown"]
y_true = [y_true[i] for i in valid_idx]
y_pred = [y_pred[i] for i in valid_idx]

acc = accuracy_score(y_true, y_pred)
f1_macro = f1_score(y_true, y_pred, average="macro")

print(f"Accuracy: {acc:.4f}")
print(f"Macro-F1: {f1_macro:.4f}")
print("\n详细分类报告:")
print(classification_report(y_true, y_pred, digits=4))


Generating test split: 0 examples [00:00, ? examples/s]

Evaluating: 100%|██████████| 20/20 [00:11<00:00,  1.73it/s]

Accuracy: 0.5000
Macro-F1: 0.1667

详细分类报告:
              precision    recall  f1-score   support

  Comparison     0.0000    0.0000    0.0000         2
 Contingency     0.0000    0.0000    0.0000         5
   Expansion     0.5000    1.0000    0.6667        10
    Temporal     0.0000    0.0000    0.0000         3

    accuracy                         0.5000        20
   macro avg     0.1250    0.2500    0.1667        20
weighted avg     0.2500    0.5000    0.3333        20




  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
