In [1]:
#加速
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
# 缓存数据盘
os.environ["TRANSFORMERS_CACHE"] = "/root/autodl-tmp/hf_cache"
os.environ["HF_HOME"] = "/root/autodl-tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/root/autodl-tmp/hf_datasets"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"
os.environ["TRANSFORMERS_NO_MLX"] = "1"

In [None]:
# train_cv.py(K-FOLD)
import json
import numpy as np
import torch
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import shutil
from sklearn.model_selection import StratifiedKFold
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification,
)
import evaluate


#配置

@dataclass
class Config:
    model_name: str = "ku-nlp/deberta-v3-base-japanese"
    data_files: list = None  
    label2id: dict = None
    n_splits: int = 5
    random_state: int = 42
    batch_size: int = 16
    gradient_accumulation_steps: int = 8
    num_train_epochs: int = 30
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    dropout_prob: float = 0.4
    class_weights: list = None  # [1.0, 3.0] for ["O", "MET"]
    eval_steps: int = 500
    early_stopping_patience: int = 20
    fp16: bool = True
    output_dir_base: str = "./autodl-tmp/tlv_sqe"
    save_total_limit: int = 1 

    def __post_init__(self):
        if self.data_files is None:
            self.data_files = [
                ("data_ketsugou.json", "metaphor"),
                ("data_shihyou.json", "simile"),
                ("data_no.json", "non_metaphor"),
            ]
        if self.label2id is None:
            self.label2id = {"O": 0, "MET": 1}
        if self.class_weights is None:
            self.class_weights = [1.0, 3.0]


#加权损失 Trainer

class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32) if class_weights is not None else None

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        device = outputs.logits.device
        if self.class_weights is not None:
            weights = self.class_weights.to(device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=-100)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        num_labels = outputs.logits.size(-1)
        loss = loss_fct(outputs.logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


# 清理checkpoint

def cleanup_checkpoints(output_dir):
    output_path = Path(output_dir)
    for ckpt_dir in output_path.glob("checkpoint-*"):
        print(f"🗑️ 删除中间 checkpoint: {ckpt_dir}")
        shutil.rmtree(ckpt_dir)


# 加载数据
def load_data_with_tag(filename, tag):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    for d in data:
        d["source_tag"] = tag
    return data


# Tokenize + 对齐标签

def tokenize_and_align_labels(examples, tokenizer, label2id):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True,
        max_length=256, 
    )
    all_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        label_ids = []
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            else:
                label_ids.append(label2id[labels[wid]])
        all_labels.append(label_ids)
    tokenized["labels"] = all_labels
    return tokenized


# 评价

def build_compute_metrics(id2label):
    metric = evaluate.load("seqeval")
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=-1)
        true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
        true_preds = [
            [id2label[p] for (p, l) in zip(pred, label) if l != -100]
            for pred, label in zip(predictions, labels)
        ]
        res = metric.compute(predictions=true_preds, references=true_labels)
        return {
            "precision": res["overall_precision"],
            "recall": res["overall_recall"],
            "f1": res["overall_f1"],
            "accuracy": res["overall_accuracy"]
        }
    return compute_metrics


# 种子

def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False



# ------------main----------------
def main():
    config = Config()
    set_seed(config.random_state)

    print("🚀 CUDA Available:", torch.cuda.is_available())
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # ---------- 1. 加载数据 ----------
    print("📥 读取语料...")
    all_data = []
    for filename, tag in config.data_files:
        data = load_data_with_tag(filename, tag)
        all_data.extend(data)
    labels_for_split = [d["source_tag"] for d in all_data]
    print(f"📊 总样本数: {len(all_data)}")

    # ---------- 2. 初始化分词器 & 标签映射 ----------
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    label2id = config.label2id
    id2label = {v: k for k, v in label2id.items()}

    # ---------- 3. 构建 tokenize 函数 ----------
    def tokenize_fn(examples):
        return tokenize_and_align_labels(examples, tokenizer, label2id)

    # ---------- 4. 构建评价函数 ----------
    compute_metrics_fn = build_compute_metrics(id2label)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # ---------- 5. K 折交叉验证 ----------
    kf = StratifiedKFold(n_splits=config.n_splits, shuffle=True, random_state=config.random_state)
    all_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(all_data, labels_for_split)):
        print(f"\n{'='*20} Fold {fold+1} {'='*20}")

        # 创建带时间戳的输出目录
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        output_dir = f"{config.output_dir_base}{fold+1}_{timestamp}"

        train_data = [all_data[i] for i in train_idx]
        val_data = [all_data[i] for i in val_idx]

        train_ds = Dataset.from_list(train_data).map(tokenize_fn, batched=True, batch_size=config.batch_size)
        val_ds = Dataset.from_list(val_data).map(tokenize_fn, batched=True, batch_size=config.batch_size)

        model = AutoModelForTokenClassification.from_pretrained(
            config.model_name,
            num_labels=len(label2id),
            id2label=id2label,
            label2id=label2id,
            hidden_dropout_prob=config.dropout_prob,
        )
        # ❗ 不要手动 .to(device) — Trainer 会自动处理

        training_args = TrainingArguments(
            output_dir=output_dir,
            learning_rate=config.learning_rate,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=config.batch_size // 2,
            gradient_accumulation_steps=config.gradient_accumulation_steps,
            num_train_epochs=config.num_train_epochs,
            weight_decay=config.weight_decay,
            eval_strategy="steps",
            eval_steps=config.eval_steps,
            save_strategy="steps",
            save_steps=config.eval_steps,
            logging_steps=50,
            load_best_model_at_end=True,
            metric_for_best_model="f1",
            greater_is_better=True,
            fp16=config.fp16 and torch.cuda.is_available(),
            dataloader_num_workers=2,
            report_to="none",
            save_total_limit=config.save_total_limit,  # 只保留 best
            seed=config.random_state,
        )

        trainer = WeightedTrainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            tokenizer=tokenizer,
            data_collator=data_collator,
            compute_metrics=compute_metrics_fn,
            class_weights=config.class_weights,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=config.early_stopping_patience)],
        )

        print("⏳ 开始训练...")
        trainer.train()

        # 🧹 清理中间 checkpoint，只保留 best
        cleanup_checkpoints(output_dir)

        # 💾 保存最终模型和分词器
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)

        # 📈 评估
        eval_res = trainer.evaluate()
        print("✅ 评估结果:", eval_res)
        all_results.append(eval_res)

        # 📂 保存本折结果
        with open(f"{output_dir}/eval_results.json", "w", encoding="utf-8") as f:
            json.dump(eval_res, f, indent=2, ensure_ascii=False)

    # ---------- 6. 计算平均结果 ----------
    avg_precision = np.mean([r["eval_precision"] for r in all_results])
    avg_recall = np.mean([r["eval_recall"] for r in all_results])
    avg_f1 = np.mean([r["eval_f1"] for r in all_results])

    final_results = {
        "avg_precision": float(avg_precision),
        "avg_recall": float(avg_recall),
        "avg_f1": float(avg_f1),
        "fold_results": all_results
    }

    print("\n" + "="*40)
    print("🏆 最终平均结果")
    print("="*40)
    print(f"Precision: {avg_precision:.4f}")
    print(f"Recall:    {avg_recall:.4f}")
    print(f"F1:        {avg_f1:.4f}")

    # 💾 保存最终结果
    with open("final_results.json", "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=2, ensure_ascii=False)

    print("\n💾 结果已保存至: final_results.json")

if __name__ == "__main__":
    main()

In [3]:
#无k-fold
# train_single.py
import json
import numpy as np
import torch
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split  # 👈 改用简单划分
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorForTokenClassification,
)
import evaluate

# ----------------------------
# ⚙️ 配置类
# ----------------------------
@dataclass
class Config:
    model_name: str = "ku-nlp/deberta-v3-base-japanese"
    data_files: list = None
    label2id: dict = None
    random_state: int = 42
    train_ratio: float = 0.8  # 训练集比例
    batch_size: int = 16
    gradient_accumulation_steps: int = 8
    num_train_epochs: int = 30  #
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    dropout_prob: float = 0.4
    class_weights: list = None
    eval_steps: int = 500
    early_stopping_patience: int = 20
    fp16: bool = True
    output_dir: str =  "./autodl-tmp/tlv_sqe/result_single"
    save_total_limit: int = 1

    def __post_init__(self):
        if self.data_files is None:
            self.data_files = [
                ("data_ketsugou.json", "metaphor"),
                ("data_shihyou.json", "simile"),
                ("data_no.json", "non_metaphor"),
            ]
        if self.label2id is None:
            self.label2id = {"O": 0, "MET": 1}
        if self.class_weights is None:
            self.class_weights = [1.0, 30.0]

# ----------------------------
# 🧠 加权损失 Trainer
# ----------------------------
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32) if class_weights is not None else None

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        device = outputs.logits.device
        if self.class_weights is not None:
            weights = self.class_weights.to(device)
            loss_fct = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=-100)
        else:
            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
        num_labels = outputs.logits.size(-1)
        loss = loss_fct(outputs.logits.view(-1, num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# ----------------------------
# 🧹 清理中间 checkpoint
# ----------------------------
def cleanup_checkpoints(output_dir):
    output_path = Path(output_dir)
    for ckpt_dir in output_path.glob("checkpoint-*"):
        print(f"🗑️ 删除中间 checkpoint: {ckpt_dir}")
        shutil.rmtree(ckpt_dir)

# ----------------------------
# 📥 加载数据函数
# ----------------------------
def load_data_with_tag(filename, tag):
    with open(filename, "r", encoding="utf-8") as f:
        data = json.load(f)
    for d in data:
        d["source_tag"] = tag
    return data

# ----------------------------
# 🧩 Tokenize + 对齐标签
# ----------------------------
def tokenize_and_align_labels(examples, tokenizer, label2id):
    tokenized = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True,
        max_length=512,
    )
    all_labels = []
    for i, labels in enumerate(examples["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        label_ids = []
        for wid in word_ids:
            if wid is None:
                label_ids.append(-100)
            else:
                label_ids.append(label2id[labels[wid]])
        all_labels.append(label_ids)
    tokenized["labels"] = all_labels
    return tokenized

# ----------------------------
# 📊 评价指标
# ----------------------------
def build_compute_metrics(id2label):
    metric = evaluate.load("seqeval")
    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=-1)
        true_labels = [[id2label[l] for l in label if l != -100] for label in labels]
        true_preds = [
            [id2label[p] for (p, l) in zip(pred, label) if l != -100]
            for pred, label in zip(predictions, labels)
        ]
        res = metric.compute(predictions=true_preds, references=true_labels)
        return {
            "precision": res["overall_precision"],
            "recall": res["overall_recall"],
            "f1": res["overall_f1"],
            "accuracy": res["overall_accuracy"]
        }
    return compute_metrics

# ----------------------------
# 🌱 设置随机种子
# ----------------------------
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# ----------------------------
# 🚀 主程序（单次训练）
# ----------------------------
def main():
    config = Config()
    set_seed(config.random_state)

    print("🚀 CUDA Available:", torch.cuda.is_available())

    # ---------- 1. 加载数据 ----------
    print("📥 读取语料...")
    all_data = []
    for filename, tag in config.data_files:
        data = load_data_with_tag(filename, tag)
        all_data.extend(data)
    print(f"📊 总样本数: {len(all_data)}")

    # ---------- 2. 划分训练集/验证集 ----------
    train_data, val_data = train_test_split(
        all_data,
        train_size=config.train_ratio,
        random_state=config.random_state,
        shuffle=True,
        stratify=[d["source_tag"] for d in all_data]  # 分层抽样，保持比例
    )
    print(f"🧮 训练集: {len(train_data)} | 验证集: {len(val_data)}")

    # ---------- 3. 初始化分词器 & 标签映射 ----------
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    label2id = config.label2id
    id2label = {v: k for k, v in label2id.items()}

    # ---------- 4. 构建 tokenize 函数 ----------
    def tokenize_fn(examples):
        return tokenize_and_align_labels(examples, tokenizer, label2id)

    # ---------- 5. 构建评价函数 ----------
    compute_metrics_fn = build_compute_metrics(id2label)
    data_collator = DataCollatorForTokenClassification(tokenizer)

    # 创建带时间戳的输出目录
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = f"{config.output_dir}_{timestamp}"

    train_ds = Dataset.from_list(train_data).map(tokenize_fn, batched=True, batch_size=config.batch_size)
    val_ds = Dataset.from_list(val_data).map(tokenize_fn, batched=True, batch_size=config.batch_size)

    model = AutoModelForTokenClassification.from_pretrained(
        config.model_name,
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
        hidden_dropout_prob=config.dropout_prob,
    )

    training_args = TrainingArguments(
        output_dir=output_dir,
        learning_rate=config.learning_rate,
        per_device_train_batch_size=config.batch_size,
        per_device_eval_batch_size=config.batch_size // 2,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        num_train_epochs=config.num_train_epochs,
        weight_decay=config.weight_decay,
        eval_strategy="steps",
        eval_steps=config.eval_steps,
        save_strategy="steps",
        save_steps=config.eval_steps,
        logging_steps=50,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=config.fp16 and torch.cuda.is_available(),
        dataloader_num_workers=2,
        report_to="none",
        save_total_limit=config.save_total_limit,
        seed=config.random_state,
    )

    trainer = WeightedTrainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics_fn,
        class_weights=config.class_weights,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=config.early_stopping_patience)],
    )

    print("⏳ 开始训练...")
    trainer.train()

    # 🧹 清理中间 checkpoint
    cleanup_checkpoints(output_dir)

    # 💾 保存最终模型
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

    # 📈 最终评估
    eval_res = trainer.evaluate()
    print("\n✅ 最终评估结果:")
    for k, v in eval_res.items():
        print(f"  {k}: {v:.4f}")

    # 📂 保存结果
    with open(f"{output_dir}/eval_results.json", "w", encoding="utf-8") as f:
        json.dump(eval_res, f, indent=2, ensure_ascii=False)

    print(f"\n🎉 模型与结果已保存至: {output_dir}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm


🚀 CUDA Available: True
📥 读取语料...
📊 总样本数: 9185
🧮 训练集: 7348 | 验证集: 1837


Map: 100%|██████████| 7348/7348 [00:01<00:00, 4523.12 examples/s]
Map: 100%|██████████| 1837/1837 [00:00<00:00, 2670.22 examples/s]
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at ku-nlp/deberta-v3-base-japanese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 96868, 'bos_token_id': 96871, 'pad_token_id': 96869}.


⏳ 开始训练...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
500,0.0602,0.099482,0.799458,0.917574,0.854453,0.967379
1000,0.0335,0.176725,0.869056,0.923795,0.89559,0.976272
1500,0.0233,0.149596,0.863112,0.931571,0.896036,0.976078


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

🗑️ 删除中间 checkpoint: autodl-tmp/tlv_sqe/result_single_20250917_210317/checkpoint-1500


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)





✅ 最终评估结果:
  eval_loss: 0.1496
  eval_precision: 0.8631
  eval_recall: 0.9316
  eval_f1: 0.8960
  eval_accuracy: 0.9761
  eval_runtime: 5.4234
  eval_samples_per_second: 338.7150
  eval_steps_per_second: 42.4090
  epoch: 30.0000

🎉 模型与结果已保存至: ./autodl-tmp/tlv_sqe/result_single_20250917_210317


In [33]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# ======================
# 0. 设置设备
# ======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ======================
# 1. 加载模型和 tokenizer
# ======================
model_name_or_path = "./autodl-tmp/tlv_sqe/result_single_20250917_210317"  # 训练好的模型路径
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
model.to(device)
model.eval()

# label 映射
id2label = {0: "O", 1: "MET"}

# ======================
# 2. 预测函数
# ======================
def predict_raw_sentence(sentence: str):
    """
    输入原始句子（str），返回 List[(token, label)]
    """
    # 使用 tokenizer 自动分词
    encoding = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        is_split_into_words=False  
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)[0].cpu().tolist()

    # 对齐 subword token
    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    result = list(zip(tokens, [id2label[p] for p in predictions]))

    return result


# 测试

sentence = "何かが燃える"
result = predict_raw_sentence(sentence)

print("预测结果：")
for token, label in result:
    print(f"{token}\t{label}")


Using device: cuda


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


预测结果：
[CLS]	O
何	O
かが	O
燃え	MET
る	MET
[SEP]	MET


In [1]:
import torch
import csv
from transformers import AutoTokenizer, AutoModelForTokenClassification

# ======================
# 0. 设置设备
# ======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ======================
# 1. 加载模型和 tokenizer
# ======================
model_name_or_path = "./autodl-tmp/tlv_sqe/result_single_20250917_210317"  # 训练好的模型路径
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForTokenClassification.from_pretrained(model_name_or_path)
model.to(device)
model.eval()

# label 映射
id2label = {0: "O", 1: "MET"}

# ======================
# 2. 预测函数（返回 token + label 列表）
# ======================
def predict_raw_sentence(sentence: str):
    """
    输入原始句子（str），返回 List[(token, label)]
    """
    encoding = tokenizer(
        sentence,
        return_tensors="pt",
        truncation=True,
        padding=True,
        is_split_into_words=False
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)[0].cpu().tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
    result = list(zip(tokens, [id2label.get(p, "O") for p in predictions]))
    return result

# ======================
# 3. 批量处理函数
# ======================
def process_txt_file(input_txt_path: str, output_csv_path: str):
    """
    读取 txt 文件（每行一句），预测每句中的 MET token，
    并将结果写入 CSV 文件。
    """
    with open(input_txt_path, 'r', encoding='utf-8') as f_in, \
         open(output_csv_path, 'w', newline='', encoding='utf-8') as f_out:

        writer = csv.writer(f_out)
        # 写入表头
        writer.writerow(["sentence", "token", "position_in_sentence"])

        line_num = 0
        for line in f_in:
            line_num += 1
            sentence = line.strip()
            if not sentence:
                continue  # 跳过空行

            try:
                predictions = predict_raw_sentence(sentence)
            except Exception as e:
                print(f"Error processing line {line_num}: {sentence} | {e}")
                continue

            # 遍历预测结果，找出 MET 标签的 token
            for idx, (token, label) in enumerate(predictions):
                if label == "MET":
                    # 过滤掉特殊 token 如 [CLS], [SEP], [PAD] 等
                    if token in tokenizer.all_special_tokens:
                        continue
                    # 可选：还原 ## 开头的 subword（如 "##ing"）
                    if token.startswith("##") and idx > 0:
                        # 可以选择合并，但为了简单，这里只记录原始 token
                        pass
                    writer.writerow([sentence, token, idx])

            # 可选：打印进度
            if line_num % 100 == 0:
                print(f"Processed {line_num} sentences...")

    print(f"✅ Processing complete. Results saved to {output_csv_path}")


# ======================
# 4. 主程序入口
# ======================
if __name__ == "__main__":
    INPUT_TXT = "abe_speech_5000.txt"   # 替换为你的输入文件路径
    OUTPUT_CSV = "met_tokens_output.csv"

    process_txt_file(INPUT_TXT, OUTPUT_CSV)

Using device: cuda


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Processed 100 sentences...
Processed 200 sentences...
Processed 300 sentences...
Processed 400 sentences...
Processed 500 sentences...
Processed 600 sentences...
Processed 700 sentences...
Processed 800 sentences...
Processed 900 sentences...
Processed 1000 sentences...
Processed 1100 sentences...
Processed 1200 sentences...
Processed 1300 sentences...
Processed 1400 sentences...
Processed 1500 sentences...
Processed 1600 sentences...
Processed 1700 sentences...
Processed 1800 sentences...
Processed 1900 sentences...
Processed 2000 sentences...
Processed 2100 sentences...
Processed 2200 sentences...
Processed 2300 sentences...
Processed 2400 sentences...
Processed 2500 sentences...
Processed 2600 sentences...
Processed 2700 sentences...
Processed 2800 sentences...
Processed 2900 sentences...
Processed 3000 sentences...
Processed 3100 sentences...
Processed 3200 sentences...
Processed 3300 sentences...
Processed 3400 sentences...
Processed 3500 sentences...
Processed 3600 sentences...
P