In [2]:
#学术加速
import subprocess
import os

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [3]:
#设置 Hugging Face 缓存到数据盘
os.environ["TRANSFORMERS_CACHE"] = "/root/autodl-tmp/hf_cache"
os.environ["HF_HOME"] = "/root/autodl-tmp/hf_home"
os.environ["HF_DATASETS_CACHE"] = "/root/autodl-tmp/hf_datasets"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

In [5]:
# ====================
# 1. 数据准备（CSV 版 + 分层划分）

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
import evaluate
import numpy as np
df = pd.read_csv("data_slv_9532.csv")   # ← 改成你的 CSV 文件名

# ① 先按原始三标签 0/1/2 分层划分
train_df, val_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)

# ② 把 1/2 合并成 1，0 保持 0
def remap_type(x):
    return 0 if x == 0 else 1

train_df["label"] = train_df["label"].apply(remap_type)
val_df["label"]   = val_df["label"].apply(remap_type)

# ③ 只要两列
train_df = train_df[["sentence", "label"]]
val_df   = val_df[["sentence", "label"]]

# ====================
# 2. 以下完全沿用你之前能跑的 tokenize / 训练逻辑
# ====================


model_name = "ku-nlp/deberta-v3-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(batch["sentence"], truncation=True, padding="max_length", max_length=128)

train_dataset = Dataset.from_pandas(train_df).map(tokenize, batched=True)
val_dataset   = Dataset.from_pandas(val_df).map(tokenize, batched=True)

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format(type="torch",   columns=["input_ids", "attention_mask", "label"])

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy.compute(predictions=preds, references=labels)
    f1_score = f1.compute(predictions=preds, references=labels, average="macro")
    return {"accuracy": acc["accuracy"], "f1": f1_score["f1"]}

training_args = TrainingArguments(
    output_dir="autodl-tmp/slv_md",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=50,
    save_strategy="steps",
    save_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Map: 100%|██████████| 7625/7625 [00:00<00:00, 12319.22 examples/s]
Map: 100%|██████████| 1907/1907 [00:00<00:00, 12344.22 examples/s]
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at ku-nlp/deberta-v3-base-japanese and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 96868, 'bos_token_id': 96871, 'pad_token_id': 96869}.


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.0811,0.073618,0.982695,0.979044
200,0.0973,0.071021,0.976403,0.971919
300,0.0518,0.04078,0.993183,0.9917
400,0.0918,0.093354,0.979549,0.975516
500,0.0096,0.044154,0.991085,0.989158
600,0.0232,0.034503,0.994232,0.99297
700,0.0216,0.030731,0.994232,0.993007
800,0.0063,0.061435,0.99161,0.989769
900,0.0163,0.031216,0.993183,0.991745
1000,0.0138,0.032255,0.992134,0.990475


TrainOutput(global_step=1431, training_loss=0.04280948741224827, metrics={'train_runtime': 157.7568, 'train_samples_per_second': 145.002, 'train_steps_per_second': 9.071, 'total_flos': 1504693332288000.0, 'train_loss': 0.04280948741224827, 'epoch': 3.0})

In [10]:
#单句测试
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model_dir = "autodl-tmp/slv_md/checkpoint-1431"  
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

sentence = ""
inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512, padding=True)

with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred = torch.argmax(probs, dim=-1).item()

print("预测结果:", "隐喻" if pred == 1 else "非隐喻")
print("概率分布:", probs.tolist())

预测结果: 隐喻
概率分布: [[0.007236489560455084, 0.9927634596824646]]


In [4]:
#批量推理
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm  # 进度条

# 读取数据
df = pd.read_excel("pro時間BCCWJ.xlsx")  # 确保有 sentence 列

# 加载模型和 tokenizer
model_dir = "autodl-tmp/slv_md/checkpoint-1431"  
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model.eval()

preds = []
probs_list = []

# 批处理推理
batch_size = 16  # 可以根据显存调整
sentences = df["sentence"].astype(str).tolist()

for i in tqdm(range(0, len(sentences), batch_size)):
    batch = sentences[i:i+batch_size]
    inputs = tokenizer(batch, return_tensors="pt", truncation=True,
                       max_length=512, padding=True).to(model.device)

    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    batch_preds = torch.argmax(probs, dim=-1).cpu().tolist()
    preds.extend(batch_preds)
    probs_list.extend(probs.cpu().tolist())

# 保存结果
df["prediction"] = ["メタファー" if p == 1 else "非メタファー" for p in preds]
df["probabilities"] = probs_list

df.to_excel("predicted.xlsx", index=False)
print("推理完成，结果已保存到 predicted.xlsx")


100%|██████████| 4238/4238 [14:31<00:00,  4.86it/s]


推理完成，结果已保存到 predicted.xlsx
