In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json
import torch
from torch.utils.data import Dataset
from transformers import DebertaV2Tokenizer

class MSEDataset(Dataset):
    """Regression dataset for scoring a candidate word sense in context.

    Every value of the top-level JSON object (one context / candidate-sense
    pair) becomes one sample, unless its ``average`` or ``stdev`` is ``None``.
    ``average`` is the regression target returned as ``labels``; ``stdev`` is
    stored on the sample but not returned by ``__getitem__``.

    Args:
        json_path: Path to a UTF-8 JSON file holding an object whose values
            are dicts with the keys ``homonym``, ``judged_meaning``,
            ``example_sentence``, ``precontext``, ``sentence``, ``ending``,
            ``average``, ``stdev`` and ``sample_id``.
        tokenizer: Callable tokenizer exposing ``sep_token``; it is called
            with ``return_tensors="pt"``.
        max_length: Length every sequence is truncated / padded to.
    """

    def __init__(self, json_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []

        # Load the whole annotation file up front.
        with open(json_path, 'r', encoding='utf-8') as handle:
            entries = json.load(handle)

        # One sample per JSON entry; the top-level key itself is not stored.
        for entry in entries.values():
            record = {
                "homonym": entry["homonym"],
                "definition": entry["judged_meaning"],
                "example": entry["example_sentence"],
                # Full context = text before + target sentence + ending.
                "context": "{} {} {}".format(
                    entry['precontext'], entry['sentence'], entry['ending']
                ),
                "target_avg": entry["average"],    # mean rating, regression target (T)
                "target_stdev": entry["stdev"],    # rating spread (sigma)
            }

            # Keep only entries that carry both a mean and a spread.
            if record["target_avg"] is not None and record["target_stdev"] is not None:
                record["sample_id"] = entry['sample_id']  # id as given in the file
                self.samples.append(record)

        print(f"创建了 {len(self.samples)} 个回归训练样本")

    def __len__(self):
        """Return the number of retained samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and attach its regression target.

        Returns:
            dict: the tokenizer's outputs with the leading batch dimension
            squeezed away and ``token_type_ids`` removed, plus ``labels``
            (a float32 scalar tensor holding the sample's ``target_avg``).
        """
        record = self.samples[idx]

        # NOTE(review): the first prefix uses a full-width colon while the
        # others use ASCII ':'. It is kept as-is because it is part of the
        # model input.
        segments = [
            f"homonym：{record['homonym']}",
            f"Definition:{record['definition']}",
            f"Example:{record['example']}",
            f"Context:{record['context']}",
        ]
        # The four segments are glued together with the tokenizer's separator.
        joined_text = self.tokenizer.sep_token.join(segments)

        tokenized = self.tokenizer(
            joined_text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" yields a leading batch axis of size 1; drop it.
        encoding = {}
        for field, tensor in tokenized.items():
            encoding[field] = tensor.squeeze(0)

        # The mean rating is the regression target. The spread is not emitted:
        # plain MSE does not need it. A tolerance-aware custom loss would add
        # encoding["stdevs"] from sample["target_stdev"] here.
        encoding["labels"] = torch.tensor(record["target_avg"], dtype=torch.float32)

        encoding.pop("token_type_ids", None)

        return encoding

In [None]:
import torch.nn as nn
from transformers import DebertaV2Model, DebertaV2PreTrainedModel
import torch

# The regression head emits a single value per example.
NUM_OUTPUTS = 1


class DebertaV2ForWSDScoring(DebertaV2PreTrainedModel):
    """DeBERTa-v2 encoder followed by a dropout + linear regression head.

    The head reads position 0 along the sequence axis of the encoder's first
    output (presumably the last hidden state of the first token; confirm
    against the encoder's return type) and maps it to ``NUM_OUTPUTS`` values.
    """

    def __init__(self, config):
        super().__init__(config)
        self.deberta = DebertaV2Model(config)

        # Kept as a two-element Sequential (dropout, linear) so the linear
        # layer's parameters stay under ``regressor.1.*``.
        head_dropout = nn.Dropout(config.hidden_dropout_prob)
        head_linear = nn.Linear(config.hidden_size, NUM_OUTPUTS)
        self.regressor = nn.Sequential(head_dropout, head_linear)

        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Score a batch and, when ``labels`` are given, compute the MSE loss.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            labels: Optional regression targets; reshaped to
                ``(-1, NUM_OUTPUTS)`` and cast to float before the loss.
            **kwargs: Accepted and ignored.

        Returns:
            ``predictions`` alone when ``labels`` is ``None``; otherwise the
            tuple ``(loss, predictions)``.
        """
        encoder_out = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # Position 0 of the sequence axis feeds the regression head.
        first_position = encoder_out[0][:, 0]
        predictions = self.regressor(first_position)

        # Inference path: nothing to compare against.
        if labels is None:
            return predictions

        # Plain mean-squared error; both sides are shaped (batch, NUM_OUTPUTS).
        criterion = nn.MSELoss()
        expected = labels.float().view(-1, NUM_OUTPUTS)
        loss = criterion(predictions.view(-1, NUM_OUTPUTS), expected)

        return (loss, predictions)

2025-12-04 12:16:11.586979: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764850571.749361      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764850571.795001      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [None]:
import os
os.environ["WANDB_MODE"] = "disabled"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# from model import DebertaV2ForWSDScoring
# from data_load import MSEDataset

import torch
from transformers import DebertaV2Tokenizer, DebertaV2Config, Trainer, TrainingArguments

# Fine-tunes DebertaV2ForWSDScoring on the training JSON with the Hugging Face
# Trainer, then saves the model and tokenizer to OUTPUT_DIR. Evaluation and
# intermediate checkpointing are both turned off.

MODEL_NAME = "/kaggle/input/semeval/deberta-v3-large"
TRAIN_JSON_PATH = "/kaggle/input/semeval/data/train.json"
OUTPUT_DIR = "/kaggle/working/output_regression"

# 1. Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

# 2. Config. The regression head size is fixed by NUM_OUTPUTS in the model
#    class; num_labels is set to 1 so the saved config agrees with it.
config = DebertaV2Config.from_pretrained(MODEL_NAME)
config.num_labels = 1

# 3. Model: pretrained encoder weights plus a freshly initialised regressor.
model = DebertaV2ForWSDScoring.from_pretrained(
    MODEL_NAME,
    config=config
)

# Training data
train_dataset = MSEDataset(
    json_path=TRAIN_JSON_PATH,
    tokenizer=tokenizer
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # No evaluation during training.
    eval_strategy="no",
    load_best_model_at_end=False,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_regression',
    logging_steps=50,
    save_strategy="no",
    learning_rate=2e-5,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    seed=42,                                # fixed random seed
)

# `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)

# Run fine-tuning.
print("开始微调 DeBERTaV2 回归模型...")
trainer.train()

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"训练完成，模型和tokenizer已保存至 {OUTPUT_DIR}")

Some weights of DebertaV2ForWSDScoring were not initialized from the model checkpoint at /kaggle/input/semeval/deberta-v3-large and are newly initialized: ['regressor.1.bias', 'regressor.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


创建了 2280 个回归训练样本




开始微调 DeBERTaV2 回归模型...




Step,Training Loss
50,23.9404
100,6.5273
150,5.7671
200,6.1756
250,5.5806
300,6.1814
350,5.2534
400,4.9178


训练完成，模型和tokenizer已保存至 /kaggle/working/output_regression


In [None]:
import json
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm

# Runs the fine-tuned model over the dev file and writes one
# {"id": ..., "prediction": ...} JSON object per line.

TEST_JSON_PATH = "/kaggle/input/semeval/data/dev.json"
INFERENCE_BATCH_SIZE = 32
OUTPUT_JSONL_PATH = "/kaggle/working/test_predictions.jsonl"


device = model.device
model.eval()  # evaluation mode (disables dropout)

print(f"Model is on: {device}")
print(f"Using Test/Inference file: {TEST_JSON_PATH}")

dev_dataset = MSEDataset(
    json_path=TEST_JSON_PATH,
    tokenizer=tokenizer,
)
dev_dataloader = DataLoader(
    dev_dataset,
    sampler=SequentialSampler(dev_dataset),
    batch_size=INFERENCE_BATCH_SIZE
)

# MSEDataset.__getitem__ returns only tensors, so batches carry no id field.
# Rebuild the ids from the file instead: the dataset keeps entries in file
# order and drops those whose average or stdev is None, so applying the same
# filter to the top-level JSON keys gives one id per dataset sample.
# NOTE(review): this emits the top-level JSON key as "id" (the original loop
# variable was named json_key); switch to sample_id if the scorer expects it.
with open(TEST_JSON_PATH, 'r', encoding='utf-8') as f:
    dev_entries = json.load(f)

dev_ids = [
    key for key, item in dev_entries.items()
    if item["average"] is not None and item["stdev"] is not None
]

# Guard the alignment explicitly rather than trusting the duplicated filter.
ids_aligned = len(dev_ids) == len(dev_dataset) and all(
    dev_entries[key]["sample_id"] == sample["sample_id"]
    for key, sample in zip(dev_ids, dev_dataset.samples)
)
if not ids_aligned:
    raise RuntimeError("JSON keys do not line up with the samples kept by MSEDataset")


print("\n***** 开始批量推理 *****")

predicted_scores = []
with torch.no_grad():
    # SequentialSampler yields samples in dataset order, so the concatenated
    # scores line up index-for-index with dev_ids.
    for batch in tqdm(dev_dataloader, desc="Inferencing"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # batch["labels"] is present but deliberately not passed, so the
        # model only runs the forward pass and returns predictions.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        predictions = outputs[0] if isinstance(outputs, tuple) else outputs
        # reshape(-1) always gives a 1-D tensor. A bare squeeze() would
        # collapse a final batch of size 1 to a 0-d tensor, whose tolist()
        # is a float rather than a list.
        predicted_scores.extend(predictions.reshape(-1).float().cpu().tolist())

if len(predicted_scores) != len(dev_ids):
    raise RuntimeError("number of predictions does not match number of ids")

# Clamp each score to [1.0, 5.0], then round to the nearest integer.
all_results = [
    {"id": json_key, "prediction": round(max(1.0, min(5.0, score)))}
    for json_key, score in zip(dev_ids, predicted_scores)
]

print("\n推理完成。")

# Save as JSON Lines (.jsonl): one result object per line.

print(f"开始保存 {len(all_results)} 条结果到 {OUTPUT_JSONL_PATH}...")

with open(OUTPUT_JSONL_PATH, 'w', encoding='utf-8') as f:
    for result in all_results:
        f.write(json.dumps(result) + '\n')

print(f"所有预测结果已保存到 {OUTPUT_JSONL_PATH}")

for result in all_results[:5]:
    print(f"{result}")