In [1]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import json
import torch
from torch.utils.data import Dataset
from transformers import DebertaV2Tokenizer

class MSEDataset(Dataset):
    """用于回归评分任务的WSD数据集"""
    
    def __init__(self, json_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        
        # 1. 读取JSON
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # 2. 构造样本：每个原始条目（上下文/义项对）作为一个样本
        for key, item in data.items():
            
            # --- 文本信息 ---
            homonym = item["homonym"]
            definition = item["judged_meaning"]
            example = item["example_sentence"]
            # 完整上下文
            context = f"{item['precontext']} {item['sentence']} {item['ending']}"
            
            # --- 标签信息 ---
            # 直接使用平均值 (avg) 作为回归目标 T
            target_avg = item["average"] 
            # 使用标准差 (stdev) 作为损失函数中的容忍度 sigma
            target_stdev = item["stdev"]
            
            # 确保 avg 和 stdev 是有效的浮点数
            if target_avg is None or target_stdev is None:
                # 实际应用中可能需要更复杂的缺失值处理
                continue 
                
            self.samples.append({
                "json_key": key,
                "homonym": homonym,
                "definition": definition,
                "example": example,
                "context": context,
                "target_avg": target_avg,   # 平均分 (T)
                "target_stdev": target_stdev, # 标准差 (sigma)
                "sample_id": item['sample_id'] # 原始ID
            })
            
        print(f"创建了 {len(self.samples)} 个回归训练样本")
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # 构造输入文本
        text_parts = (
            f"homonym：{sample['homonym']}",
            f"Definition:{sample['definition']}",
            f"Example:{sample['example']}",
            f"Context:{sample['context']}"
        )
        # 使用tokenizer的sep_token连接各个部分
        text = self.tokenizer.sep_token.join(text_parts)
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # 移除batch维度
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        
        # 重点修改：添加两个回归标签
        
        # 1. average 作为 labels (T)
        encoding["labels"] = torch.tensor(sample["target_avg"], dtype=torch.float32)
        
        # 2. stdev 作为 stdevs (sigma)，用于自定义损失函数
        # 自定义时加重惩罚区间外的，均方误差（MSELoss时不需要）
        # encoding["stdevs"] = torch.tensor(sample["target_stdev"], dtype=torch.float32)

        encoding["id"] = sample["json_key"]
        
        if "token_type_ids" in encoding:
            del encoding["token_type_ids"]
            
        return encoding

In [2]:
import torch.nn as nn
from transformers import DebertaV2Model, DebertaV2PreTrainedModel
import torch

# 输出维度为 1
NUM_OUTPUTS = 1 

class DebertaV2ForWSDScoring(DebertaV2PreTrainedModel):
    
    def __init__(self, config):
        super().__init__(config)
        self.deberta = DebertaV2Model(config)
        
        # 回归头，输出维度为 1
        self.regressor = nn.Sequential(
            nn.Dropout(config.hidden_dropout_prob), 
            nn.Linear(config.hidden_size, NUM_OUTPUTS) 
        )
        self.post_init() 

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        # ... (DeBERTaV2 主体运行部分不变) ...
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        
        cls_output = outputs[0][:, 0, :]
        predictions = self.regressor(cls_output) 

        loss = None
        if labels is not None:
            # 使用标准的 nn.MSELoss
            loss_fct = nn.L1Loss() 
            
            # labels 是 float32 类型的 average (T)
            # 确保 labels 的形状与 predictions 匹配 (batch_size, 1)
            target = labels.float().view(-1, NUM_OUTPUTS)
            loss = loss_fct(predictions.view(-1, NUM_OUTPUTS), target) # predictions.view(-1, 1)

        # 返回结果 (loss, predictions) 或 predictions
        return (loss, predictions) if loss is not None else predictions

2025-12-05 06:24:18.255744: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764915858.480789      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764915858.553548      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [3]:
import os
os.environ["WANDB_MODE"] = "disabled"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# from model import DebertaV2ForWSDScoring
# from data_load import MSEDataset

from transformers import DebertaV2Tokenizer, DebertaV2Config, Trainer, TrainingArguments
# 假设您的模型和数据集类定义在 model.py 和 data_load.py 中
# from model import DebertaV2ForWSDScoring  # 您的回归模型
# from data_load import MSEDataset          # 您的回归数据集（原SimpleWSDDataset修改版）

# 建议使用相对较小的版本开始，以节省资源
MODEL_NAME = "/kaggle/input/semeval/deberta-v3-large" 
TRAIN_JSON_PATH = "/kaggle/input/semeval/data/train.json" # 假设您的数据路径

# 1. 加载 Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

# 2. 加载配置（用于初始化您的模型类）
config = DebertaV2Config.from_pretrained(MODEL_NAME)

# **【关键修改】**：
# 对于回归任务，模型的输出类别数量 (NUM_OUTPUTS) 应该是 1。
# 您的自定义模型 DebertaV2ForWSDScoring 的 __init__ 方法应该使用这个配置。
config.num_labels = 1 

# 3. 初始化您的自定义模型
# 请确保您的 DebertaV2ForWSDScoring 类已经修改为输出 1 个值并使用 nn.MSELoss
# 假设您已在脚本中导入了 DebertaV2ForWSDScoring 类
model = DebertaV2ForWSDScoring.from_pretrained(
    MODEL_NAME, 
    config=config
)

# 导入您的数据集类 (我们称之为 MSEDataset 或使用您修改后的 SimpleWSDDataset)
train_dataset = MSEDataset( # 假设这是您修改后输出 float labels (avg) 的类
    json_path=TRAIN_JSON_PATH, 
    tokenizer=tokenizer
)

OUTPUT_DIR = "/kaggle/working/output_regression"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,                     
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # ------------------------------------------------
    # 禁用评估
    eval_strategy="no", 
    load_best_model_at_end=False, 
    # ------------------------------------------------
    warmup_steps=500,                       
    weight_decay=0.01,                      
    logging_dir='./logs_regression',       
    logging_steps=50,                       
    save_strategy="no",                  # no
    learning_rate=2e-5,                     
    fp16=True,                              # 混合精度训练，加速
    # **【可选优化】**：报告指标为回归任务
    # metric_for_best_model="eval_loss",      # 尽管我们禁用了评估，但保留此设置
    # greater_is_better=False,
    seed=42,                                # 固定的随机种子
)

# 实例化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    # **【回归任务不需要 special data collator】**：
    # Data collator 默认会处理回归任务的 float labels
)

# 启动训练
print("开始微调 DeBERTaV2 回归模型...")
trainer.train()

# 训练结束后，保存最终模型
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"训练完成，模型和tokenizer已保存至 {OUTPUT_DIR}")

Some weights of DebertaV2ForWSDScoring were not initialized from the model checkpoint at /kaggle/input/semeval/deberta-v3-large and are newly initialized: ['regressor.1.bias', 'regressor.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


创建了 2280 个回归训练样本




开始微调 DeBERTaV2 回归模型...




Step,Training Loss
50,10.8847
100,4.4826
150,4.3817
200,4.2702
250,4.1418
300,4.4033
350,4.1335
400,3.9966


训练完成，模型和tokenizer已保存至 /kaggle/working/output_regression


In [4]:
model.eval() 
print("Model")

Model


In [5]:
import json
import torch
from torch.utils.data import DataLoader, SequentialSampler
from tqdm import tqdm
# 假设您已导入 MSEDataset 类

# ----------------------------------------------------------------------
# 路径和配置
# ----------------------------------------------------------------------
# 假设测试集路径为 /kaggle/input/semeval/data/test.json (或您提供的 dev.json)
TEST_JSON_PATH = "/kaggle/input/semeval/data/dev.json" 
INFERENCE_BATCH_SIZE = 32 
OUTPUT_JSONL_PATH = "/kaggle/working/test_predictions.jsonl"

# 确定设备 (假设 model 已经移动到正确的设备)
device = model.device 
model.eval() # 切换到评估模式

print(f"Model is on: {device}")
print(f"Using Test/Inference file: {TEST_JSON_PATH}")

# ----------------------------------------------------------------------
# 1. 实例化推理数据集和 DataLoader
# ----------------------------------------------------------------------
# 使用 MSEDataset，它会读取所有的字段 (包括 avg 作为 labels)
dev_dataset = MSEDataset(
    json_path=TEST_JSON_PATH, 
    tokenizer=tokenizer, 
) 
dev_dataloader = DataLoader(
    dev_dataset,
    sampler=SequentialSampler(dev_dataset),
    batch_size=INFERENCE_BATCH_SIZE
)

# ----------------------------------------------------------------------
# 2. 运行推理循环 (在代码层面忽略标签)
# ----------------------------------------------------------------------
all_results = []
print("\n***** 开始批量推理 *****")

with torch.no_grad():
    for batch in tqdm(dev_dataloader, desc="Inferencing"):
        
        # 核心：所有字段都已读入 batch 字典。我们只提取需要的输入和 ID。
        ids = batch["id"] 
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        
        # ⚠️ 注意：batch["labels"] (avg) 和 batch["stdevs"] 被一起读入，但我们在这里忽略了它们。
        
        # 运行模型 (不传入 labels，模型只做前向传播)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        # 提取预测值 (logits/scores)
        predictions = outputs[0] if isinstance(outputs, tuple) else outputs
        predicted_scores = predictions.squeeze().cpu().tolist()
        
        # 收集结果
        for json_key, score in zip(ids, predicted_scores): 
            # 最终评分四舍五入并限制在 [1.0, 5.0]
            final_score = round(max(1.0, min(5.0, score)))
            
            all_results.append({
                "id": json_key, 
                "prediction": final_score
            })

print("\n推理完成。")

# ----------------------------------------------------------------------
# 3. 保存为 JSON Lines (.jsonl)
# ----------------------------------------------------------------------

print(f"开始保存 {len(all_results)} 条结果到 {OUTPUT_JSONL_PATH}...")

with open(OUTPUT_JSONL_PATH, 'w', encoding='utf-8') as f:
    for result in all_results:
        f.write(json.dumps(result) + '\n')

print(f"所有预测结果已保存到 {OUTPUT_JSONL_PATH}")

# 打印全部结果
print("\n--- 全部预测结果 (JSON Lines 格式) ---")
for result in all_results[:5]:
    print(f"{result}")

Model is on: cuda:0
Using Test/Inference file: /kaggle/input/semeval/data/dev.json
创建了 588 个回归训练样本

***** 开始批量推理 *****


Inferencing: 100%|██████████| 19/19 [01:22<00:00,  4.32s/it]


推理完成。
开始保存 588 条结果到 /kaggle/working/test_predictions.jsonl...
所有预测结果已保存到 /kaggle/working/test_predictions.jsonl

--- 全部预测结果 (JSON Lines 格式) ---
{'id': '0', 'prediction': 3}
{'id': '1', 'prediction': 3}
{'id': '2', 'prediction': 3}
{'id': '3', 'prediction': 3}
{'id': '4', 'prediction': 3}



