In [1]:
import json
import torch
from torch.utils.data import Dataset
from transformers import DebertaV2Tokenizer

class SimpleWSDDataset(Dataset):
    """最简单的WSD数据集"""
    
    def __init__(self, json_path, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.samples = []
        
        # 1. 读取JSON
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        # 2. 展平：每个choice变成一个样本
        for key, item in data.items():
            homonym = item["homonym"]
            definition = item["judged_meaning"]
            example = item["example_sentence"]
            
            # 完整上下文
            context = f"{item['precontext']} {item['sentence']} {item['ending']}"
            
            # 为每个有效的choice创建一个样本
            for choice_idx, (score, nonsensical) in enumerate(zip(item["choices"], item["nonsensical"])):
                if not nonsensical:  # 只取有效的
                    self.samples.append({
                        "homonym": homonym,
                        "definition": definition,
                        "example": example,
                        "context": context,
                        "score": score,  # 1-5分
                        "sample_id": f"{item['sample_id']}_{choice_idx}"  # 唯一ID
                    })
        
        print(f"创建了 {len(self.samples)} 个训练样本")
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        
        # 最简单的输入格式：定义 + 例句 + 待判断文本
        # text = (
        #     f"homonym：{sample['homonym']} [SEP] "
        #     f"Definition:{sample['definition']} [SEP] "
        #     f"Example:{sample['example']} [SEP] "
        #     f"Context:{sample['context']}"
        # )
        
        text_parts = (
            f"homonym：{sample['homonym']}"
            f"Definition:{sample['definition']}"
            f"Example:{sample['example']}"
            f"Context:{sample['context']}"
        )
        text = self.tokenizer.sep_token.join(text_parts)
        
        # Tokenize
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        
        # 移除batch维度
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}
        
        
        
        # 添加labels
        # encoding["labels"] = torch.tensor(sample["score"], dtype=torch.float32)
        # 回归的时候再用这个
        
        
        score_index = sample["score"] - 1  # 转为0-4索引
        encoding["labels"] = torch.tensor(score_index, dtype=torch.long)
        
        if "token_type_ids" in encoding:
             del encoding["token_type_ids"]
             
        return encoding

In [2]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch.nn as nn
from transformers import DebertaV2Model, DebertaV2PreTrainedModel

# 类别数量是 5 （对应 1, 2, 3, 4, 5 分）
NUM_LABELS = 5 

class DebertaV2ForWSDScoring(DebertaV2PreTrainedModel):
    """
    继承 DeBERTaV2Model，添加一个输出5个类别的分类头，
    用于 1-5 分的合理性评分任务。
    """
    
    def __init__(self, config):
        super().__init__(config)
        
        # 1. DeBERTaV2 主体 (用于特征提取)
        self.deberta = DebertaV2Model(config)
        
        # 2. 分类头 (Classification Head)
        self.classifier = nn.Sequential(
            # dropout rate从config中获取
            nn.Dropout(config.hidden_dropout_prob), 
            # 将 DeBERTaV2 的隐藏状态维度映射到 5 个类别
            nn.Linear(config.hidden_size, NUM_LABELS) 
        )
        
        # 初始化权重
        self.post_init() 

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        
        # 运行 DeBERTaV2 主体
        # DeBERTaV2 模型的 forward 默认不使用 token_type_ids，如果数据集中有，会被 **kwargs 吸收
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        
        # 提取 [CLS] token 的隐藏状态（第一个 token 的输出）
        # (batch_size, hidden_size)
        cls_output = outputs[0][:, 0, :]
        
        # 运行分类头
        # (batch_size, NUM_LABELS)
        logits = self.classifier(cls_output) 

        loss = None
        if labels is not None:
            # 交叉熵损失函数
            # logits: (batch_size, 5)
            # labels: (batch_size,) 且值范围在 [0, 4]
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, NUM_LABELS), labels.view(-1))

        # 返回结果 (loss, logits) 或 logits
        return (loss, logits) if loss is not None else logits

2025-12-03 13:25:05.258476: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764768305.492037      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764768305.565587      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [3]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
os.environ["WANDB_MODE"] = "disabled"
from transformers import DebertaV2Tokenizer, DebertaV2Config, Trainer, TrainingArguments

# from model import DebertaV2ForWSDScoring
# from data_load import SimpleWSDDataset


# 建议使用相对较小的版本开始，以节省资源
MODEL_NAME = "/kaggle/input/semeval/deberta-v3-large" 

# 1. 加载 Tokenizer
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

# 2. 加载配置（用于初始化您的模型类）
config = DebertaV2Config.from_pretrained(MODEL_NAME)
config.num_labels = 5 # 确保配置中标签数正确

# 3. 初始化您的自定义模型
# 假设您已在脚本中导入了 DebertaV2ForWSDScoring 类
model = DebertaV2ForWSDScoring.from_pretrained(
    MODEL_NAME, 
    config=config
)

# 导入 SimpleWSDDataset 类
train_dataset = SimpleWSDDataset(
    json_path="/kaggle/input/semeval/data/train.json", 
    tokenizer=tokenizer
)

OUTPUT_DIR = "/kaggle/working/"

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,                     
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # gradient_checkpointing=True,
    # ------------------------------------------------
    # 核心修改：禁用评估
    eval_strategy="no",               # <--- 禁用评估
    load_best_model_at_end=False,           # <--- 禁用加载最佳模型
    # ------------------------------------------------
    warmup_steps=500,                       
    weight_decay=0.01,                      
    logging_dir='./logs',                   
    logging_steps=50,                       
    save_strategy="epoch",                  # 训练完每一轮就保存一次
    learning_rate=2e-5,                     
    fp16=True,                              
)

# 实例化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# 启动训练
print("开始微调 DeBERTaV2 模型...")
trainer.train()

# 训练结束后，保存最终模型
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

Some weights of DebertaV2ForWSDScoring were not initialized from the model checkpoint at /kaggle/input/semeval/deberta-v3-large and are newly initialized: ['classifier.1.bias', 'classifier.1.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


创建了 11166 个训练样本




开始微调 DeBERTaV2 模型...




Step,Training Loss
50,6.7236
100,6.3037
150,6.5715
200,6.5068
250,6.5895
300,6.5231
350,6.2618
400,6.5364
450,6.5684
500,6.4991




('/kaggle/working/tokenizer_config.json',
 '/kaggle/working/special_tokens_map.json',
 '/kaggle/working/spm.model',
 '/kaggle/working/added_tokens.json')