In [1]:
!nvidia-smi

Tue Feb  4 19:25:27 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.4     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off | 00000000:8D:00.0 Off |                    0 |
| N/A   25C    P0              95W / 400W |      4MiB / 81920MiB |    100%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
pip show datasets

Name: datasets
Version: 3.2.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.10/dist-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: evaluate, outlines, trl
Note: you may need to restart the kernel to use updated packages.


In [23]:
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from sklearn.metrics import precision_recall_fscore_support, accuracy_score




In [24]:
import torch
from torch import nn
import torch.nn.functional as F
from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    PreTrainedModel,
    Trainer,
    TrainingArguments,
)
from datasets import load_dataset

# 1. Определяем кастомную модель с дополнительными "головами"
class DebertaWithHeads(PreTrainedModel):
    """Encoder with three token-level binary heads: relevance, utilization, adherence.

    Each head is a single linear layer applied to every position of the
    encoder's last hidden state. Relevance and utilization are supervised on
    context (document) tokens, adherence on response tokens.

    Args:
        model_name_or_path: Checkpoint passed to ``AutoConfig.from_pretrained``
            and ``AutoModel.from_pretrained``.
        threshold: Stored on the instance as ``self.threshold``; it is not used
            anywhere inside this class.
    """

    # NOTE(review): AutoConfig is a config factory rather than a concrete config
    # class. Kept as in the original; confirm whether save/load round-trips via
    # `from_pretrained` are expected to work for this wrapper.
    config_class = AutoConfig

    def __init__(self, model_name_or_path, threshold=0.5):
        config = AutoConfig.from_pretrained(model_name_or_path)
        super().__init__(config)
        # Base encoder loaded from the given checkpoint.
        self.deberta = AutoModel.from_pretrained(model_name_or_path)
        hidden_size = config.hidden_size

        # One linear layer per prediction head.
        # Context heads: relevance and utilization.
        self.relevance_head = nn.Linear(hidden_size, 1)
        self.utilization_head = nn.Linear(hidden_size, 1)
        # Response head: adherence.
        self.adherence_head = nn.Linear(hidden_size, 1)

        self.threshold = threshold
        self.loss_fct = nn.BCEWithLogitsLoss()  # binary classification loss

    def forward(
        self,
        input_ids,
        attention_mask,
        context_mask,   # mask selecting context (document) tokens
        response_mask,  # mask selecting response tokens
        labels_relevance=None,
        labels_utilization=None,
        labels_adherence=None,
    ):
        """Compute per-token logits for all heads and, if labelled, the loss.

        Returns:
            dict with ``relevance_logits``, ``utilization_logits`` and
            ``adherence_logits`` (each with a trailing dimension of size 1).
            When all three label tensors are provided, ``loss`` is added: the
            mean of the three masked head losses.
        """
        outputs = self.deberta(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state  # [batch_size, seq_len, hidden_size]

        relevance_logits = self.relevance_head(last_hidden_state)      # [batch, seq_len, 1]
        utilization_logits = self.utilization_head(last_hidden_state)  # [batch, seq_len, 1]
        adherence_logits = self.adherence_head(last_hidden_state)      # [batch, seq_len, 1]

        output = {
            "relevance_logits": relevance_logits,
            "utilization_logits": utilization_logits,
            "adherence_logits": adherence_logits,
        }

        # The loss is only computed when labels for all three heads are present.
        if (
            labels_relevance is not None
            and labels_utilization is not None
            and labels_adherence is not None
        ):
            rel_loss = self._masked_loss(relevance_logits, labels_relevance, context_mask)
            util_loss = self._masked_loss(utilization_logits, labels_utilization, context_mask)
            adh_loss = self._masked_loss(adherence_logits, labels_adherence, response_mask)
            # Final loss: plain average over the three heads.
            output["loss"] = (rel_loss + util_loss + adh_loss) / 3.0

        return output

    def _masked_loss(self, logits, labels, mask):
        """Loss of one head restricted to the positions selected by ``mask``.

        An empty selection (e.g. a batch whose response was truncated away)
        would make the mean-reduced loss NaN, so it contributes a zero that is
        still attached to the graph instead.
        """
        token_logits = logits.squeeze(-1)
        # Cast defensively: an integer mask would be interpreted as indices.
        mask = mask.to(torch.bool)
        if not bool(mask.any()):
            return token_logits.sum() * 0.0
        return self.loss_fct(token_logits[mask].float(), labels[mask].float())


In [25]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Sanity check: run the original NLI sequence-classification checkpoint on one
# premise/hypothesis pair before reusing its encoder for the multi-head model.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

premise = "I first thought that I liked the movie, but upon second thought it was actually disappointing."
hypothesis = "The movie was not good."

# NOTE(review): `input` shadows the Python builtin; the name is kept in case other cells rely on it.
input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt").to(device)
# Pass the whole encoding (including the attention mask) and skip autograd for inference.
with torch.no_grad():
    output = model(**input)
prediction = torch.softmax(output["logits"][0], -1).tolist()
label_names = ["entailment", "neutral", "contradiction"]
prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
print(prediction)


{'entailment': 83.5, 'neutral': 16.2, 'contradiction': 0.3}


In [26]:
# Build the multi-head token classifier from the same checkpoint. This rebinds
# `model`, replacing the NLI sequence classifier loaded in the previous cell.
model = DebertaWithHeads(model_name)

In [2]:


from datasets import load_dataset

# Load the "delucionqa" configuration of the rungalileo/ragbench dataset.
ds = load_dataset("rungalileo/ragbench", "delucionqa")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Peek at the first training record to see which fields are available.
ds['train'][0]

{'id': '391',
 'question': 'How many batteries does the Stop/Start system need?',
 'response': 'The Stop/Start system needs two batteries.',
 'generation_model_name': 'gpt-3.5-turbo-0125',
 'annotating_model_name': 'gpt-4-turbo-2024-04-09',
 'dataset_name': 'delucionqa_train',
 'documents_sentences': [[['0a',
    ' Autostop Mode The Stop/Start feature is enabled after every normal engine start.'],
   ['0b', 'At that time, the system will go into STOP/START READY.'],
   ['0c',
   ['0d',
    'Both the main and the supplemental batteries must be disconnected to completely de-energize the 12 Volt electrical system.'],
   ['0e',
    'Serious injury or death could result if you do not disconnect both batteries.'],
   ['0f', 'To learn how to properly disconnect, see an authorized dealer.']],
  [['1a',
    ' Secondary Battery Your vehicle may be equipped with a secondary battery used to power the Stop/Start system and the 12 Volt vehicle electrical system.'],
   ['1b',
    'The secondary batte

In [29]:
import torch

def preprocess_function(example, max_length=4096):
    """Turn one RAG example into fixed-length token ids, masks and token labels.

    The sequence layout is ``question [SEP] documents [SEP] response``.
    Sentence-level annotations are broadcast to tokens:

      * every token of a document sentence whose key is listed in
        ``all_relevant_sentence_keys`` gets relevance label 1.0, else 0.0;
      * likewise for ``all_utilized_sentence_keys`` and the utilization label;
      * every response token gets adherence label 1.0 if ``adherence_score``
        is truthy, else 0.0.

    Question and separator tokens are excluded from both loss masks.

    When the sequence does not fit into ``max_length`` the *document* segment
    is shortened first, so the response (and its adherence labels) is kept.
    Only if question and response alone still exceed the budget is the tail
    cut. Shorter sequences are right-padded with ``tokenizer.pad_token_id``.

    Args:
        example: Mapping with ``question`` and, optionally,
            ``documents_sentences`` (list of documents, each a list of
            ``[sentence_key, sentence_text]`` pairs), ``response_sentences``
            (list of ``[key, text]`` pairs or plain strings),
            ``all_relevant_sentence_keys``, ``all_utilized_sentence_keys`` and
            ``adherence_score``.
        max_length: Exact length of every returned tensor.

    Returns:
        dict of 1-D tensors of length ``max_length``: ``input_ids`` and
        ``attention_mask`` (long), ``context_mask`` and ``response_mask``
        (bool), ``labels_relevance``, ``labels_utilization`` and
        ``labels_adherence`` (float).
    """
    question = example["question"]
    # `or []` also covers fields that are present but set to None.
    documents_sentences = example.get("documents_sentences") or []
    response_sentences = example.get("response_sentences") or []
    # Sets give O(1) membership checks in the per-sentence loop below.
    relevant_keys = set(example.get("all_relevant_sentence_keys") or [])
    utilized_keys = set(example.get("all_utilized_sentence_keys") or [])
    adherence_label = 1.0 if example.get("adherence_score", False) else 0.0

    # Everything is tokenized without special tokens; separators are added by hand.
    question_ids = tokenizer.encode(question, add_special_tokens=False)

    doc_ids, doc_labels_relevance, doc_labels_utilization = [], [], []
    for doc in documents_sentences:
        for sent_key, sent_text in doc:
            sent_ids = tokenizer.encode(sent_text, add_special_tokens=False)
            doc_ids.extend(sent_ids)
            rel_label = 1.0 if sent_key in relevant_keys else 0.0
            util_label = 1.0 if sent_key in utilized_keys else 0.0
            doc_labels_relevance.extend([rel_label] * len(sent_ids))
            doc_labels_utilization.extend([util_label] * len(sent_ids))

    # The whole response shares a single adherence label.
    response_ids = []
    for sentence in response_sentences:
        is_pair = isinstance(sentence, (list, tuple)) and len(sentence) == 2
        sent_text = sentence[1] if is_pair else sentence  # plain text fallback
        response_ids.extend(tokenizer.encode(sent_text, add_special_tokens=False))
    response_labels_adherence = [adherence_label] * len(response_ids)

    # Trim the document segment (not the response) when over budget; "+ 2" accounts for the separators.
    overflow = len(question_ids) + len(doc_ids) + len(response_ids) + 2 - max_length
    if overflow > 0:
        keep = max(len(doc_ids) - overflow, 0)
        doc_ids = doc_ids[:keep]
        doc_labels_relevance = doc_labels_relevance[:keep]
        doc_labels_utilization = doc_labels_utilization[:keep]

    sep_id = tokenizer.sep_token_id
    input_ids = question_ids + [sep_id] + doc_ids + [sep_id] + response_ids
    doc_start = len(question_ids) + 1
    doc_end = doc_start + len(doc_ids)
    resp_start = doc_end + 1
    resp_end = resp_start + len(response_ids)
    seq_len = len(input_ids)

    assert (doc_end - doc_start) == len(doc_labels_relevance), "Несовпадение длины токенов документов и меток relevance"
    assert (resp_end - resp_start) == len(response_labels_adherence), "Несовпадение длины токенов ответа и меток adherence"

    # Loss masks: True only inside the document span / the response span.
    context_mask = [doc_start <= pos < doc_end for pos in range(seq_len)]
    response_mask = [resp_start <= pos < resp_end for pos in range(seq_len)]

    # Labels outside the corresponding span are 0.0 and ignored through the masks.
    tail = [0.0] * (seq_len - doc_end)
    labels_relevance = [0.0] * doc_start + doc_labels_relevance + tail
    labels_utilization = [0.0] * doc_start + doc_labels_utilization + tail
    labels_adherence = [0.0] * resp_start + response_labels_adherence

    columns = {
        "input_ids": (input_ids, tokenizer.pad_token_id, torch.long),
        "attention_mask": ([1] * seq_len, 0, torch.long),
        "context_mask": (context_mask, False, torch.bool),
        "response_mask": (response_mask, False, torch.bool),
        "labels_relevance": (labels_relevance, 0.0, torch.float),
        "labels_utilization": (labels_utilization, 0.0, torch.float),
        "labels_adherence": (labels_adherence, 0.0, torch.float),
    }

    encoded = {}
    for name, (values, pad_value, dtype) in columns.items():
        # Last-resort tail cut (question + response alone too long), then right-pad.
        values = values[:max_length]
        values = values + [pad_value] * (max_length - len(values))
        encoded[name] = torch.tensor(values, dtype=dtype)
    return encoded


In [30]:
# Smoke-test the preprocessing on one training record; with the default
# max_length the returned tensors are padded to 4096 positions.
preprocess_function(ds['train'][1])

{'input_ids': tensor([ 361,  264, 8436,  ...,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0]),
 'context_mask': tensor([False, False, False,  ..., False, False, False]),
 'response_mask': tensor([False, False, False,  ..., False, False, False]),
 'labels_relevance': tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 'labels_utilization': tensor([0., 0., 0.,  ..., 0., 0., 0.]),
 'labels_adherence': tensor([0., 0., 0.,  ..., 0., 0., 0.])}

In [31]:
# Collate a single preprocessed record into a batch of size 1.
# NOTE(review): `data_collator` is defined in a cell that appears later in the
# notebook (execution count In [10]), so this cell fails on a fresh top-to-bottom run.
data_collator([preprocess_function(ds['train'][1])])

{'input_ids': tensor([[ 361,  264, 8436,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]),
 'context_mask': tensor([[False, False, False,  ..., False, False, False]]),
 'response_mask': tensor([[False, False, False,  ..., False, False, False]]),
 'labels_relevance': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]),
 'labels_utilization': tensor([[0., 0., 0.,  ..., 0., 0., 0.]]),
 'labels_adherence': tensor([[0., 0., 0.,  ..., 0., 0., 0.]])}

In [32]:
# Forward pass on a single-example batch without tracking gradients. The batch
# includes the three label tensors, so the returned dict also carries the loss.
with torch.no_grad():
    outputs = model(**data_collator([preprocess_function(ds['train'][1])])
    )

In [33]:
# Display the per-head logits and the combined loss from the forward pass above.
outputs

{'relevance_logits': tensor([[[-0.4646],
          [ 0.0046],
          [ 0.0718],
          ...,
          [-0.5734],
          [-0.5734],
          [-0.5734]]]),
 'utilization_logits': tensor([[[-0.0457],
          [-0.1832],
          [ 0.3380],
          ...,
          [ 0.4029],
          [ 0.4029],
          [ 0.4029]]]),
 'adherence_logits': tensor([[[-0.6056],
          [-0.6762],
          [-0.3174],
          ...,
          [-0.1291],
          [-0.1291],
          [-0.1291]]]),
 'loss': tensor(0.6119)}

In [34]:
# Per-token relevance probabilities. torch.sigmoid keeps the computation in
# PyTorch (np.exp on a tensor fails for CUDA or grad-tracking tensors) and is
# numerically stable for large-magnitude logits.
pred_rel_probs = torch.sigmoid(outputs['relevance_logits'])
pred_rel_probs

tensor([[[0.3859],
         [0.5012],
         [0.5179],
         ...,
         [0.3604],
         [0.3604],
         [0.3604]]])

In [35]:
# Relevance probabilities for token positions 217-229 of the first example.
# NOTE(review): the slice bounds are hand-picked; presumably they cover a
# sentence of interest — confirm or derive them from the data.
pred_rel_probs[0].squeeze(1)[217:230]

tensor([0.4184, 0.5324, 0.4624, 0.5942, 0.6398, 0.4216, 0.4849, 0.5814, 0.5441,
        0.6033, 0.6201, 0.5423, 0.4015])

In [36]:
# Per-token relevance probabilities (this cell repeats an earlier one).
# torch.sigmoid avoids routing a torch tensor through np.exp, which fails for
# CUDA or grad-tracking tensors.
pred_rel_probs = torch.sigmoid(outputs['relevance_logits'])
pred_rel_probs

tensor([[[0.3859],
         [0.5012],
         [0.5179],
         ...,
         [0.3604],
         [0.3604],
         [0.3604]]])

In [37]:
# Print a token-by-token table of the labels produced by the preprocessing so
# that the sentence-to-token label broadcast can be checked by eye.
processed = preprocess_function(ds['train'][1], max_length=1024)


tokens = tokenizer.convert_ids_to_tokens(processed["input_ids"].tolist())

labels_rel = processed["labels_relevance"].tolist()
labels_util = processed["labels_utilization"].tolist()
labels_adh = processed["labels_adherence"].tolist()

print(f"{'Token':20} | {'Relevance':9} | {'Utilization':11} | {'Adherence':9}")
print("-" * 60)
# `i` counts the rows printed so far (1-based position of the current token).
i = 0
for token, rel, util, adh in zip(tokens, labels_rel, labels_util, labels_adh):
    # Stop at the first padding token; everything after it is padding as well.
    if token == '[PAD]':
        break
    i+=1
    print(f"{token:20} | {rel:9.1f} | {util:11.1f} | {adh:9.1f}")
    # Debug aid: report the position of this particular token.
    # NOTE(review): presumably used to pick the slice bounds inspected earlier — confirm.
    if token == '▁Simply':
        print(i)

Token                | Relevance | Utilization | Adherence
------------------------------------------------------------
▁how                 |       0.0 |         0.0 |       0.0
▁to                  |       0.0 |         0.0 |       0.0
▁customize           |       0.0 |         0.0 |       0.0
▁U                   |       0.0 |         0.0 |       0.0
connect              |       0.0 |         0.0 |       0.0
▁system              |       0.0 |         0.0 |       0.0
▁based               |       0.0 |         0.0 |       0.0
▁on                  |       0.0 |         0.0 |       0.0
▁my                  |       0.0 |         0.0 |       0.0
▁own                 |       0.0 |         0.0 |       0.0
▁preferences         |       0.0 |         0.0 |       0.0
?                    |       0.0 |         0.0 |       0.0
[SEP]                |       0.0 |         0.0 |       0.0
▁U                   |       0.0 |         0.0 |       0.0
connect              |       0.0 |         0.0 |      

In [38]:
def has_sentences(example):
    """Tell whether an example carries usable sentence-level annotations.

    The example passes when ``documents_sentences`` is a list whose list-typed
    documents contain at least one sentence in total. If the example also has
    a ``response_sentences`` field, that field must be a non-empty list.

    Returns:
        bool: True when the example should be kept, False otherwise.
    """
    if "documents_sentences" not in example:
        return False

    documents = example["documents_sentences"]
    if not isinstance(documents, list):
        return False

    # Only documents that are themselves lists contribute sentences.
    sentence_count = sum(len(doc) for doc in documents if isinstance(doc, list))
    if sentence_count == 0:
        return False

    # The response check applies only when the field exists at all.
    if "response_sentences" not in example:
        return True
    responses = example["response_sentences"]
    return isinstance(responses, list) and len(responses) != 0
# Show the splits before filtering, drop the examples rejected by
# `has_sentences`, then show the splits again to compare the row counts.
print(ds)
cleaned_dataset = ds.filter(has_sentences)
print(cleaned_dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score'],
        num_rows: 1460
    })
    validation: Dataset({
        features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'rel

Filter: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1460/1460 [00:00<00:00, 2309.42 examples/s]
Filter: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 182/182 [00:00<00:00, 3196.24 examples/s]
Filter: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 184/184 [00:00<00:00, 2976.78 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score'],
        num_rows: 1458
    })
    validation: Dataset({
        features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'rel




In [42]:
# List the field names of one filtered training record.
print(cleaned_dataset['train'][0].keys())

dict_keys(['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score'])


In [14]:
# Sanity check after filtering: print any test example whose
# `documents_sentences` is still empty (no output means none is left).
for elem  in cleaned_dataset['test']:
    if not elem["documents_sentences"]:
        print(elem)

In [15]:
def preprocess_batch(examples):
    """Apply ``preprocess_function`` to every row of a column-oriented batch.

    Args:
        examples: Mapping from column name to a list of values, all lists
            having the same length (one entry per example).

    Returns:
        list: One ``preprocess_function`` result per example, in input order.
        NOTE(review): the result is row-oriented; if this is meant for
        ``Dataset.map(batched=True)`` it presumably has to be transposed into
        a dict of columns — confirm against the intended caller.
    """
    column_names = list(examples.keys())
    # Rebuild each row as a dict: preprocess_function looks fields up by name,
    # so the bare tuples produced by zip() cannot be passed to it directly.
    rows = (dict(zip(column_names, values)) for values in zip(*examples.values()))
    return [preprocess_function(row) for row in rows]

# Tokenize every split example by example, keeping only the columns produced
# by `preprocess_function` (all original columns are dropped).
tokenized_dataset = cleaned_dataset.map(
    preprocess_function,
    remove_columns=cleaned_dataset["train"].column_names,
)



In [10]:
def data_collator(features):
    """Collate a list of per-example dicts into one right-padded batch.

    Every field may be given as a tensor or as a plain sequence; it is
    converted to the dtype listed below (tensors included, so e.g. integer
    masks become boolean) and padded to the longest example in the batch.

    Args:
        features: Non-empty list of dicts with the keys ``input_ids``,
            ``attention_mask``, ``context_mask``, ``response_mask``,
            ``labels_relevance``, ``labels_utilization``, ``labels_adherence``.

    Returns:
        dict mapping the same keys to tensors whose first dimension is the
        batch. ``input_ids`` is padded with ``tokenizer.pad_token_id``, masks
        with 0/False and labels with 0.0.
    """
    # (field name, target dtype, padding value)
    field_specs = (
        ("input_ids", torch.long, tokenizer.pad_token_id),
        ("attention_mask", torch.long, 0),
        ("context_mask", torch.bool, False),
        ("response_mask", torch.bool, False),
        ("labels_relevance", torch.float, 0.0),
        ("labels_utilization", torch.float, 0.0),
        ("labels_adherence", torch.float, 0.0),
    )

    batch = {}
    for name, dtype, pad_value in field_specs:
        # as_tensor converts lists and casts tensors that have another dtype.
        sequences = [torch.as_tensor(feature[name], dtype=dtype) for feature in features]
        batch[name] = torch.nn.utils.rnn.pad_sequence(
            sequences,
            batch_first=True,
            padding_value=pad_value,
        )
    return batch

In [18]:
from transformers import Trainer, TrainingArguments


# Training configuration: batch size 1 per device, 3 epochs, evaluation and
# checkpointing every 500 steps, no external experiment tracking.
# NOTE(review): no random seed is set explicitly here, so the library default applies.
training_args = TrainingArguments(
    output_dir="./deberta-finetuned_2",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    # NOTE(review): newer transformers releases appear to rename this argument
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_steps=500,
    report_to=[],  # empty list: no reporting integrations
    fp16=True  # mixed-precision training
)





# Re-create the multi-head model (this rebinds `model`) and move it to the
# selected device before handing it to the Trainer.
model = DebertaWithHeads(model_name).to(device)





In [19]:
# tokenized_dataset.set_format(
#     type="torch",
#     columns=[
#         "input_ids", "attention_mask", "context_mask", "response_mask",
#         "labels_relevance", "labels_utilization", "labels_adherence"
#     ]
# )
# tokenized_dataset

In [20]:
# Wire model, arguments, tokenized splits and the custom collator into a Trainer.
# No metric function is attached, so evaluation reports the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,  # custom collator defined in this notebook
#     compute_metrics=compute_metrics,
    # NOTE(review): recent transformers versions appear to prefer
    # `processing_class` over `tokenizer` — confirm against the installed version.
    tokenizer=tokenizer,
)




  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run fine-tuning; training and validation loss are logged at the intervals
# configured in `training_args`.
trainer.train()

[2025-02-04 19:25:48,696] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


Step,Training Loss,Validation Loss
500,0.3366,0.361947
1000,0.3,0.364985
1500,0.3056,0.417045
2000,0.291,0.283464
2500,0.2669,0.289229
3000,0.2069,0.328456
3500,0.1919,0.370309
4000,0.1955,0.338295


In [None]:
# Inspect the trainer's state object after training.
trainer.state

In [27]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from collections import defaultdict
def evaluate_model(model, dataset, data_collator, device="cuda", batch_size=8):
    """Run the model over a dataset and collect raw logits, masks and labels.

    No loss or metric is computed here; the labels are not passed to the
    model, they are only gathered next to the predictions.

    Args:
        model: Model to evaluate. It is moved to ``device`` and switched to
            eval mode. Called with ``input_ids``, ``attention_mask``,
            ``context_mask`` and ``response_mask``; must return a mapping with
            ``relevance_logits``, ``utilization_logits``, ``adherence_logits``.
        dataset: Dataset to iterate over.
        data_collator: Function turning a list of examples into a batch dict.
        device: Device used for the forward passes (default ``"cuda"``).
        batch_size: Number of examples per forward pass (default 8).

    Returns:
        dict with two entries, each a mapping from name to NumPy array:
        ``"pred"`` holds ``context_mask``, ``response_mask`` and the three
        ``*_logits`` arrays; ``"labels"`` holds ``labels_relevance``,
        ``labels_utilization`` and ``labels_adherence``. Batches of different
        sequence length are right-padded with zeros/False before stacking.
    """
    model.to(device)
    model.eval()

    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator, shuffle=False)

    logit_keys = ("relevance_logits", "utilization_logits", "adherence_logits")
    label_keys = ("labels_relevance", "labels_utilization", "labels_adherence")

    labels = defaultdict(list)
    preds = defaultdict(list)
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            context_mask = batch["context_mask"].to(device)
            response_mask = batch["response_mask"].to(device)

            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                context_mask=context_mask,
                response_mask=response_mask,
            )

            preds["context_mask"].append(context_mask.detach().cpu().numpy())
            preds["response_mask"].append(response_mask.detach().cpu().numpy())
            for key in logit_keys:
                preds[key].append(outputs[key].detach().cpu().numpy())
            # Labels never need to visit the device; they are only collected.
            for key in label_keys:
                labels[key].append(batch[key].detach().cpu().numpy())

    for key in preds:
        preds[key] = _concat_batches(preds[key])
    for key in labels:
        labels[key] = _concat_batches(labels[key])
    return {
        "pred": preds,
        "labels": labels,
    }


def _concat_batches(chunks):
    """Stack per-batch arrays along axis 0, right-padding axis 1 to the longest batch."""
    if all(chunk.ndim >= 2 for chunk in chunks):
        longest = max(chunk.shape[1] for chunk in chunks)
        # Zero padding: masks become False there, so padded positions are ignored downstream.
        chunks = [
            np.pad(
                chunk,
                [(0, 0), (0, longest - chunk.shape[1])] + [(0, 0)] * (chunk.ndim - 2),
                mode="constant",
            )
            if chunk.shape[1] < longest
            else chunk
            for chunk in chunks
        ]
    return np.concatenate(chunks, axis=0)



# Collect raw predictions and labels for the validation split.
predicts = evaluate_model(model, tokenized_dataset["validation"], data_collator, device)
print(predicts.keys())

Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23/23 [01:24<00:00,  3.67s/it]

dict_keys(['pred', 'labels'])





In [29]:
# Shape check of the stacked relevance logits.
predicts['pred']['relevance_logits'].shape

(182, 4096, 1)

In [32]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
def compute_metrics(predictions, label_ids, threshold=0.5):
    """Token-level precision / recall / F1 for the three prediction heads.

    Logits are turned into probabilities with a sigmoid, binarized with
    ``threshold`` and compared to the labels only at the positions selected by
    the head's mask: ``context_mask`` for relevance and utilization,
    ``response_mask`` for adherence.

    Args:
        predictions: Mapping with ``relevance_logits``, ``utilization_logits``
            and ``adherence_logits`` (trailing dimension of size 1) plus the
            ``context_mask`` and ``response_mask`` arrays. Note that the masks
            are read from ``predictions``, not from ``label_ids``.
        label_ids: Mapping with ``labels_relevance``, ``labels_utilization``
            and ``labels_adherence``.
        threshold: Probability above which a token is predicted positive
            (default 0.5).

    Returns:
        dict with ``precision_<head>``, ``recall_<head>`` and ``f1_<head>``
        for each of relevance, utilization and adherence.
    """
    from sklearn.metrics import precision_recall_fscore_support

    # (head name, key of the mask that selects the tokens scored for this head)
    heads = (
        ("relevance", "context_mask"),
        ("utilization", "context_mask"),
        ("adherence", "response_mask"),
    )

    metrics = {}
    for head, mask_key in heads:
        logits = np.asarray(predictions[f"{head}_logits"])
        # Sigmoid written via tanh: same function, but it cannot overflow for
        # large-magnitude logits the way 1 / (1 + exp(-x)) does.
        probs = 0.5 * (1.0 + np.tanh(0.5 * logits))
        # Binarize and drop the trailing singleton dimension.
        pred = np.squeeze((probs > threshold).astype(int), axis=-1)
        true = np.asarray(label_ids[f"labels_{head}"])

        selected = np.asarray(predictions[mask_key]) == 1
        precision, recall, f1, _ = precision_recall_fscore_support(
            true[selected], pred[selected], average="binary", zero_division=0
        )
        metrics[f"precision_{head}"] = precision
        metrics[f"recall_{head}"] = recall
        metrics[f"f1_{head}"] = f1
    return metrics
# Token-level metrics on the collected validation predictions.
compute_metrics(predicts['pred'], predicts['labels'])

{'precision_relevance': 0.7457282802519016,
 'recall_relevance': 0.8185827552031715,
 'f1_relevance': 0.7804589962557433,
 'precision_utilization': 0.7978925548453965,
 'recall_utilization': 0.834306701186104,
 'f1_utilization': 0.8156934306569343,
 'precision_adherence': 0.9242230930465688,
 'recall_adherence': 0.9993594797004336,
 'f1_adherence': 0.9603238483026372}

In [None]:
import numpy as np
# Raw outputs of the relevance head; assumed to have shape
# (num_samples, max_length, 1) -- TODO confirm against the model's forward().
logits = predicts['pred']['relevance_logits']

# Sigmoid maps the logits to probabilities.
probs = 1 / (np.exp(-logits) + 1)

# Cut at 0.5 (1 = relevant, 0 = not relevant) and drop the trailing singleton
# axis in one go, leaving an array of shape (num_samples, max_length).
preds = np.squeeze((probs > 0.5).astype(np.int32), axis=-1)

# Keep predictions only where the context mask is set.
relevant_context = preds * predicts['pred']['context_mask']

In [39]:
import numpy as np

def _sum_per_example(values):
    """Sum ``values`` over every axis except the leading (example) axis."""
    return values.sum(axis=tuple(range(1, values.ndim)))


def compute_example_relevance(logits, context_mask, threshold=0.5):
    """
    Aggregate token-level logits into one score per example.

    Every logit is passed through a sigmoid and binarised with ``threshold``;
    the score of an example is the number of context tokens predicted
    positive divided by the number of context tokens.  Nothing here is
    specific to relevance: any per-token head output can be aggregated the
    same way.

    Parameters:
      logits: array-like of raw logits, shape (num_samples, max_length) or
              (num_samples, max_length, 1); a trailing singleton axis is
              dropped.
      context_mask: array-like (bool or 0/1), shape (num_samples, max_length);
                    True / 1 marks tokens that belong to the documents.
      threshold: probability cut-off; a token counts as positive when its
                 sigmoid probability is strictly greater (default 0.5).

    Returns:
      relevance_scores: float32 numpy array of shape (num_samples,).
                        Examples without any context token get 0.0.
    """
    # Accept lists/tuples as well as arrays, the same way the mask is handled.
    logits = np.asarray(logits)
    if logits.ndim == 3 and logits.shape[-1] == 1:
        logits = np.squeeze(logits, axis=-1)  # (num_samples, max_length)

    # exp(-x) overflows to inf for strongly negative logits; 1 / (1 + inf) is
    # still the correct 0.0, so only the spurious overflow warning is silenced.
    with np.errstate(over="ignore"):
        probs = 1 / (1 + np.exp(-logits))

    # 1: predicted positive, 0: predicted negative.
    binary_preds = (probs > threshold).astype(np.int32)

    # Boolean masks become 0/1 so they can be multiplied and summed.
    context_mask = np.asarray(context_mask).astype(np.int32)

    # Per-example counts, computed for the whole batch at once.
    total_context_tokens = _sum_per_example(context_mask)
    relevant_tokens = _sum_per_example(binary_preds * context_mask)

    # Guard against division by zero for examples that have no context tokens:
    # divide by a dummy 1 there and overwrite the result with 0.0.
    has_context = total_context_tokens != 0
    safe_totals = np.where(has_context, total_context_tokens, 1)
    relevance_scores = np.where(has_context, relevant_tokens / safe_totals, 0.0)
    return relevance_scores.astype(np.float32)

# Per-example relevance: share of context tokens that the relevance head
# predicts as positive, one value per example in the predictions.
relevance = compute_example_relevance(predicts['pred']['relevance_logits'], predicts['pred']['context_mask'])
# NOTE(review): this prints the whole array; a summary (e.g. relevance[:10])
# would keep the recorded output short.
print("Relevance для каждого примера:", relevance)


Relevance для каждого примера: [0.29324323 0.42156863 0.68654823 0.10714286 0.16936672 0.10344828
 0.37967914 0.14071856 0.93333334 0.7808765  0.11922141 0.29787233
 0.04932183 0.16626506 0.05374823 0.08422175 0.1902439  0.06461087
 0.24835989 0.27447394 0.2698283  0.19732441 0.40429044 0.19077568
 0.73960984 0.5772595  0.08196721 0.15826613 0.25259516 0.18444996
 0.24412751 0.27916667 0.16625917 0.39732888 0.5582233  0.23373494
 0.06642567 0.04941861 0.15272728 0.1634981  0.22625    0.940113
 0.42631578 0.29303548 0.41472867 0.04978663 0.15151516 0.32415903
 0.06138108 0.08044164 0.5062007  0.21484992 0.23412395 0.05479452
 0.13197969 0.13032258 0.2912874  0.04060564 0.73401165 0.04878049
 0.10373066 0.07501996 0.58080316 0.14314115 0.22603978 0.20083103
 0.15829529 0.3782117  0.17027026 0.13565892 0.8887015  0.
 0.0209205  0.37815127 0.5212947  0.17883755 0.35825545 0.63975906
 0.06680881 0.538      0.04723127 0.13917525 0.3866171  0.30414745
 0.7820069  0.3541147  0.59117085 0.31002

In [42]:
# Spread and average of the model-derived per-example relevance scores.
relevance.std(), relevance.mean()

(0.24809651, 0.32028112)

In [47]:

# Pearson correlation between the dataset's relevance_score column and the
# model-derived per-example relevance, read off the 2x2 correlation matrix.
correlation_matrix = np.corrcoef(ds['validation']['relevance_score'], relevance)
print("Матрица корреляции:", correlation_matrix, sep="\n")

# The off-diagonal entry is the correlation between the two series.
r = correlation_matrix[0][1]
print("Коэффициент корреляции Пирсона:", r)


Матрица корреляции:
[[1.         0.78981243]
 [0.78981243 1.        ]]
Коэффициент корреляции Пирсона: 0.7898124309487122


In [48]:
from scipy import stats

# Same correlation via SciPy, which additionally returns the p-value.
# Inputs: the dataset's relevance_score column vs. the model-derived scores.
r, p_value = stats.pearsonr(ds['validation']['relevance_score'], relevance)
print("Коэффициент корреляции Пирсона:", r)
print("p-value:", p_value)

Коэффициент корреляции Пирсона: 0.7898124309487122
p-value: 4.592598180922281e-40


In [50]:
# Spread and average of the dataset's own relevance_score column, for
# comparison with the statistics of the model-derived scores.
np.std(ds['validation']['relevance_score']), np.mean(ds['validation']['relevance_score'])

(0.2438518104729529, 0.29499628415536805)

In [52]:
# Show the validation split's summary (feature names and number of rows).
ds['validation']

Dataset({
    features: ['id', 'question', 'documents', 'response', 'generation_model_name', 'annotating_model_name', 'dataset_name', 'documents_sentences', 'response_sentences', 'sentence_support_information', 'unsupported_response_sentence_keys', 'adherence_score', 'overall_supported_explanation', 'relevance_explanation', 'all_relevant_sentence_keys', 'all_utilized_sentence_keys', 'trulens_groundedness', 'trulens_context_relevance', 'ragas_faithfulness', 'ragas_context_relevance', 'gpt3_adherence', 'gpt3_context_relevance', 'gpt35_utilization', 'relevance_score', 'utilization_score', 'completeness_score'],
    num_rows: 182
})

In [56]:

# Aggregate the utilization head's token predictions into one score per
# example (share of context tokens predicted positive).
utiliz = compute_example_relevance(predicts['pred']['utilization_logits'], predicts['pred']['context_mask'])

# Pearson correlation with the dataset's utilization_score column, read off
# the 2x2 correlation matrix.
correlation_matrix = np.corrcoef(ds['validation']['utilization_score'], utiliz)
print("Матрица корреляции:", correlation_matrix, sep="\n")
r = correlation_matrix[0][1]
print("Коэффициент корреляции Пирсона:", r)

# The same coefficient via SciPy, which also reports the p-value.
r, p_value = stats.pearsonr(ds['validation']['utilization_score'], utiliz)
print("Коэффициент корреляции Пирсона:", r)
print("p-value:", p_value)

Матрица корреляции:
[[1.         0.82217548]
 [0.82217548 1.        ]]
Коэффициент корреляции Пирсона: 0.822175482694904
Коэффициент корреляции Пирсона: 0.822175482694904
p-value: 6.456021549967664e-46


In [57]:
# Correlation between two columns of the dataset itself (utilization_score vs.
# relevance_score); no model predictions are involved in this cell.
r, p_value = stats.pearsonr(ds['validation']['utilization_score'], ds['validation']['relevance_score'])
print("Коэффициент корреляции Пирсона:", r)
print("p-value:", p_value)

Коэффициент корреляции Пирсона: 0.8194606036384028
p-value: 2.214780718793844e-45


In [61]:
# Spread and average of the model-derived per-example utilization scores.
utiliz.std(), utiliz.mean()

(0.19133641, 0.26340395)

In [37]:
# NOTE(review): the recorded output shows this call failing with
# "OSError: Incorrect path_or_model_id", and the offending value is a printed
# DebertaV2Config rather than a path. It looks like a config object ends up
# where a path / repo id is expected -- presumably inside
# DebertaWithHeads.__init__; TODO confirm against the class definition.
DebertaWithHeads.from_pretrained('deberta-finetuned_2/checkpoint-4374')

OSError: Incorrect path_or_model_id: 'DebertaV2Config {
  "_name_or_path": "deberta-finetuned_2/checkpoint-4374",
  "architectures": [
    "DebertaWithHeads"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "contradiction": 2,
    "entailment": 0,
    "neutral": 1
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 1024,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "torch_dtype": "float32",
  "transformers_version": "4.45.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}
'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [31]:
# NOTE(review): the recorded output shows this call failing: `model` is a
# DebertaV2ForSequenceClassification (expects pooler.* / classifier.* weights),
# while the state dict carries relevance_head / utilization_head /
# adherence_head weights. Presumably the weights should be loaded into the
# custom multi-head model instead -- TODO confirm.
model.load_state_dict(state_dict["model"])

RuntimeError: Error(s) in loading state_dict for DebertaV2ForSequenceClassification:
	Missing key(s) in state_dict: "pooler.dense.weight", "pooler.dense.bias", "classifier.weight", "classifier.bias". 
	Unexpected key(s) in state_dict: "relevance_head.weight", "relevance_head.bias", "utilization_head.weight", "utilization_head.bias", "adherence_head.weight", "adherence_head.bias". 

In [20]:
ls deberta-finetuned_2/checkpoint-4374

added_tokens.json  rng_state.pth            tokenizer.json
config.json        scheduler.pt             tokenizer_config.json
model.safetensors  special_tokens_map.json  trainer_state.json
optimizer.pt       spm.model                training_args.bin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
