In [1]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torchmetrics import F1Score
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType
import pytorch_lightning as pl
import tqdm
from torch.nn import functional as F
from pytorch_lightning.loggers import TensorBoardLogger
from torch.nn.utils.rnn import pad_sequence
from huggingface_hub.hf_api import HfFolder
HfFolder.save_token('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')

In [2]:
miracl_ru = load_dataset('miracl/miracl', 'ru')

In [8]:
#training set:
train_data = []
for data in miracl_ru['train']:
    for neg in data['negative_passages']:
        temp_dict = {}
        temp_dict['question'] = data['query']
        temp_dict['answer'] = neg['text']
        temp_dict['label'] = 0
        train_data.append(temp_dict)
    for pos in data['positive_passages']:
        temp_dict = {}
        temp_dict['question'] = data['query']
        temp_dict['answer'] = pos['text']
        temp_dict['label'] = 1
        train_data.append(temp_dict)
train_data

[{'question': 'Когда был спущен на воду первый миноносец «Спокойный»?',
  'answer': 'Стерегу́щий\xa0— русский миноносец типа «Сокол». Заложен в 1900 году на Невском заводе (Санкт-Петербург), имея тогда название «Кулик». Спущен на воду в июне 1902 году под именем «Стерегущий» в Порт-Артуре, куда по частям был доставлен железнодорожным транспортом. В строй вступил в августе 1903 года. Погиб в неравном бою с превосходящими силами японцев а во время русско-японской войны. Против миноносцев «Стерегущий» и «Решительный» действовало четыре японских миноносца, которые по вооружению, водоизмещению и количеству экипажа существенно превосходили русские миноносцы.',
  'label': 0},
 {'question': 'Когда был спущен на воду первый миноносец «Спокойный»?',
  'answer': 'Эскадренный миноносец заложен в 1900 году на Невском заводе в Санкт-Петербурге под названием «Перепел». 27 февраля 1902 года начат сборкой, и 4 мая 1903 года спущен на воду в Порт-Артуре. Принят в казну в декабре 1903 года. С 9 марта 190

In [10]:
class QADataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        question = self.data[idx]['question']
        answer = self.data[idx]['answer']
        label = self.data[idx]['label']

        inputs = self.tokenizer(
            question, answer, 
            padding='max_length', 
            truncation=True, 
            max_length=self.max_length, 
            return_tensors="pt"
        )

        inputs = {key: val.squeeze(0) for key, val in inputs.items()}  # убираем лишнюю размерность
        inputs['labels'] = torch.tensor(label, dtype=torch.long)
        return inputs


In [11]:

class QAClassificationModel(pl.LightningModule):
    def __init__(self, model_name='bert-base-multilingual-cased', learning_rate=5e-5):
        super().__init__()
        self.learning_rate = learning_rate
        self.save_hyperparameters()

        # Инициализируем mBERT
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

        # Настройка LoRA
        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none"
        )
        self.model = get_peft_model(self.model, lora_config)

        # Инициализируем F1-метрику для бинарной классификации
        self.f1_metric = F1Score(num_classes=2, average='macro', task='binary')

    def forward(self, input_ids, attention_mask, token_type_ids, labels=None):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, labels=labels)

    def training_step(self, batch, batch_idx):
        outputs = self(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'], 
            token_type_ids=batch['token_type_ids'], 
            labels=batch['labels']
        )
        loss = outputs.loss
        
        # Логируем train loss
        self.log('train_loss', loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        outputs = self(
            input_ids=batch['input_ids'], 
            attention_mask=batch['attention_mask'], 
            token_type_ids=batch['token_type_ids'], 
            labels=batch['labels']
        )
        loss = outputs.loss

        # Предсказания и реальные значения для расчета метрики
        preds = torch.argmax(outputs.logits, dim=1)
        labels = batch['labels']
        
        # Логируем val loss
        self.log('val_loss', loss, prog_bar=True, logger=True)
        
        # Обновляем и логируем F1-score
        f1 = self.f1_metric(preds, labels)
        self.log('val_f1', f1, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.learning_rate)

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
train_dataset = QADataset(train_data, tokenizer, max_length=128)
train_data_loader = DataLoader(train_dataset, batch_size=128)


In [13]:
# Инициализируем логгер TensorBoard
logger = TensorBoardLogger("QA_logs", name="qa_model")

# Создаем экземпляр Trainer с логгером
trainer = pl.Trainer(
    max_epochs=10,
    logger=logger,
    accelerator="gpu",
    log_every_n_steps=10  # Логировать каждые 10 шагов
)

model = QAClassificationModel()


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
import warnings
warnings.filterwarnings("ignore", message="Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some toke")

In [None]:
trainer.fit(model, train_data_loader)

/home/user/.local/lib/python3.10/site-packages/pytorch_lightning/trainer/configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
You are using a CUDA device ('NVIDIA A100-PCIE-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
2024-10-14 04:17:22.604595: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-14 04:17:22.743988: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8463] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-14 04:17:22.783511: E ex

Training: |          | 0/? [00:00<?, ?it/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai