In [1]:
!pip install --quiet nlp==0.2.0

from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import transformers
import nlp
from datasets import load_dataset, Dataset
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.CRITICAL)

import json
from tqdm.notebook import tqdm

tqdm.pandas()

import pandas as pd
from datasets import Dataset

def load_csv_dataset(train_path, val_path, test_path, column_names, encoding="utf-8-sig"):
    train_data = pd.read_csv(train_path, encoding=encoding, sep=';')
    val_data = pd.read_csv(val_path, encoding=encoding, sep=';')
    test_data = pd.read_csv(test_path, encoding=encoding, sep=';')

    train_data.columns = column_names
    val_data.columns = column_names
    test_data.columns = column_names

    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data)
    test_dataset = Dataset.from_pandas(test_data)

    return {"train": train_dataset, "validation": val_dataset, "test": test_dataset}

sts_columns = ["sentence1", "sentence2", "similarity_score"]
sentiment_columns = ["id", "text", "sentiment"]
paraphrase_columns = ["sentence1", "sentence2"]
qg_columns = ["id", "text", "predicted_question"]
summarization_columns = ["id", "generated_text", "summary"]
title_gen_columns = ["id", "text", "generated_text"]

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
path_to_sentiment_train = "Train_Only_Sentence_SA_train_1.csv"
path_to_sentiment_validation = "Train_Only_Sentence_SA_dev_1.csv"
path_to_sentiment_test = "Train_Only_Sentence_SA_test_1.csv"

path_to_sts_train = "Train_Only_Sentence_STS_train_1.csv"
path_to_sts_validation = "Train_Only_Sentence_STS_dev_1.csv"
path_to_sts_test = "Train_Only_Sentence_STS_test_1.csv"

path_to_paraphrase_train = "Train_Only_Sentence_Para_train_1.csv"
path_to_paraphrase_validation = "Train_Only_Sentence_Para_dev_1.csv"
path_to_paraphrase_test = "Train_Only_Sentence_Para_test_1.csv"

path_to_qg_train = "Train_Only_Sentence_QG_train_1.csv"
path_to_qg_validation = "Train_Only_Sentence_QG_dev_1.csv"
path_to_qg_test = "Train_Only_Sentence_QG_test_1.csv"

path_to_summarization_train = "Train_Only_Sentence_TextSum_train_1.csv"
path_to_summarization_validation = "Train_Only_Sentence_TextSum_dev_1.csv"
path_to_summarization_test = "Train_Only_Sentence_TextSum_test_1.csv"

path_to_title_gen_train = "Train_Only_Sentence_Title_train_1.csv"
path_to_title_gen_validation = "Train_Only_Sentence_Title_dev_1.csv"
path_to_title_gen_test = "Train_Only_Sentence_Title_test_1.csv"

In [4]:
dataset_dict = {
    "sentiment": load_csv_dataset(
        path_to_sentiment_train,
        path_to_sentiment_validation,
        path_to_sentiment_test,
        sentiment_columns
    ),
    "sts": load_csv_dataset(
        path_to_sts_train,
        path_to_sts_validation,
        path_to_sts_test,
        sts_columns
    ),
    "paraphrase": load_csv_dataset(
        path_to_paraphrase_train,
        path_to_paraphrase_validation,
        path_to_paraphrase_test,
        paraphrase_columns
    ),
    "qg": load_csv_dataset(
        path_to_qg_train,
        path_to_qg_validation,
        path_to_qg_test,
        qg_columns
    ),
    "text_summarization": load_csv_dataset(
        path_to_summarization_train,
        path_to_summarization_validation,
        path_to_summarization_test,
        summarization_columns
    ),
    "title_gen": load_csv_dataset(
        path_to_title_gen_train,
        path_to_title_gen_validation,
        path_to_title_gen_test,
        title_gen_columns
    ),
}

In [5]:
dataset_dict

{'sentiment': {'train': Dataset({
      features: ['id', 'text', 'sentiment'],
      num_rows: 20
  }),
  'validation': Dataset({
      features: ['id', 'text', 'sentiment'],
      num_rows: 6
  }),
  'test': Dataset({
      features: ['id', 'text', 'sentiment'],
      num_rows: 6
  })},
 'sts': {'train': Dataset({
      features: ['sentence1', 'sentence2', 'similarity_score'],
      num_rows: 19
  }),
  'validation': Dataset({
      features: ['sentence1', 'sentence2', 'similarity_score'],
      num_rows: 6
  }),
  'test': Dataset({
      features: ['sentence1', 'sentence2', 'similarity_score'],
      num_rows: 6
  })},
 'paraphrase': {'train': Dataset({
      features: ['sentence1', 'sentence2'],
      num_rows: 20
  }),
  'validation': Dataset({
      features: ['sentence1', 'sentence2'],
      num_rows: 6
  }),
  'test': Dataset({
      features: ['sentence1', 'sentence2'],
      num_rows: 6
  })},
 'qg': {'train': Dataset({
      features: ['id', 'text', 'predicted_question'],
   

In [6]:
for task_name, dataset in dataset_dict.items():
    print(task_name)
    print(dataset_dict[task_name]["train"][:2])
    print()

sentiment
{'id': [1, 2], 'text': ['Один из создателей сервиса шестисекундных видеороликов и приложения онлайн-викторины HQ Trivia Колин Кролл умер от передозировки наркотиками в возрасте 35 лет.', 'Об этом сообщает портал TMZ со ссылкой на источник в полиции.'], 'sentiment': ['positive', 'positive']}

sts
{'sentence1': ['Один из создателей сервиса шестисекундных видеороликов и приложения онлайн-викторины HQ Trivia Колин Кролл умер от передозировки наркотиками в возрасте 35 лет.', 'Об этом сообщает портал TMZ со ссылкой на источник в полиции.'], 'sentence2': ['Один из создателей сервиса шестисекундных видеороликов и приложения онлайн-викторины HQ Trivia скончался от передозировки наркотиками.', 'TMZ со ссылкой на источник сообщил о задержании подозреваемого в теракте в Петербурге.'], 'similarity_score': [4.36, 2.68]}

paraphrase
{'sentence1': ['Один из создателей сервиса шестисекундных видеороликов и приложения онлайн-викторины HQ Trivia Колин Кролл умер от передозировки наркотиками в в

In [26]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

def tokenize_for_sentiment(dataset):
    # Токенизация для задачи анализа тональности
    encodings = tokenizer(dataset['text'], truncation=True, padding=True)
    return encodings

def tokenize_for_sts(dataset):
    # Токенизация для задачи Semantic Text Similarity
    encodings = tokenizer(dataset['sentence1'], dataset['sentence2'], truncation=True, padding=True)
    return encodings

def tokenize_for_paraphrase(dataset):
    # Токенизация для задачи Paraphrase Identification
    encodings = tokenizer(dataset['sentence1'], dataset['sentence2'], truncation=True, padding=True)
    return encodings

def tokenize_for_qg(dataset):
    # Токенизация для задачи Question Generation
    encodings = tokenizer(dataset['text'], truncation=True, padding=True)
    return encodings

def tokenize_for_text_summarization(dataset):
    # Токенизация для задачи Text Summarization
    encodings = tokenizer(dataset['generated_text'], truncation=True, padding=True)
    return encodings

def tokenize_for_title_gen(dataset):
    # Токенизация для задачи Title Generation
    encodings = tokenizer(dataset['text'], truncation=True, padding=True)
    return encodings

# Затем, примените соответствующую функцию токенизации к каждому поддатасету
encoded_datasets = {}
encoded_datasets['sentiment'] = {split: tokenize_for_sentiment(dataset) for split, dataset in dataset_dict['sentiment'].items()}
encoded_datasets['sts'] = {split: tokenize_for_sts(dataset) for split, dataset in dataset_dict['sts'].items()}
encoded_datasets['paraphrase'] = {split: tokenize_for_paraphrase(dataset) for split, dataset in dataset_dict['paraphrase'].items()}
encoded_datasets['qg'] = {split: tokenize_for_qg(dataset) for split, dataset in dataset_dict['qg'].items()}
encoded_datasets['text_summarization'] = {split: tokenize_for_text_summarization(dataset) for split, dataset in dataset_dict['text_summarization'].items()}
encoded_datasets['title_gen'] = {split: tokenize_for_title_gen(dataset) for split, dataset in dataset_dict['title_gen'].items()}


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [19]:
from transformers import BertModel, BertConfig, AutoModel
import torch.nn as nn

class MultiTaskModel(nn.Module):
    def __init__(self, config):
        super(MultiTaskModel, self).__init__()

        self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased', config=config)
        
        # Выходные слои для каждой задачи
        self.sentiment_output = nn.Linear(config.hidden_size, 3) # Sentiment Analysis
        self.sts_output = nn.Linear(config.hidden_size, 1) # STS
        self.qg_output = nn.Linear(config.hidden_size, config.vocab_size) # QG
        self.title_cond_text_output = nn.Linear(config.hidden_size, config.vocab_size) # Title conditioned text generation
        self.summarization_output = nn.Linear(config.hidden_size, config.vocab_size) # Text summarization
        self.paraphrase_output = nn.Linear(config.hidden_size, config.vocab_size) # Paraphrase generation

    def forward(self, task_name, input_ids, attention_mask, token_type_ids=None):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = bert_output.pooler_output
        
        if task_name == "sentiment":
            output = self.sentiment_output(pooled_output)
        elif task_name == "sts":
            output = self.sts_output(pooled_output)
        elif task_name == "qg":
            output = self.qg_output(pooled_output)
        elif task_name == "title_gen":
            output = self.title_cond_text_output(pooled_output)
        elif task_name == "text_summarization":
            output = self.summarization_output(pooled_output)
        elif task_name == "paraphrase":
            output = self.paraphrase_output(pooled_output)
        else:
            raise ValueError("Invalid task name")
        
        return output

from transformers import BertModel   

config = BertConfig.from_pretrained('DeepPavlov/rubert-base-cased')
config.output_hidden_states = True
config.output_attentions = True

multi_task_model = MultiTaskModel(config)

from torch.utils.data import Dataset

class TaskDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_length, task):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.task = task
        self.encoded_inputs = self.tokenize_task_data(data, task)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data = self.data[idx]
        encoded_input = self.encoded_inputs[self.task][idx]
        item = {key: torch.tensor(val) for key, val in encoded_input.items()}
        item['id'] = data['id']
        return item

    def tokenize_task_data(self, data, task):
        if task == 'sentiment':
            encodings = tokenize_for_sentiment(data)
        elif task == 'sts':
            encodings = tokenize_for_sts(data)
        elif task == 'paraphrase':
            encodings = tokenize_for_paraphrase(data)
        elif task == 'qg':
            encodings = tokenize_for_qg(data)
        elif task == 'text_summarization':
            encodings = tokenize_for_text_summarization(data)
        elif task == 'title_gen':
            encodings = tokenize_for_title_gen(data)
        else:
            raise ValueError("Invalid task name")
        return {task: encodings}


    
# Функция для получения DataLoader для задачи
def get_train_dataloader(task, batch_size=1, max_seq_length=128):
    train_data = dataset_dict[task]['train']
    dataset = TaskDataset(train_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return dataloader

def get_val_dataloader(task, batch_size=1, max_seq_length=128):
    val_data = dataset_dict[task]['validation']
    dataset = TaskDataset(val_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    return dataloader

def get_test_dataloader(task, batch_size=1, max_seq_length=128):
    test_data = dataset_dict[task]['test']
    dataset = TaskDataset(test_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    return dataloader

from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup

class TaskSpecificModel(MultiTaskModel):
    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.1)

    def forward(self, task_name, input_ids, attention_mask, token_type_ids=None):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = bert_output.pooler_output
        pooled_output = self.dropout(pooled_output)
        
        if task_name == "sentiment":
            output = self.sentiment_output(pooled_output)
        elif task_name == "sts":
            output = self.sts_output(pooled_output)
        elif task_name == "qg":
            output = self.qg_output(pooled_output)
        elif task_name == "title_gen":
            output = self.title_cond_text_output(pooled_output)
        elif task_name == "text_summarization":
            output = self.summarization_output(pooled_output)
        elif task_name == "paraphrase":
            output = self.paraphrase_output(pooled_output)
        else:
            raise ValueError("Invalid task name")
        
        return output

        

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
# Создание словаря меток для каждой из задач
task_num_labels = {
    "sts": 1,
    "sentiment": 3,
    "paraphrase": 1,
    "text_summarization": 1,
    "title_gen": 1,
    "qg": 1
    
}

# Создание конфигураций модели для каждой задачи
task_configs = {
    "sts": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6),
    "sentiment": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6),
    "paraphrase": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6),
    "text_summarization": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6),
    "qg": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6),
    "title_gen": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=6, num_attention_heads=6)
    
}

# Создание экземпляров модели для каждой задачи с задаче-специфическими гиперпараметрами
task_models = {task: TaskSpecificModel(config, task_num_labels[task]) for task, config in task_configs.items()}

# Настройка параметров оптимизатора и планировщика для каждой задачи
task_specific_learning_rates = {
    "sts": 3e-5,
    "sentiment": 3e-5,
    "paraphrase": 3e-5,
    "sentiment": 3e-5,
    "text_summarization": 3e-5,
    "title_gen": 3e-5,
    "qg": 3e-5
}

# Создание оптимизаторов для каждой задачи
task_optimizers = {task: AdamW(model.parameters(), lr=lr) for task, (model, lr) in zip(task_models.keys(), zip(task_models.values(), task_specific_learning_rates.values()))}

# Создание планировщиков для каждой задачи
num_training_steps = 1000  
num_warmup_steps = int(0.1 * num_training_steps)

task_schedulers = {task: get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps) for task, optimizer in task_optimizers.items()}

from transformers import AutoTokenizer
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

batch_size = 1
num_epochs = 3
max_seq_length = 128

train_dataloaders = {task: get_train_dataloader(task, batch_size, max_seq_length) for task in task_models.keys()}

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['bert.encoder.layer.8.output.LayerNorm.weight', 'bert.encoder.layer.9.output.LayerNorm.bias', 'bert.encoder.layer.6.attention.output.LayerNorm.weight', 'bert.encoder.layer.11.attention.self.value.weight', 'bert.encoder.layer.10.output.dense.bias', 'cls.predictions.decoder.bias', 'bert.encoder.layer.7.attention.self.query.weight', 'bert.encoder.layer.9.output.dense.weight', 'bert.encoder.layer.11.attention.self.value.bias', 'bert.encoder.layer.11.output.dense.bias', 'bert.encoder.layer.7.output.LayerNorm.weight', 'bert.encoder.layer.7.attention.self.query.bias', 'bert.encoder.layer.8.output.LayerNorm.bias', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.9.attention.output.LayerNorm.bias', 'bert.encoder.layer.10.attention.output.LayerNorm.weight', 'bert.encoder.layer.10.output.LayerNorm.weight', 'bert.encoder.layer.7.attention.output.dense.weight', '

In [21]:
train_dataloaders

{'sts': <torch.utils.data.dataloader.DataLoader at 0x1e758123940>,
 'sentiment': <torch.utils.data.dataloader.DataLoader at 0x1e772fafb80>,
 'paraphrase': <torch.utils.data.dataloader.DataLoader at 0x1e772fafcd0>,
 'text_summarization': <torch.utils.data.dataloader.DataLoader at 0x1e772fafd60>,
 'qg': <torch.utils.data.dataloader.DataLoader at 0x1e77828a910>,
 'title_gen': <torch.utils.data.dataloader.DataLoader at 0x1e778293af0>}

In [22]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error

# Определение функции потерь для каждой задачи
task_criterion = {
    "sts": nn.CrossEntropyLoss(),
    "sentiment": nn.CrossEntropyLoss(),
    "qg": nn.CrossEntropyLoss(),
    "paraphrase": nn.CrossEntropyLoss(),
    "title_gen": nn.CrossEntropyLoss(),
    "text_summarization": nn.CrossEntropyLoss()
}

In [23]:
# Создание DataLoader для каждой задачи
batch_size = 1
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = {task: DataLoader(dataset_dict[task]['train'], batch_size=batch_size, collate_fn=data_collator) for task in task_models}
val_dataloader = {task: DataLoader(dataset_dict[task]['validation'], batch_size=batch_size, collate_fn=data_collator) for task in task_models}

In [24]:
# Функция для обучения модели
def train(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0.0
    
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss

# Функция для оценки модели
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            predictions = torch.argmax(outputs, dim=-1)
            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss, all_predictions, all_labels

In [25]:
num_epochs = 3

for task in task_models:
    print(f"Training and evaluating model for task: {task}")
    model = task_models[task].to(device)
    optimizer = task_optimizers[task]
    scheduler = task_schedulers[task]
    criterion = task_criterion[task]

    for epoch in range(num_epochs):
        train_loss = train(model, train_dataloader[task], optimizer, scheduler, criterion, device)
    val_loss, val_predictions, val_labels = evaluate(model, val_dataloader[task], criterion, device)

    print(f"Epoch: {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Training and evaluating model for task: sts


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['sentence1', 'sentence2', 'similarity_score']

In [None]:
for task in task_models:
    print(f"Training and evaluating model for task: {task}")
    model = task_models[task].to(device)
    optimizer = task_optimizers[task]
    scheduler = task_schedulers[task]
    criterion = task_criterion[task]

    for epoch in range(num_epochs):
        train_loss = train(model, train_dataloader[task], optimizer, scheduler, criterion, device)
    val_loss, val_predictions, val_labels = evaluate(model, val_dataloader[task], criterion, device)

    print(f"Epoch: {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

In [None]:
# Вычисление метрик качества для каждой задачи
if task in ["ner", "sentiment]:
    accuracy = accuracy_score(val_labels, val_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')
    print(f"Task: {task}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
elif task == "sts":
    mse = mean_squared_error(val_labels, val_predictions)
    print(f"Task: {task}, MSE: {mse:.4f}")
else:
    print(f"Invalid task: {task}")

print("Training and evaluation completed.")