In [103]:
!pip install --quiet nlp==0.2.0

from transformers import BertTokenizer, BertForMaskedLM
import numpy as np
import torch
import torch.nn as nn
import pandas as pd
import transformers
import nlp
from datasets import load_dataset, Dataset
import matplotlib.pyplot as plt
import logging
logging.basicConfig(level=logging.CRITICAL)

import json
from tqdm.notebook import tqdm

tqdm.pandas()

import pandas as pd
from datasets import Dataset

def load_csv_dataset(train_path, val_path, test_path, column_names, encoding="utf-8-sig"):
    train_data = pd.read_csv(train_path, encoding=encoding, sep=';')
    val_data = pd.read_csv(val_path, encoding=encoding, sep=';')
    test_data = pd.read_csv(test_path, encoding=encoding, sep=';')

    train_data.columns = column_names
    val_data.columns = column_names
    test_data.columns = column_names

    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data)
    test_dataset = Dataset.from_pandas(test_data)

    return {"train": train_dataset, "validation": val_dataset, "test": test_dataset}

ner_columns = ["text", "ner"]
sentiment_columns = ["id", "text", "sentiment", "sentiment_1"]

In [104]:
path_to_ner_train = "Train_NER_FULL_train.csv"
path_to_ner_validation = "Train_NER_FULL_dev.csv"
path_to_ner_test = "Train_NER_FULL_test.csv"

path_to_sentiment_train = "Train_Only_Sentence_SA_train_1_1.csv"
path_to_sentiment_validation = "Train_Only_Sentence_SA_test_1_1.csv"
path_to_sentiment_test = "Train_Only_Sentence_SA_test_1_1.csv"

In [105]:
dataset_dict = {
    "ner": load_csv_dataset(
        path_to_ner_train,
        path_to_ner_validation,
        path_to_ner_test,
        ner_columns
    ),
    "sentiment": load_csv_dataset(
        path_to_sentiment_train,
        path_to_sentiment_validation,
        path_to_sentiment_test,
        sentiment_columns
    )
}

In [106]:
dataset_dict

{'ner': {'train': Dataset({
      features: ['text', 'ner'],
      num_rows: 9999
  }),
  'validation': Dataset({
      features: ['text', 'ner'],
      num_rows: 1000
  }),
  'test': Dataset({
      features: ['text', 'ner'],
      num_rows: 1000
  })},
 'sentiment': {'train': Dataset({
      features: ['id', 'text', 'sentiment', 'sentiment_1'],
      num_rows: 9999
  }),
  'validation': Dataset({
      features: ['id', 'text', 'sentiment', 'sentiment_1'],
      num_rows: 999
  }),
  'test': Dataset({
      features: ['id', 'text', 'sentiment', 'sentiment_1'],
      num_rows: 999
  })}}

In [107]:
for task_name, dataset in dataset_dict.items():
    print(task_name)
    print(dataset_dict[task_name]["train"][:5])
    print()

ner
{'text': ['На', 'севере', 'граничит', 'с', 'Латвией'], 'ner': ['O', 'O', 'O', 'O', 'I-LOC']}

sentiment
{'id': [1, 2, 3, 4, 5], 'text': ['Один из создателей сервиса шестисекундных видеороликов и приложения онлайн-викторины HQ Trivia Колин Кролл умер от передозировки наркотиками в возрасте 35 лет.', 'Об этом сообщает портал TMZ со ссылкой на источник в полиции.', 'По информации издания, утром в воскресенье, 16 декабря, девушка Кролла позвонила в полицию, заявив, что не может с ним связаться.', 'После этого сотрудники правоохранительных органов приехали в квартиру в центре Манхэттена и обнаружили его тело в спальне.', 'В комнате были найдены также следы наркотиков и устройства для их употребления.'], 'sentiment': ['positive', 'positive', 'neutral', 'negative', 'neutral'], 'sentiment_1': [2, 2, 0, 1, 0]}



In [138]:
from transformers import DataCollatorForTokenClassification

In [166]:
from transformers import BertModel, BertConfig, AutoModel
import torch.nn as nn

class MultiTaskModel(nn.Module):
    def __init__(self, config, num_ner_labels, num_sentiment_labels):
        super(MultiTaskModel, self).__init__()
        
        # Общая часть модели для всех задач
        self.bert = BertModel.from_pretrained('DeepPavlov/rubert-base-cased', config=config)
        
        # Выходные слои для каждой задачи
        self.ner_output = nn.Linear(config.hidden_size, num_ner_labels)
        self.sentiment_output = nn.Linear(config.hidden_size, num_sentiment_labels)

    def forward(self, task_name, input_ids, attention_mask, token_type_ids=None):
        bert_output = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = bert_output.pooler_output
        
        if task_name == "ner":
            output = self.ner_output(pooled_output)
        elif task_name == "sentiment":
            output = self.sentiment_output(pooled_output)
        else:
            raise ValueError("Invalid task name")
        
        return output

config = BertConfig.from_pretrained('DeepPavlov/rubert-base-cased')
config.output_hidden_states = True
config.output_attentions = True

num_ner_labels = 9
num_sentiment_labels = 3

multi_task_model = MultiTaskModel(config, num_ner_labels, num_sentiment_labels)

def get_ner_labels(tokens, entities, tokenizer, max_length):
    labels = ['O'] * len(tokens)
    if isinstance(entities, list):
        for entity in entities:
            entity_tokens = tokenizer.tokenize(entity)
            for i in range(len(tokens) - len(entity_tokens) + 1):
                if tokens[i:i + len(entity_tokens)] == entity_tokens:
                    labels[i] = 'B-' + entity.split('-')[0]
                    for j in range(1, len(entity_tokens)):
                        labels[i + j] = 'I-' + entity.split('-')[0]
                    break
    pad_len = max_length - len(labels)
    if pad_len > 0:
        labels.extend(['O'] * pad_len)
    else:
        labels = labels[:max_length]

    return labels


def encode_examples(examples, tokenizer, max_length, task):
    encoded_inputs = {}
    for example in examples:
        text = example.get('text')
        if text is None:
            continue
        if task == 'ner':
            entities = example.get('ner')
            if entities is not None:
                encoded_input = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)
                labels = get_ner_labels(text, entities, tokenizer, max_length)
                encoded_input['labels'] = labels
                encoded_inputs.setdefault('ner', []).append(encoded_input)
        else:
            sentiment = example.get('sentiment_1')
            if sentiment is not None:
                encoded_input = tokenizer(text, truncation=True, padding='max_length', max_length=max_length)
                encoded_input['labels'] = sentiment
                encoded_inputs.setdefault('sentiment', []).append(encoded_input)

    return encoded_inputs


from torch.utils.data import Dataset

class TaskDataset(Dataset):
    def __init__(self, data, tokenizer, max_seq_length, task):
        self.data = data
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.task = task
        self.encoded_inputs = encode_examples(data, tokenizer, max_seq_length, task)

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        data = self.data[idx]
        encoded_input = self.encoded_inputs[idx]
        item = {key: torch.tensor(val) for key, val in encoded_input.items()}
        item['id'] = data['id']
        return item

        encoding = self.tokenizer(text, truncation=True, max_length=self.max_seq_length, padding="max_length")
        
        if self.task == "ner":
            label_encoding = self.tokenizer(label, truncation=True, max_length=self.max_seq_length, padding="max_length", is_split_into_words=True, return_offsets_mapping=True)
            label_tensor = torch.tensor(label_encoding["input_ids"], dtype=torch.long)
        else:
            # Здесь обрабатываются метки для других задач
            label_tensor = torch.tensor(label, dtype=torch.long)

        # Создание словаря с ключами input_ids, attention_mask и labels
        return {
            "input_ids": torch.tensor(encoding["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(encoding["attention_mask"], dtype=torch.long),
            "labels": label_tensor,
        }
    
# Функция для получения DataLoader для задачи
def get_train_dataloader(task, batch_size=1, max_seq_length=128):
    train_data = dataset_dict[task]['train']
    dataset = TaskDataset(train_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    return dataloader

def get_val_dataloader(task, batch_size=1, max_seq_length=128):
    val_data = dataset_dict[task]['validation']
    dataset = TaskDataset(val_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    return dataloader

def get_test_dataloader(task, batch_size=1, max_seq_length=128):
    test_data = dataset_dict[task]['test']
    dataset = TaskDataset(test_data, tokenizer, max_seq_length, task)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    return dataloader

from transformers import BertConfig, AdamW, get_linear_schedule_with_warmup

class TaskSpecificModel(MultiTaskModel):
    def __init__(self, config, num_labels):
        super().__init__(config, num_labels, num_labels)
        

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [167]:
# Создание словаря меток для каждой из задач
task_num_labels = {
    "ner": 9,
    "sentiment": 3
}

# Создание конфигураций модели для каждой задачи
task_configs = {
    "ner": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=8, num_attention_heads=8),
    "sentiment": BertConfig.from_pretrained('DeepPavlov/rubert-base-cased', num_hidden_layers=8, num_attention_heads=8)
}

# Создание экземпляров модели для каждой задачи с задаче-специфическими гиперпараметрами
task_models = {task: TaskSpecificModel(config, task_num_labels[task]) for task, config in task_configs.items()}

# Настройка параметров оптимизатора и планировщика для каждой задачи
task_specific_learning_rates = {
    "ner": 3e-5,
    "sentiment": 3e-5
}

# Создание оптимизаторов для каждой задачи
task_optimizers = {task: AdamW(model.parameters(), lr=lr) for task, (model, lr) in zip(task_models.keys(), zip(task_models.values(), task_specific_learning_rates.values()))}

# Создание планировщиков для каждой задачи
num_training_steps = 1000  
num_warmup_steps = int(0.1 * num_training_steps)

task_schedulers = {task: get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps) for task, optimizer in task_optimizers.items()}

from transformers import AutoTokenizer
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

batch_size = 1
num_epochs = 3
max_seq_length = 128

train_dataloaders = {task: get_train_dataloader(task, batch_size, max_seq_length) for task in task_models.keys()}

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['bert.encoder.layer.10.attention.output.dense.bias', 'bert.encoder.layer.8.attention.output.dense.weight', 'bert.encoder.layer.9.output.dense.bias', 'bert.encoder.layer.11.intermediate.dense.weight', 'bert.encoder.layer.8.output.LayerNorm.weight', 'bert.encoder.layer.10.output.dense.bias', 'bert.encoder.layer.9.intermediate.dense.weight', 'bert.encoder.layer.11.attention.self.key.weight', 'bert.encoder.layer.8.attention.self.value.bias', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.encoder.layer.11.attention.self.key.bias', 'cls.predictions.transform.LayerNorm.weight', 'bert.encoder.layer.9.attention.self.query.bias', 'bert.encoder.layer.11.attention.self.value.bias', 'bert.encoder.layer.11.attention.output.dense.bias', 'cls.predictions.transform.dense.bias', 'bert.encoder.layer.8.attention.output.LayerNorm.bias', 'bert.encoder.layer.9.output.LayerNorm.weight', 'b

In [168]:
train_dataloaders

{'ner': <torch.utils.data.dataloader.DataLoader at 0x207fca2fa90>,
 'sentiment': <torch.utils.data.dataloader.DataLoader at 0x20804865fa0>}

In [170]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, mean_squared_error

# Определение функции потерь для каждой задачи
task_criterion = {
    "ner": nn.CrossEntropyLoss(),
    "sentiment": nn.CrossEntropyLoss()
}

In [171]:
# Создание DataLoader для каждой задачи
batch_size = 1
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = {task: DataLoader(dataset_dict[task]['train'], batch_size=batch_size, collate_fn=data_collator) for task in task_models}
val_dataloader = {task: DataLoader(dataset_dict[task]['validation'], batch_size=batch_size, collate_fn=data_collator) for task in task_models}

In [172]:
# Функция для обучения модели
def train(model, dataloader, optimizer, scheduler, criterion, device):
    model.train()
    total_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        item, keys = batch
        input_ids, attention_mask, labels = item[keys[0]].to(device), item[keys[1]].to(device), item[keys[2]].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()

    return total_loss / len(dataloader)

# Функция для оценки модели
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    all_predictions = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch['input_ids'].to(device), batch['attention_mask'].to(device), batch['labels'].to(device)
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            
            predictions = torch.argmax(outputs, dim=-1)
            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    
    avg_loss = total_loss / len(dataloader)
    return avg_loss, all_predictions, all_labels

In [173]:
num_epochs = 3

for task in task_models:
    print(f"Training and evaluating model for task: {task}")
    model = task_models[task].to(device)
    optimizer = task_optimizers[task]
    scheduler = task_schedulers[task]
    criterion = task_criterion[task]

    for epoch in range(num_epochs):
        train_loss = train(model, train_dataloader[task], optimizer, scheduler, criterion, device)
    val_loss, val_predictions, val_labels = evaluate(model, val_dataloader[task], criterion, device)

    print(f"Epoch: {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

Training and evaluating model for task: ner


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['text', 'ner']

In [None]:
for task in task_models:
    print(f"Training and evaluating model for task: {task}")
    model = task_models[task].to(device)
    optimizer = task_optimizers[task]
    scheduler = task_schedulers[task]
    criterion = task_criterion[task]

    for epoch in range(num_epochs):
        train_loss = train(model, train_dataloader[task], optimizer, scheduler, criterion, device)
    val_loss, val_predictions, val_labels = evaluate(model, val_dataloader[task], criterion, device)

    print(f"Epoch: {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

In [None]:
# Вычисление метрик качества для каждой задачи
if task in ["ner", "sentiment]:
    accuracy = accuracy_score(val_labels, val_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_predictions, average='weighted')
    print(f"Task: {task}, Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-score: {f1:.4f}")
elif task == "sts":
    mse = mean_squared_error(val_labels, val_predictions)
    print(f"Task: {task}, MSE: {mse:.4f}")
else:
    print(f"Invalid task: {task}")

print("Training and evaluation completed.")