# Обучите модель классификации букв для задачи расстановки ударения с помощью методов из библиотеки transformers.
- Датасет для обучения можно взять отсюда: https://github.com/Koziev/NLP_Datasets/blob/master/Stress/all_accents.zip
* Напишите класс для Dataset/Dataloder и азбейте данные на случайные train / test сплиты в соотношении 50:50. (1 балл)
* Попробуйте несколько моделей: Bert, Albert, Deberta. (3 балла) Пример конфигурации для deberta: https://huggingface.co/IlyaGusev/ru-word-stress-transformer/blob/main/config.json

## Загрузка и разбиением датасета

In [1]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import random
from sklearn.model_selection import train_test_split

In [72]:
df = pd.read_csv("/kaggle/input/all-accent/all_accents.tsv", sep="\t", header=None, names=["word", "accent"])
df = df.sample(n=400000, random_state=42)

In [73]:
df

Unnamed: 0,word,accent
1484615,теплохода,теплох^ода
392900,задавалам,задав^алам
988387,перерождающая,перерожд^ающая
724329,намолотивший,намолот^ивший
1653846,эволюционировавшими,эволюцион^ировавшими
...,...,...
158688,виртуализируемся,виртуализ^ируемся
1657121,экранными,экр^анными
1154810,приготовляются,приготовл^яются
1154880,пригребалась,пригреб^алась


In [74]:
train_words, test_words, train_accent, test_accent = train_test_split(df['word'], df['accent'], test_size=0.5, random_state=42)

In [75]:
train_words.shape, test_words.shape, train_accent.shape, test_accent.shape

((200000,), (200000,), (200000,), (200000,))

In [76]:
longest_word = max(df["word"], key=len)
print(f"Слово: {longest_word} , Длинна: {len(longest_word)}")

Слово: лланвайрпуллгуингиллгогерихуирндробуллллантисилиогогогох , Длинна: 56


In [77]:
class AccentDataset(Dataset):
    def __init__(self, words, accent_words, tokenizer, max_length=64):
        self.words = words
        self.accent_words = accent_words
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.words)

    def __getitem__(self, idx):
        word = self.words[idx]
        accent_word = self.accent_words[idx]
        
        encoded = self.tokenizer(word, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        labels = [0] * self.max_length
        
        for i, (c1, c2) in enumerate(zip(word, accent_word)):
            if c1 != c2:
                labels[i] = 1
                break
        
        return {
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

## Тестирование разных моделей

In [78]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, TrainingArguments, Trainer

## Bert Albert Debert

In [79]:
model_names = ["DeepPavlov/rubert-base-cased", "albert-base-v2", "microsoft/deberta-v3-base"]

In [80]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    mask = labels != -100
    predictions = predictions[mask]
    labels = labels[mask]

    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average="macro")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [81]:
def train_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    train_dataset = AccentDataset(train_words.tolist(), train_accent.tolist(), tokenizer)
    test_dataset = AccentDataset(test_words.tolist(), test_accent.tolist(), tokenizer)
    
    model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)
    
    training_args = TrainingArguments(
        output_dir="./results",
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=3,
        weight_decay=0.01,
        fp16=True,
        logging_steps=10,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        compute_metrics=compute_metrics
    )
    
    trainer.train()
    metrics = trainer.evaluate() 
    return tokenizer, model, metrics

In [82]:
trained_models = {}
tokenizer_models = {}
model_metrics = {}

In [83]:
for model_name in model_names:
    print(f"Training {model_name}...")
    tokenizer_models[model_name], trained_models[model_name], model_metrics[model_name] = train_model(model_name)

Training DeepPavlov/rubert-base-cased...


Some weights of BertForTokenClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0231,0.021278,0.99148,0.9012,0.796513,0.840735
2,0.014,0.014621,0.994429,0.922307,0.890771,0.905906
3,0.0103,0.013644,0.995331,0.930976,0.91457,0.922608


Training albert-base-v2...


Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0498,0.050043,0.98438,0.49219,0.5,0.496064
2,0.0498,0.050008,0.98438,0.49219,0.5,0.496064
3,0.0505,0.04998,0.98438,0.49219,0.5,0.496064


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  _warn_prf(average, modifier, msg_start, len(result))


Training microsoft/deberta-v3-base...


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0259,0.023884,0.993437,0.916966,0.858067,0.885201
2,0.0219,0.020867,0.994477,0.938226,0.873181,0.903009
3,0.0182,0.020071,0.994889,0.94585,0.880161,0.910307


In [84]:
for model, metrics in model_metrics.items():
    print(f"Metrics {model_name}...")
    print(f"{model}: {metrics}")

Metrics microsoft/deberta-v3-base...
DeepPavlov/rubert-base-cased: {'eval_loss': 0.013644049875438213, 'eval_accuracy': 0.995330546875, 'eval_precision': 0.9309755488075154, 'eval_recall': 0.9145696458884888, 'eval_f1': 0.9226078621069406, 'eval_runtime': 413.2664, 'eval_samples_per_second': 483.949, 'eval_steps_per_second': 15.123, 'epoch': 3.0}
Metrics microsoft/deberta-v3-base...
albert-base-v2: {'eval_loss': 0.049980439245700836, 'eval_accuracy': 0.9843796875, 'eval_precision': 0.49218984375, 'eval_recall': 0.5, 'eval_f1': 0.49606418252555307, 'eval_runtime': 474.3702, 'eval_samples_per_second': 421.612, 'eval_steps_per_second': 13.175, 'epoch': 3.0}
Metrics microsoft/deberta-v3-base...
microsoft/deberta-v3-base: {'eval_loss': 0.02007102221250534, 'eval_accuracy': 0.994889296875, 'eval_precision': 0.9458500941134138, 'eval_recall': 0.8801612871969267, 'eval_f1': 0.9103067739677138, 'eval_runtime': 541.4066, 'eval_samples_per_second': 369.408, 'eval_steps_per_second': 11.544, 'epoch

In [85]:
for model_name, tokenizer in tokenizer_models.items():
    test_word = "корова"

    model = trained_models[model_name].to(device)

    inputs = tokenizer(test_word, padding='max_length', truncation=True, max_length=64, return_tensors='pt').to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        
    predictions = torch.argmax(outputs.logits, dim=-1).cpu()

    print(f"Prediction of model {model_name}: {predictions}")

Prediction of model DeepPavlov/rubert-base-cased: tensor([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Prediction of model albert-base-v2: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Prediction of model microsoft/deberta-v3-base: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
