Установим необходимые библиотеки

In [1]:
!pip install peft
!pip install evaluate
!pip install augmentex

Collecting peft
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.10.0
Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting augmentex
  Downloading augmentex-1.2.1-py3-none-any.whl.metadata (11 kB)
Downloading augmentex-1.2.1-py3-none-any.whl (22.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.5/22.5 MB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: augmentex
Successfully in

In [2]:
import pandas as pd
import torch
import wandb

from pathlib import Path
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, AutoModel, AutoTokenizer
from datasets import Dataset
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from safetensors import safe_open
from safetensors.torch import load_model
from evaluate import evaluator
from peft import LoraConfig, get_peft_model
from sklearn.preprocessing import LabelEncoder
from augmentex import WordAug, CharAug

rusentne = Path('/kaggle/input/rusentne')
intents = Path('/kaggle/input/qa-intents-dataset-university-domain')

2024-04-28 11:19:20.420643: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-28 11:19:20.420765: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-28 11:19:20.546934: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Будем работать с данными с [соревнования](https://github.com/dialogue-evaluation/RuSentNE-evaluation) на Kaggle по анализу тональности к именованным сущностям в новостных текстах.

В данных есть заранее размеченные сущности в рамках отдельного предложения и метка тональности (нейтральное, положительное, отрицательное). В качестве метрики качества будем использовать F1-macro с усреднением по положительным и отрицательным меткам.

В качестве модели будем исользовать RuBERT с возможно, некоторыми модификациями.

In [3]:
train_df = pd.read_csv(rusentne / 'train_data.csv', sep='\t')
valid_df = pd.read_csv(rusentne / 'validation_data_labeled.csv', sep='\t')
test_df = pd.read_csv(rusentne / 'final_data.csv', sep='\t')

In [4]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"],
                                 is_split_into_words=True,
                                 return_tensors='pt')
    labels = []

    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            else:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
        labels.append(label_ids)
    tokenized_inputs["start"] = [(torch.argwhere(torch.tensor(labels) != -100)).min().item()]
    tokenized_inputs["end"] = [(torch.argwhere(torch.tensor(labels) != -100)).max().item()]
    tokenized_inputs["labels"] = examples["labels"]
    
    ret = {}
    for k, v in tokenized_inputs.items():
        ret[k] = v[0]
        
    return ret
    
    
def preprocess_datasets(el):
    sentence = el['sentence']
    start = el['entity_pos_start_rel']
    end = el['entity_pos_end_rel']
    sentence = [sentence[:start]] + [sentence[start:end]] + [sentence[end:]]
    label = el['label']
    if label == -1:
        label = 0
    elif label == 0:
        label = 2
    ner_tags = [-100, label, -100]
    d = {'tokens' : [sentence], 'ner_tags' : [ner_tags], 'labels' : [label]}
    
    return tokenize_and_align_labels(d)

tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruBert-base')
columns_remove = ["label", "sentence", 'entity', 'entity_tag', 'entity_pos_start_rel', 'entity_pos_end_rel', 'token_type_ids']
train_dataset = Dataset.from_dict(train_df).map(preprocess_datasets).remove_columns(columns_remove)
val_dataset = Dataset.from_dict(valid_df).map(preprocess_datasets).remove_columns(columns_remove)

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

  0%|          | 0/6637 [00:00<?, ?ex/s]

  0%|          | 0/2845 [00:00<?, ?ex/s]

In [5]:
train_dataset[0]

{'input_ids': [101,
  47351,
  1622,
  381,
  151,
  94225,
  687,
  150,
  27165,
  58329,
  380,
  160,
  263,
  27082,
  160,
  76650,
  66365,
  478,
  158,
  98780,
  474,
  33855,
  121,
  5077,
  789,
  100579,
  2568,
  7600,
  55367,
  45670,
  843,
  17693,
  9608,
  10980,
  121,
  750,
  1778,
  3280,
  9338,
  126,
  102],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'start': 0,
 'end': 31,
 'labels': 2}

In [31]:
class RuBertClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained('ai-forever/ruBert-base')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.classifier = nn.Sequential(
                nn.Linear(768, 768),
                nn.ReLU(),
                nn.Linear(768, 3)
        )
    
    def forward(self, input_ids, attention_mask, start, end, labels):
#         input_ids = x['input_ids']
#         attention_mask = x['attention_mask']
#         start = x['start']
#         end = x['end']
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        
        ranges = [torch.arange(start[i], end[i]).int().tolist() for i in range(len(input_ids))]
        
        clsvecs = torch.stack([out[i, ranges[i], :].mean(dim=0) for i in range(len(input_ids))])
        out = self.classifier(clsvecs)
        
        return out

In [41]:
roflan = RuBertClassifier().to('cuda')

Модель обучалась ранее, сохранили чекпоинт

In [8]:
load_model(roflan, '/kaggle/input/notebookfdc54ea563/checkpoint-10790/model.safetensors')

(set(), [])

Добавим модификации в виде LoRA адаптеров

In [14]:
config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=["query", "value"],
    lora_dropout=0.5,
    bias="none",
    modules_to_save=["classifier"],
)
roflan = get_peft_model(roflan, config)
roflan.print_trainable_parameters()

trainable params: 1,772,547 || all params: 180,672,774 || trainable%: 0.9810814107498012


In [10]:
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    logits = model(**inputs)
    loss = nn.CrossEntropyLoss()(logits.squeeze(), labels.squeeze())
    if return_outputs:
        return loss, {"logits" : logits}
    return loss

def compute_f1_score(eval_preds):
    preds, targets = eval_preds
    preds = torch.tensor(preds)
    preds = preds.argmax(dim=-1)
    targets = torch.tensor(targets)

    idx = torch.argwhere(targets < 2).squeeze()
    targets = targets[idx]
    preds = preds[idx]
    
    tp = ((preds == targets) * (targets == 1)).sum(axis=-1)
    fn = ((preds != targets) * (targets == 1)).sum(axis=-1)
    fp = ((preds != targets) * (targets == 0)).sum(axis=-1)

    eps = 1e-5
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    f1 = 2 * precision * recall / (precision + recall + eps)
#     if log:
#         wandb.log({"f1_score": f1.mean()})

    return {'f1_score' : f1.mean()}

In [44]:
training_args = TrainingArguments(
    '/kaggle/working',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=30,
    weight_decay=0.01,
    metric_for_best_model="f1_score",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=['labels'],
    report_to="wandb"
)

trainer = Trainer(
        roflan,
        training_args,                               
        train_dataset=train_dataset,        
        eval_dataset=val_dataset,        
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_f1_score,
)
trainer.compute_loss = compute_loss

Обучим модель с адаптерами.

In [17]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mxdoni4[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1 Score
1,0.6768,0.606728,0.235126
2,0.5563,0.581552,0.280448
3,0.5014,0.573611,0.37653
4,0.4361,0.638662,0.296058
5,0.3856,0.666257,0.428566
6,0.3338,0.706554,0.496058
7,0.2932,0.795841,0.459765
8,0.2681,0.86764,0.416064
9,0.2294,0.950992,0.379209
10,0.2105,0.983571,0.441049


Checkpoint destination directory /kaggle/working/checkpoint-4150 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=24900, training_loss=0.19686709017159948, metrics={'train_runtime': 1683.913, 'train_samples_per_second': 118.242, 'train_steps_per_second': 14.787, 'total_flos': 0.0, 'train_loss': 0.19686709017159948, 'epoch': 30.0})

Наилучший F1 на валидации получился 0.51

In [12]:
wandb.finish()

VBox(children=(Label(value='0.053 MB of 0.053 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1_score,▁▂▅▃█
eval/loss,▃▁▁▆█
eval/runtime,▁▂▃▃█
eval/samples_per_second,█▇▆▆▁
eval/steps_per_second,█▇▆▆▁
train/epoch,▁▂▂▃▃▄▄▄▅▆▆▇▇█
train/global_step,▁▂▂▃▃▄▄▅▅▆▆▇▇█
train/grad_norm,▅▄▂▅██▂▁▁
train/learning_rate,█▇▆▅▅▄▃▂▁
train/loss,█▇▆▅▅▃▃▂▁

0,1
eval/f1_score,0.43452
eval/loss,0.69885
eval/runtime,8.542
eval/samples_per_second,333.062
eval/steps_per_second,41.677
train/epoch,5.42
train/global_step,4500.0
train/grad_norm,1.15521
train/learning_rate,0.00025
train/loss,0.2957


In [33]:
roflan = RuBertClassifier().to('cuda')

До этого файнтюнили бейзлайн RuBERT. Дообучали классификатор на выходах модели. Без заморозки весов модель плохо обучается, так как датасет двовольно небольшой. Поэтому в дальнейшем бейзлайн замораживался

In [34]:
load_model(roflan, '/kaggle/input/rusentne-rubert-ft/checkpoint-10790/model.safetensors')

(set(), [])

In [36]:
trainer.evaluate()

{'eval_loss': 0.7416484355926514,
 'eval_f1_score': 0.4456183612346649,
 'eval_runtime': 7.9429,
 'eval_samples_per_second': 358.182,
 'eval_steps_per_second': 44.82}

Результат получился хуже чем для модели с адаптерами. Вообще обучение было очень нестабильным во всех случаях. Модель сильно переобучалась в плане лосса: на протяжении всего обучения лосс на трейне падал, а на валидации рос. При этом это не влияло сильно на метрику на валидации (она просто флуктуировала в районе 30-50%), то есть, несмотря на переобучение, модель выучивала необходимые зависимости из трейна. Если решить проблемы с переобучением, то возможно получились бы результаты еще лучше. Но датасет увеличить не получится, а как эффективно аугментировать данные для задачи непонятно, возможно с майнингом данных с помощью обращения к LLM.

Теперь будем работать над intent classification

Будем работать с [данными](https://www.kaggle.com/datasets/constantinwerner/qa-intents-dataset-university-domain), которые собраны Новосибирским Государственным Университетом для своего QA-чатбота. Выбор в большой степени обусловлен именно "студенческим доменом". Описание гласит, что датасет содержит 142 интента (класса) и порядка 50-220 фраз на русском языке для каждого.

In [3]:
train_df = pd.read_csv(intents / 'dataset_train.tsv', sep='\t', names=['phrase', 'intent'])
valid_df = pd.read_csv(intents / 'dataset_test.tsv', sep='\t', names=['phrase', 'intent'])

Размеры датасетов

In [4]:
len(train_df), len(valid_df)

(13230, 883)

Примеров в среднем на класс

In [5]:
train_df.groupby('intent').count()["phrase"].mean()

93.16901408450704

In [6]:
train_df['phrase'].str.split().apply(len).mean()

4.046636432350718

Тексты короткие, в основном словосочетания типа "мне нужно X", "где взять X"

In [7]:
def preprocess_datasets(el):
    phrase = el['phrase']
    intent = el['intent']
    
    labels = le.transform([intent]).item()
    ret = tokenizer(phrase)
    ret['labels'] = labels
    
    return ret

tokenizer = AutoTokenizer.from_pretrained('ai-forever/ruBert-base')
le = LabelEncoder().fit(train_df['intent'].unique().tolist())
columns_remove = ["phrase", "intent", "token_type_ids"]
train_dataset = Dataset.from_dict(train_df).map(preprocess_datasets).remove_columns(columns_remove)
val_dataset = Dataset.from_dict(valid_df).map(preprocess_datasets).remove_columns(columns_remove)

config.json:   0%|          | 0.00/590 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

  0%|          | 0/13230 [00:00<?, ?ex/s]

  0%|          | 0/883 [00:00<?, ?ex/s]

Для начала обучим обычный RuBERT

In [8]:
class RuBertIntentsClassifier(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained('ai-forever/ruBert-base')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.classifier = nn.Sequential(
                nn.Linear(768, 768),
                nn.ReLU(),
                nn.Linear(768, n_classes)
        )
    
    def forward(self, input_ids, attention_mask, labels):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        out = self.classifier(out.mean(dim=1))
        
        return out

In [36]:
roflan = RuBertIntentsClassifier(len(le.classes_))

In [9]:
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.get("labels")
    logits = model(**inputs)
    loss = nn.CrossEntropyLoss()(logits.squeeze(), labels.squeeze())
    if return_outputs:
        return loss, {"logits" : logits}
    return loss

def compute_accuracy(eval_preds):
    preds, targets = eval_preds
    preds = torch.tensor(preds)
    preds = preds.argmax(dim=-1)
    targets = torch.tensor(targets)
    
    accuracy = (preds == targets).float()

    return {'accuracy' : accuracy.mean()}

In [9]:
training_args = TrainingArguments(
    '/kaggle/working',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
#     lr_scheduler_type='cosine',
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=['labels'],
    report_to="wandb"
)

trainer = Trainer(
        roflan,
        training_args,                             
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_accuracy,
)
trainer.compute_loss = compute_loss

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Эмбеддинги RuBERT информативные, поэтому получается обучая только классификатор добиться аккураси 94% на валидации. В то же время обучение всей модели целиком было бы затруднительно в силу небольших размеров датасета.

In [40]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.252835,0.718007
2,2.348800,0.645431,0.839185
3,0.764300,0.429534,0.884485
4,0.443200,0.366865,0.884485
5,0.336000,0.31446,0.902605
6,0.336000,0.271009,0.906002
7,0.273100,0.259468,0.912797
8,0.238200,0.242494,0.91393
9,0.215900,0.230208,0.916195
10,0.198800,0.216452,0.919592


TrainOutput(global_step=20700, training_loss=0.22130508901992282, metrics={'train_runtime': 657.9526, 'train_samples_per_second': 1005.392, 'train_steps_per_second': 31.461, 'total_flos': 0.0, 'train_loss': 0.22130508901992282, 'epoch': 50.0})

In [41]:
trainer.evaluate()

{'eval_loss': 0.1635701060295105,
 'eval_accuracy': 0.9433748722076416,
 'eval_runtime': 0.5793,
 'eval_samples_per_second': 1524.279,
 'eval_steps_per_second': 48.335,
 'epoch': 50.0}

In [42]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.024 MB uploaded\r'), FloatProgress(value=0.05482802145787315, max=1.…

0,1
eval/accuracy,▁▅▆▆▇▇▇▇▇▇▇▇▇███████████████████████████
eval/loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▇█▅▃▅▄▃▂▂▄▁▃▄▂▂▃▂▃▃▃▃▃▂▂▂▃▄▂▂▃▂▂▂▂▄▃▃▃▆▇
eval/samples_per_second,▂▁▄▆▄▅▆▇▇▅█▆▅▇▇▆▇▆▆▆▆▆▇▇▇▆▅▇▇▆▇▇▇▇▅▅▆▆▃▂
eval/steps_per_second,▂▁▄▆▄▅▆▇▇▅█▆▅▇▇▆▇▆▆▆▆▆▇▇▇▆▅▇▇▆▇▇▇▇▅▅▆▆▃▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▆▇▆▃▂▄▃▃▅▃▂▆▄▄▂▁▃▂▄▄▂▃▄▃▂▄▄▄▃▃▃▃▃▃▄▅▂▆▃
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
train/loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.94337
eval/loss,0.16357
eval/runtime,0.5793
eval/samples_per_second,1524.279
eval/steps_per_second,48.335
train/epoch,50.0
train/global_step,20700.0
train/grad_norm,1.18078
train/learning_rate,0.0
train/loss,0.1094


Теперь реализуемый сетап, в котором можно добавлять новые классы

In [81]:
class RuBertIntentsClassifierWithAddition(nn.Module):
    def __init__(self, n_classes):
        super().__init__()
        
        self.bert = AutoModel.from_pretrained('ai-forever/ruBert-base')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.clf_inner = nn.Sequential(
                nn.Linear(768, 768),
                nn.ReLU(),
        )
        self.classifier = nn.Sequential(
                nn.Linear(768, n_classes)
        )
        self.added = nn.ParameterList([])
    
    def forward(self, input_ids, attention_mask, labels):
        out = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        out = self.clf_inner(out)
        outs = []
        outs.append(self.classifier(out.mean(dim=1)))
        for module in self.added:
            outs.append(module(out.mean(dim=1)))
        
        return torch.cat(outs, dim=-1)
    
    def add(self, n_classes):
        module = nn.Linear(768, n_classes)
        self.added.append(module)
        
    def freeze_except_newest(self):
        for param in self.clf_inner.parameters():
            param.requires_grad = False
        
        for param in self.classifier.parameters():
            param.requires_grad = False
        
        for module in self.added:
            if module == self.added[-1]:
                break
            for param in module.parameters():
                param.requires_grad = False
    
    def unfreeze_all(self):
        for param in self.clf_inner.parameters():
            param.requires_grad = True
        
        for param in self.classifier.parameters():
            param.requires_grad = True
        
        for module in self.added:
            for param in module.parameters():
                param.requires_grad = True

Обучимся на 120 классов

In [72]:
train_df_120 = train_df[train_df["intent"].apply(lambda x : x in le.classes_[:120])]
valid_df_120 = valid_df[valid_df["intent"].apply(lambda x : x in le.classes_[:120])]

train_dataset_120 = Dataset.from_dict(train_df_120).map(preprocess_datasets).remove_columns(columns_remove)
val_dataset_120 = Dataset.from_dict(valid_df_120).map(preprocess_datasets).remove_columns(columns_remove)

  0%|          | 0/11420 [00:00<?, ?ex/s]

  0%|          | 0/765 [00:00<?, ?ex/s]

In [82]:
roflan = RuBertIntentsClassifierWithAddition(120)

In [83]:
training_args = TrainingArguments(
    '/kaggle/working',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
#     lr_scheduler_type='cosine',
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=['labels'],
    report_to="wandb"
)

trainer = Trainer(
        roflan,
        training_args,                             
        train_dataset=train_dataset_120,
        eval_dataset=val_dataset_120,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_accuracy,
)
trainer.compute_loss = compute_loss

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [84]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.306631,0.690196
2,2.135800,0.685618,0.830065
3,0.725700,0.484466,0.861438
4,0.725700,0.383283,0.88366
5,0.447300,0.323829,0.888889
6,0.339300,0.306319,0.888889
7,0.339300,0.27714,0.904575
8,0.283300,0.251261,0.91634
9,0.247000,0.250489,0.912418
10,0.228800,0.239732,0.90719


TrainOutput(global_step=17850, training_loss=0.24168076459099264, metrics={'train_runtime': 597.1113, 'train_samples_per_second': 956.271, 'train_steps_per_second': 29.894, 'total_flos': 0.0, 'train_loss': 0.24168076459099264, 'epoch': 50.0})

In [85]:
trainer.evaluate()

{'eval_loss': 0.1664794385433197,
 'eval_accuracy': 0.9359477162361145,
 'eval_runtime': 0.4859,
 'eval_samples_per_second': 1574.269,
 'eval_steps_per_second': 49.389,
 'epoch': 50.0}

In [86]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅▆▇▇▇▇▇▇██▇▇▇▇█████████████████████████
eval/loss,█▄▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▂▃▁▂▂▁▁█▂▂▂▂▂▃▃▂▂▃▄▂▆▄▂▂▂▁▃▆▁▃▄▄▆▃▂▃▃▂▃▄
eval/samples_per_second,▇▅█▇▇██▁▇▇▇▇▇▆▆▇▇▆▅▇▃▅▇▇▇█▆▃█▆▅▅▃▆▆▆▆▇▆▅
eval/steps_per_second,▇▅█▇▇██▁▇▇▇▇▇▆▆▇▇▆▅▇▃▅▇▇▇█▆▃█▆▅▅▃▆▆▆▆▇▆▅
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,█▇▇▆▇▅▄▅▄▅▄▅▅▃▇▄▆▅▃▄▄▆▄▆▄▁▂▃▄▄▂▅▂▃▃
train/learning_rate,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.93595
eval/loss,0.16648
eval/runtime,0.4859
eval/samples_per_second,1574.269
eval/steps_per_second,49.389
train/epoch,50.0
train/global_step,17850.0
train/grad_norm,1.27656
train/learning_rate,1e-05
train/loss,0.1227


Теперь дообучимся еще на 22 класса

In [87]:
roflan.add(22)
roflan.freeze_except_newest()

In [88]:
roflan.added.to('cuda')

ParameterList(
    (0): Object of type: Linear
  (0): Linear(in_features=768, out_features=22, bias=True)
)

In [89]:
training_args = TrainingArguments(
    '/kaggle/working',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
#     lr_scheduler_type='cosine',
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=['labels'],
    report_to="wandb"
)

trainer = Trainer(
        roflan,
        training_args,                             
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_accuracy,
)
trainer.compute_loss = compute_loss

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [90]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.332169,0.892412
2,0.458000,0.263332,0.91393
3,0.241000,0.22713,0.920725
4,0.189700,0.212908,0.92299
5,0.175000,0.2014,0.929785
6,0.175000,0.194955,0.933182
7,0.163700,0.196946,0.929785
8,0.156300,0.19151,0.934315
9,0.155700,0.184957,0.935447
10,0.148200,0.183529,0.933182


TrainOutput(global_step=20700, training_loss=0.14851047727796768, metrics={'train_runtime': 661.1483, 'train_samples_per_second': 1000.532, 'train_steps_per_second': 31.309, 'total_flos': 0.0, 'train_loss': 0.14851047727796768, 'epoch': 50.0})

In [91]:
trainer.evaluate()

{'eval_loss': 0.17539402842521667,
 'eval_accuracy': 0.9388448596000671,
 'eval_runtime': 0.5785,
 'eval_samples_per_second': 1526.422,
 'eval_steps_per_second': 48.403,
 'epoch': 50.0}

Результат получился похуже, чем если обучать сразу на все классы.

In [80]:
wandb.finish()

VBox(children=(Label(value='0.054 MB of 0.054 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/accuracy,▁▅█
eval/loss,█▄▁
eval/runtime,█▁▄
eval/samples_per_second,▁█▅
eval/steps_per_second,▁█▅
train/epoch,▁▂▅▆█
train/global_step,▁▂▄▆█
train/grad_norm,▁█
train/learning_rate,█▁
train/loss,█▁

0,1
eval/accuracy,0.08607
eval/loss,4.70992
eval/runtime,0.5644
eval/samples_per_second,1564.52
eval/steps_per_second,49.611
train/epoch,3.0
train/global_step,1242.0
train/grad_norm,0.48066
train/learning_rate,0.00029
train/loss,4.7919


Реализуем аугментации. Поскольку тексты очень короткие, и учитывая специфику данных, наиболее типичными ошибками будут орфографические и опечатки, поэтому их и будем использовать для аугментации

In [10]:
char_aug = CharAug(
    unit_prob=0, # Percentage of the phrase to which augmentations will be applied
    min_aug=1, # Minimum number of augmentations
    max_aug=2, # Maximum number of augmentations
    lang="rus", # supports: "rus", "eng"
    platform="pc", # supports: "pc", "mobile"
    random_seed=228,
)

Посмотрим на пример аугментации

In [12]:
text = "привет дядя анзон"
char_aug.augment(char_aug.augment(text=text, action="orfo"), action="typo")

'привет бядя анзон'

In [13]:
def augment_wrapper(x):
    phrase = x["phrase"]
    intent = x["intent"][0]
    phrase = char_aug.augment(char_aug.augment(text=phrase, action="orfo"), action="typo")
    boba = preprocess_datasets({"phrase" : phrase, "intent" : intent})
    boba.pop('token_type_ids')
    return pd.DataFrame.from_dict(pd.DataFrame.from_dict({k : [v] for k, v in boba.items()}))

In [14]:
train_ds = Dataset.from_dict(train_df)
train_ds.set_transform(lambda x : augment_wrapper(x))

In [15]:
train_ds[0]

{'input_ids': [101, 1098, 182, 389, 10067, 66508, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1],
 'labels': 114}

In [16]:
train_ds[0]

{'input_ids': [101, 1098, 982, 10067, 66508, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1],
 'labels': 114}

In [17]:
roflan = RuBertIntentsClassifier(len(le.classes_))

pytorch_model.bin:   0%|          | 0.00/716M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [18]:
training_args = TrainingArguments(
    '/kaggle/working',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit = 1,
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=50,
    weight_decay=0.01,
#     lr_scheduler_type='cosine',
    metric_for_best_model="accuracy",
    load_best_model_at_end=True,
    remove_unused_columns=False,
    label_names=['labels'],
    report_to="wandb"
)

trainer = Trainer(
        roflan,
        training_args,                             
        train_dataset=train_ds,
        eval_dataset=val_dataset,
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_accuracy,
)
trainer.compute_loss = compute_loss

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.732982,0.605889
2,2.982700,1.015515,0.746319
3,1.520300,0.688396,0.806342
4,1.060900,0.560012,0.84145
5,0.849700,0.455593,0.866365
6,0.849700,0.39486,0.879955
7,0.733200,0.363581,0.881087
8,0.652100,0.319162,0.89581
9,0.601200,0.32611,0.892412
10,0.557900,0.290869,0.899207


TrainOutput(global_step=20700, training_loss=0.5148038348138044, metrics={'train_runtime': 1358.2803, 'train_samples_per_second': 487.013, 'train_steps_per_second': 15.24, 'total_flos': 0.0, 'train_loss': 0.5148038348138044, 'epoch': 50.0})

In [21]:
trainer.evaluate()

{'eval_loss': 0.19421540200710297,
 'eval_accuracy': 0.9331823587417603,
 'eval_runtime': 0.5986,
 'eval_samples_per_second': 1475.029,
 'eval_steps_per_second': 46.773,
 'epoch': 50.0}

Реузультат получился чуть хуже чем для остальных моделей. На этом датасете не было проблем с переобучением и без аугментаций

In [22]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.024 MB uploaded\r'), FloatProgress(value=0.05475402081362346, max=1.…

0,1
eval/accuracy,▁▄▅▆▇▇▇▇▇▇██▇███████████████████████████
eval/loss,█▅▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▄▁▄▂▂▂▁▂▃▄▃▃▂▃▁▃▄▂▂▃▂▆▃▄▂▃▂▃▂▂▃▃█▄▂▄▅▂▃▇
eval/samples_per_second,▅█▅▇▇▇█▇▆▅▆▆▇▆█▆▅▆▇▆▇▃▆▅▇▆▇▅▇▇▆▆▁▅▇▅▄▇▆▂
eval/steps_per_second,▅█▅▇▇▇█▇▆▅▆▆▇▆█▆▅▆▇▆▇▃▆▅▇▆▇▅▇▇▆▆▁▅▇▅▄▇▆▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▇▆▇▇▄▄▃▅▃▄▄█▄▅▄▂▃▃▃▄▄▂▄▄▄▄▆▄▅▅▂▃▄▄▃▅▅▄▆▁
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▁▁
train/loss,█▄▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
eval/accuracy,0.93318
eval/loss,0.19422
eval/runtime,0.5986
eval/samples_per_second,1475.029
eval/steps_per_second,46.773
train/epoch,50.0
train/global_step,20700.0
train/grad_norm,1.27828
train/learning_rate,0.0
train/loss,0.3083


С точки зрения метрики лучшей получилась модель, которая обучается на все классы сразу, потом та, где дообучались, потом модель с аугментациями. Преимущества дообучения в том, что можно распространить модель на новые классы без переобучения уже обученных весов для остальных классов (это может быть существенно, если на обучение требуется много времени), и вообще позволяет хорошо масштабировать модель. По времени обучения получается сопоставимо с моделью, но чуть быстрее за счет того, что меньше классов. Последняя модель самая долгая, потому что там нельзя сгенерить датасет сразу же, нужно делать случаные аугментации на каждой эпохе. Модель становится более робастной, но в данном случае не удается хорошо это проследить, потому что при обучении предыдущих моделей лоссы на трейне и на валидации все время уменьшаются, не возникает переобучения. Вообще, можно было бы попробовать получить семантические аугментации, потому что самое важное  при обучениие трансформеров: это большая выборка, а имеющиеся у нас выборки не такие большие. Можно попробовать использовать идею из LLaVA с генерацией по описанию с помощью LLM.