In [1]:
import random
import time

import numpy as np
import pandas as pd
import torch
from torch import nn

import evaluate
from datasets import load_dataset
from peft import (get_peft_model,
                  LoraConfig,
                  PrefixTuningConfig,
                  TaskType,)
from transformers import (AutoModelForSequenceClassification,
                          AutoTokenizer,
                          DataCollatorWithPadding,
                          Trainer, 
                          TrainingArguments,)

# Фиксация сидов для воспроизводимости
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Устройство
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## 0. Загрузка/подготовка

In [2]:
dataset = load_dataset("dair-ai/emotion")

In [3]:
model_checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [4]:
def tokenize(example):
    return tokenizer(example["text"], truncation=True)

In [5]:
dataset = dataset.map(tokenize, batched=True)

In [6]:
label_names = dataset['train'].features['label'].names
num_labels = len(label_names)
label_names

['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']

## 1. Метрики до дообучения

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
accuracy_metric  = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_metric.compute(predictions=preds, references=labels)["accuracy"],
        "precision": precision_metric.compute(predictions=preds, references=labels, average="macro", zero_division=0)["precision"],
        "recall": recall_metric.compute(predictions=preds, references=labels, average="macro", zero_division=0)["recall"],
        "f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"],
    }

In [9]:
training_args = TrainingArguments(
    output_dir="./pretrain",
    eval_strategy="epoch",
    per_device_eval_batch_size=16,
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [11]:
trainer.evaluate(dataset['test'])

{'eval_loss': 1.6839704513549805,
 'eval_model_preparation_time': 0.0,
 'eval_accuracy': 0.151,
 'eval_precision': 0.14487953510576054,
 'eval_recall': 0.1618697266408435,
 'eval_f1_macro': 0.07286239293441214,
 'eval_runtime': 2.107,
 'eval_samples_per_second': 949.235,
 'eval_steps_per_second': 59.327}

In [12]:
torch.cuda.memory_allocated() / (1024 ** 2)

426.87255859375

## 2. Full finetuning

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def train_with_memory_and_metrics(trainer):
    num_params = sum(p.numel() for p in trainer.model.parameters() if p.requires_grad)

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    start = time.time()
    trainer.train()
    elapsed = time.time() - start

    results = trainer.evaluate(dataset["test"])

    mem_mb = torch.cuda.max_memory_allocated() / (1024 ** 2)

    print(f"Количество обучаемых параметров: {num_params}")
    print(f"Время: {elapsed:.2f} s")
    print(f"Метрики: {results}")
    print(f"Использование памяти gpu: {mem_mb:.2f} MB")

    return {
        "metrics": results,
        "time_s": elapsed,
        "mem_mb": mem_mb,
        "num_params": num_params
    }

Аргументы трейнера выбраны одинаковыми для всех видов дообучения для сравнения:
- оптимизатор AdamW (по умолчанию в HF Trainer)
- lr=2e-5 — стандарт для дообучения BERT-подобных
- batch_size=16 — для укладывания в память
- epochs=15 — даёт стабильную сходимость в данном случае

In [None]:
training_args = TrainingArguments(
    output_dir="./full_finetining",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    seed=SEED,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [16]:
full_finetining_metrics = train_with_memory_and_metrics(trainer)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,0.2284,0.199299,0.929,0.896605,0.917308,0.903498
2,0.147,0.15842,0.9345,0.927437,0.891392,0.906706
3,0.1108,0.15115,0.9425,0.913778,0.926093,0.919284
4,0.1051,0.19549,0.9395,0.921116,0.918304,0.918506
5,0.0678,0.269534,0.9345,0.916425,0.90758,0.911728
6,0.0489,0.312168,0.9325,0.911735,0.90506,0.908158
7,0.0363,0.29844,0.9405,0.923355,0.9141,0.918547
8,0.0289,0.396841,0.9385,0.92222,0.912749,0.917302
9,0.0126,0.367114,0.939,0.927469,0.910057,0.9179
10,0.0224,0.381348,0.9385,0.924143,0.909212,0.915893


Количество обучаемых параметров: 109486854
Время: 1453.84 s
Метрики: {'eval_loss': 0.4773561954498291, 'eval_accuracy': 0.9295, 'eval_precision': 0.8865127398653881, 'eval_recall': 0.885334840654208, 'eval_f1_macro': 0.8857581442879335, 'eval_runtime': 2.0, 'eval_samples_per_second': 999.985, 'eval_steps_per_second': 62.499, 'epoch': 15.0}
Использование памяти gpu: 2153.35 MB


## 3. Linear probing

LayerNorm → Linear → ReLU → Dropout → LayerNorm → Linear
- Простая feed-forward сеть с двумя линейными проекциями, нормализацией и дропаутом, Dropout и LayerNorm помогают стабилизировать обучение и предотвратить переобучение в условиях малого числа обучаемых параметров.

In [17]:
class LinearProbeHead(torch.nn.Module):
    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.LayerNorm(hidden_size // 2),
            nn.Linear(hidden_size // 2, num_labels)
        )

    def forward(self, x):
        return self.classifier(x)

In [18]:
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model.classifier = LinearProbeHead(hidden_size=model.config.hidden_size, num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# freeze BERT: обучаем только голову (~300k параметров) для быстрой отладки
for name, param in model.named_parameters():
    if not name.startswith("classifier"):
        param.requires_grad = False

In [20]:
training_args = TrainingArguments(
    output_dir="./linear_probing",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    seed=SEED,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [21]:
linear_probing_metrics = train_with_memory_and_metrics(trainer)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.5724,1.535964,0.427,0.143386,0.227367,0.174668
2,1.534,1.481792,0.45,0.147962,0.235142,0.181445
3,1.5035,1.454042,0.4755,0.156435,0.248939,0.192047
4,1.4787,1.431136,0.4805,0.260956,0.256108,0.200059
5,1.462,1.403598,0.471,0.28746,0.257093,0.207948
6,1.4505,1.41027,0.4865,0.275933,0.260057,0.201708
7,1.447,1.384009,0.5015,0.315395,0.267122,0.213717
8,1.4293,1.360353,0.499,0.295384,0.267785,0.215956
9,1.4318,1.360855,0.504,0.507574,0.279196,0.237196
10,1.4315,1.367918,0.498,0.48227,0.273465,0.225655


Количество обучаемых параметров: 299910
Время: 359.18 s
Метрики: {'eval_loss': 1.310072660446167, 'eval_accuracy': 0.503, 'eval_precision': 0.405377564086365, 'eval_recall': 0.2717470693308338, 'eval_f1_macro': 0.23209638260975293, 'eval_runtime': 2.0317, 'eval_samples_per_second': 984.413, 'eval_steps_per_second': 61.526, 'epoch': 15.0}
Использование памяти gpu: 1274.20 MB


## 4. Prefix tuning

Выбрала Prefix tuning:
- Prefix Tuning создаёт виртуальные ключи (K) и значения (V) для механизма self-attention
  на каждом слое. Это даёт более «глубокий» контроль над распределением внимания,
  что особенно важно в задачах с тонкими контекстными различиями.
  При этом основные веса модели остаются замороженными, а обучается лишь небольшой
  объём префиксных параметров

In [22]:
prefix_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    num_virtual_tokens=20,
    prefix_projection=True
)


In [23]:
base_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model = get_peft_model(base_model, prefix_config)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
model.print_trainable_parameters()

trainable params: 14,780,160 || all params: 124,267,014 || trainable%: 11.8939


In [25]:
training_args = TrainingArguments(
    output_dir="./prefix_tuning",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    weight_decay=0.01,
    seed=SEED,
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset['validation'],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [26]:
prefix_tuning_metrics = train_with_memory_and_metrics(trainer)

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.2993,1.18933,0.5765,0.197402,0.307434,0.237044
2,1.1191,1.019686,0.6535,0.307044,0.395256,0.343401
3,0.9572,0.85743,0.733,0.455334,0.505883,0.467548
4,0.8185,0.714534,0.7995,0.683223,0.598204,0.563844
5,0.7089,0.585164,0.875,0.863506,0.776042,0.801802
6,0.6232,0.516942,0.899,0.874307,0.844231,0.853018
7,0.573,0.473891,0.9135,0.897855,0.871263,0.881269
8,0.5432,0.44732,0.9135,0.877711,0.892905,0.884565
9,0.513,0.429057,0.919,0.898433,0.883951,0.889593
10,0.4994,0.419866,0.9205,0.899152,0.892237,0.892948


Количество обучаемых параметров: 14780160
Время: 712.65 s
Метрики: {'eval_loss': 0.39882901310920715, 'eval_accuracy': 0.9175, 'eval_precision': 0.8745803288345738, 'eval_recall': 0.8706279087832317, 'eval_f1_macro': 0.8714575933344596, 'eval_runtime': 2.2662, 'eval_samples_per_second': 882.53, 'eval_steps_per_second': 55.158, 'epoch': 15.0}
Использование памяти gpu: 1231.53 MB


## 5. LoRA

In [None]:
r_values = [4, 8, 16, 32]
lora_experiments = []

for r in r_values:
    print(f"========= LoRA: r={r} =========")
    # 1) Базовая модель
    base = AutoModelForSequenceClassification.from_pretrained(model_checkpoint,num_labels=num_labels)

    lora_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        inference_mode=False,
        r=r,
        lora_alpha=r * 4,      # часто alpha = r*X, где X из [1..4], чтобы было масштабирование адаптеров
        lora_dropout=0.1,
        target_modules=["query", "value"]
    )

    lora_model = get_peft_model(base, lora_cfg)
    lora_model.to(device)

    # 3) Trainer для LoRA
    args = TrainingArguments(
        output_dir=f"./lora_r{r}",
        eval_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=15,
        weight_decay=0.01,
        seed=SEED,)
    
    trainer = Trainer(
        model=lora_model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # 4) Запуск обучения + оценки
    res = train_with_memory_and_metrics(trainer)
    res["r"] = r
    lora_experiments.append(res)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.508,1.352425,0.5275,0.179383,0.281449,0.216507
2,1.1918,1.128044,0.577,0.363874,0.30837,0.238922
3,1.0768,1.013305,0.6035,0.495377,0.352494,0.313696
4,0.9453,0.892368,0.6485,0.708553,0.454681,0.424758
5,0.8614,0.799344,0.6825,0.684464,0.500093,0.477441
6,0.8039,0.735382,0.7115,0.737918,0.536221,0.544251
7,0.7426,0.678475,0.743,0.735738,0.592921,0.608466
8,0.7002,0.638714,0.7605,0.732017,0.634975,0.653188
9,0.6776,0.601813,0.7785,0.763902,0.658144,0.684174
10,0.6384,0.579892,0.79,0.767874,0.677944,0.702255


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Количество обучаемых параметров: 152070
Время: 642.32 s
Метрики: {'eval_loss': 0.4835277795791626, 'eval_accuracy': 0.829, 'eval_precision': 0.7926230999087097, 'eval_recall': 0.7269297903006645, 'eval_f1_macro': 0.7512637486932152, 'eval_runtime': 2.1476, 'eval_samples_per_second': 931.271, 'eval_steps_per_second': 58.204, 'epoch': 15.0}
Использование памяти gpu: 1624.60 MB


No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.3206,1.188959,0.565,0.192644,0.301392,0.232054
2,1.1235,1.067242,0.588,0.202671,0.313873,0.242214
3,0.9557,0.86497,0.6595,0.588888,0.455014,0.422883
4,0.8117,0.771365,0.691,0.666203,0.516473,0.491073
5,0.7389,0.679381,0.734,0.691089,0.572374,0.570135
6,0.685,0.61443,0.7675,0.773187,0.618862,0.64359
7,0.6155,0.542913,0.7985,0.782376,0.679259,0.701688
8,0.5761,0.495634,0.825,0.797809,0.744841,0.763528
9,0.5508,0.461885,0.835,0.811135,0.747983,0.770462
10,0.51,0.436101,0.85,0.826677,0.783131,0.79915


Количество обучаемых параметров: 299526
Время: 642.23 s
Метрики: {'eval_loss': 0.3774574100971222, 'eval_accuracy': 0.8715, 'eval_precision': 0.8221311657385462, 'eval_recall': 0.8033526009307427, 'eval_f1_macro': 0.8111128190851408, 'eval_runtime': 2.1382, 'eval_samples_per_second': 935.379, 'eval_steps_per_second': 58.461, 'epoch': 15.0}
Использование памяти gpu: 1628.76 MB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.2213,1.128124,0.583,0.199886,0.311042,0.239805
2,1.0412,0.927677,0.6375,0.443091,0.409069,0.383253
3,0.8008,0.724905,0.732,0.614309,0.560295,0.552399
4,0.6655,0.601161,0.775,0.773729,0.631481,0.647141
5,0.5776,0.50071,0.828,0.816004,0.733133,0.757751
6,0.5095,0.445828,0.843,0.829358,0.746796,0.777206
7,0.45,0.389147,0.871,0.844545,0.818995,0.829622
8,0.4334,0.367373,0.8875,0.853812,0.860372,0.855201
9,0.3911,0.334151,0.889,0.859636,0.850714,0.853827
10,0.3722,0.31342,0.8985,0.867644,0.866968,0.867079


Количество обучаемых параметров: 594438
Время: 643.63 s
Метрики: {'eval_loss': 0.2778737246990204, 'eval_accuracy': 0.9045, 'eval_precision': 0.8559403881670988, 'eval_recall': 0.8646255591482802, 'eval_f1_macro': 0.8595715039991082, 'eval_runtime': 2.1585, 'eval_samples_per_second': 926.58, 'eval_steps_per_second': 57.911, 'epoch': 15.0}
Использование памяти gpu: 1636.13 MB


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1 Macro
1,1.1613,1.041651,0.603,0.279227,0.338065,0.284661
2,0.8737,0.759423,0.722,0.690683,0.523063,0.525195
3,0.6681,0.570412,0.8105,0.802108,0.695799,0.712243
4,0.5535,0.476883,0.8455,0.843893,0.733451,0.758296
5,0.4693,0.388643,0.8795,0.856225,0.821467,0.836462
6,0.395,0.327499,0.906,0.880971,0.859342,0.86547
7,0.3441,0.300885,0.9105,0.895904,0.870264,0.881512
8,0.33,0.2807,0.916,0.884613,0.894377,0.889169
9,0.2899,0.266553,0.9205,0.893514,0.890402,0.891752
10,0.2805,0.256258,0.92,0.889338,0.895613,0.891733


Количество обучаемых параметров: 1184262
Время: 647.50 s
Метрики: {'eval_loss': 0.24380642175674438, 'eval_accuracy': 0.914, 'eval_precision': 0.8627140297875124, 'eval_recall': 0.8685216879504312, 'eval_f1_macro': 0.865518094629688, 'eval_runtime': 2.1585, 'eval_samples_per_second': 926.556, 'eval_steps_per_second': 57.91, 'epoch': 15.0}
Использование памяти gpu: 1642.61 MB


In [28]:
df_lora = pd.DataFrame([{
    "r": e["r"],
    "trainable_params_K": e["num_params"] / 1e3,
    "time_s": e["time_s"],
    "mem_MB": e["mem_mb"],
    "accuracy": e["metrics"]["eval_accuracy"],
    "precision": e["metrics"]["eval_precision"],
    "recall": e["metrics"]["eval_recall"],
    "f1_macro": e["metrics"]["eval_f1_macro"]
} for e in lora_experiments])
df_lora

Unnamed: 0,r,trainable_params_K,time_s,mem_MB,accuracy,precision,recall,f1_macro
0,4,152.07,642.315216,1624.602051,0.829,0.792623,0.72693,0.751264
1,8,299.526,642.233332,1628.762207,0.8715,0.822131,0.803353,0.811113
2,16,594.438,643.630512,1636.133301,0.9045,0.85594,0.864626,0.859572
3,32,1184.262,647.497522,1642.61377,0.914,0.862714,0.868522,0.865518


Подбирала ранги r {4,8,16,32}, лучший компромисс r=32, 
- несмотря на рост обучаемых параметров, время дообучения и занимаемая память не сильно растут в значениях по мере роста r, но метрики становятся значительно лучше

## 6. Сравнение

| Метод               | Параметров   | Память (MB) | Время (s) | Accuracy | Precision | Recall  | F1      |
|---------------------|--------------|-------------|-----------|----------|-----------|---------|---------|
| До finetuning       | n/a          | n/a         | n/a       | 0.151    | 0.1449    | 0.1619  | 0.0729  |
| Full finetuning     | 109 486 854  | 2153.35     | 1453.84   | 0.9295   | 0.8865    | 0.8853  | 0.8858  |
| Linear probing      |   299 910    | 1274.20     | 359.18    | 0.5030   | 0.4054    | 0.2717  | 0.2321  |
| Prefix tuning       | 14 780 160   | 1231.53     | 712.65    | 0.9175   | 0.8746    | 0.8706  | 0.8715  |
| LoRA (r=32)         | 1 184 262    | 1642.61     | 647.50    | 0.9140   | 0.8627    | 0.8685  | 0.8655  |

- Full FT даёт максимальное качество (F1 0.886, accuracy 0.929), но требует около 2 GB памяти и наибольшее количество времени 1450 s
- Prefix Tuning почти не уступает (F1 0.871) при halved памяти (1.2 GB) и времени (713 s).
- LoRA (r=32) обучает всего ~1 M параметров, сохраняя высокое качество (F1≈0.866) и средние ресурсы (~1.6 GB, 647 s).
- Linear Probing очень легковесен (300 k параметров, 359 s), но качество довольно низкое (F1 0.23).