<a href="https://colab.research.google.com/github/vroner1/NLP-homework/blob/main/NLP_HW3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-tune a pretrained model

In [None]:
from datasets import load_dataset

# Load only a small slice of the train and test splits to fit limited compute.
# SAMPLE_PERCENT drives both the slice and the printed labels so they cannot
# drift apart; raise it when more resources are available.
SAMPLE_PERCENT = 1

raw_dataset = load_dataset(
    "dair-ai/emotion",
    split={
        "train": f"train[:{SAMPLE_PERCENT}%]",
        "test": f"test[:{SAMPLE_PERCENT}%]",
    },
)

print(f"Train dataset size ({SAMPLE_PERCENT}%): {len(raw_dataset['train'])}")
print(f"Test dataset size ({SAMPLE_PERCENT}%): {len(raw_dataset['test'])}")


Train dataset size (20%): 160
Test dataset size (20%): 20


In [None]:
# Convert text into the token ids of the pretrained vocabulary.
# The tokenizer checkpoint has to match the model that is fine-tuned later in
# this notebook (bert-base-uncased); ids produced by the cased vocabulary would
# index the wrong rows of the uncased embedding table.

from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples to a fixed sequence length.

    Args:
        examples: Batch passed in by ``map(..., batched=True)``; only its
            ``"text"`` entry is read.
        max_length: Length every sequence is padded or truncated to. Without
            it, ``padding="max_length"`` pads to the model maximum, which
            makes each training step far more expensive than these short
            texts require. 128 matches the limit used at inference time.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


tokenized_datasets = raw_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 160/160 [00:00<00:00, 1178.44 examples/s]
Map: 100%|██████████| 20/20 [00:00<00:00, 559.34 examples/s]


In [None]:
# Show which columns tokenization added next to the original text/label pair.
train_columns = tokenized_datasets["train"].column_names
print(train_columns)

['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask']


In [None]:
# Use the BERT base model (uncased): https://huggingface.co/google-bert/bert-base-uncased

from transformers import AutoModelForSequenceClassification

model_id = "bert-base-uncased"

# Class names come from the dataset's label feature. Ids are stored as strings
# in both mappings, and later lookups in this notebook use string ids too.
labels = tokenized_datasets["train"].features["label"].names
num_labels = len(labels)
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import evaluate
import numpy as np

# F1 metric object shared by every evaluation pass.
metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute the weighted F1 score for one evaluation pass.

    Args:
        eval_pred: Pair of ``(predictions, labels)``; the predictions are
            reduced to class ids with an argmax over axis 1.

    Returns:
        The result of ``metric.compute`` with ``average="weighted"``.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return metric.compute(
        predictions=predicted_ids,
        references=references,
        average="weighted",
    )

In [None]:
from transformers import Trainer, TrainingArguments

In [None]:
# Short single-epoch run sized for limited compute.
training_args = TrainingArguments(
    output_dir='./out',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    eval_strategy='epoch',
    # The recorded run has only 20 optimizer steps, so an interval of 50 never
    # fired and the training loss column showed "No log".
    logging_steps=10,
)

# Trainer moves the model to the available device on its own, so the model is
# passed as-is; the previous `model.to(device)` relied on a `device` variable
# that is not defined anywhere in this notebook.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning; the returned summary is displayed as the cell output.
train_result = trainer.train()
train_result

Epoch,Training Loss,Validation Loss,F1
1,No log,1.583864,0.138462




TrainOutput(global_step=20, training_loss=1.6927305221557618, metrics={'train_runtime': 403.2039, 'train_samples_per_second': 0.397, 'train_steps_per_second': 0.05, 'total_flos': 42099280773120.0, 'train_loss': 1.6927305221557618, 'epoch': 1.0})

К сожалению, из-за ограниченных вычислительных ресурсов обучение на большем объеме данных оказалось невозможным

Для получения качественных результатов модель желательно обучать на гораздо большей выборке и с другими гиперпараметрами

Например, так:

```python
training_args = TrainingArguments(
    output_dir="./out",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)
```

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Run inference with the fine-tuned weights. Reloading the base checkpoint here
# would replace the trained classifier head with a randomly initialised one.
# The tokenizer from the tokenization cell is reused so that inference sees
# the same vocabulary as training.
model = trainer.model

# Take the device from the model itself rather than from a separately defined
# global, which this notebook never creates.
device = model.device
model.eval()

def predict_emotion(text):
    """Classify the emotion expressed in ``text`` with the current model.

    Args:
        text: Input sentence to classify.

    Returns:
        A ``(label, confidence)`` tuple: the predicted label name and the
        softmax probability assigned to it.
    """
    # Send the inputs to whichever device the model lives on; this avoids
    # depending on a global `device` that may not exist in a fresh kernel.
    inputs = tokenizer(
        text, return_tensors="pt", padding=True, truncation=True, max_length=128
    ).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = F.softmax(logits, dim=-1)
    pred_id = torch.argmax(probs, dim=-1).item()
    # id2label is keyed by string ids.
    return id2label[str(pred_id)], probs[0][pred_id].item()

# Example of the model at work: classify one sentence and report the top label.
text = "I feel so happy and excited today!"
label, confidence = predict_emotion(text)
message = f"Text: {text}\nPredicted emotion: {label} (confidence: {confidence:.2f})"
print(message)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text: I feel so happy and excited today!
Predicted emotion: joy (confidence: 0.34)
