In [1]:
from transformers import AutoTokenizer,TrainingArguments, Trainer, AutoModelForSequenceClassification
from datasets import load_dataset
import torch

In [2]:
def fine_tune_lithuanian_model():
    """
    Настройка и дообучение модели для литовского языка.
    """
    model_name = "xlm-roberta-base"
    dataset_name = "SkyWater21/lt_go_emotions"

    # Загрузка токенизатора и модели
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=7, ignore_mismatched_sizes=True)  # 28 категорий эмоций

    # Загрузка датасета
    dataset = load_dataset(dataset_name)
    
    def preprocess_labels(example):
        # Find the index of the first '1' in the labels list
        label_index = example["labels"].index(1) if 1 in example["labels"] else 0 
        example["labels"] = label_index  # Use the index as the single label
        return example

    # Токенизация данных
    def tokenize_function(examples):
        return tokenizer(examples["lt_text"], padding="max_length", truncation=True, max_length=128)

    dataset = dataset.map(preprocess_labels)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Настройка аргументов для тренировки
    training_args = TrainingArguments(
        output_dir="./results",          # директория для сохранения результатов
        evaluation_strategy="epoch",    # стратегия оценки
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir="./logs",
    )

    # Инициализация тренера
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
    )

    # Запуск тренировки
    trainer.train()

    # Сохранение дообученной модели
    trainer.save_model(r"C:\Users\Viktor\Documents\TMS\NLP\NLP_psyhology_support\models\lt_emotion_model_v1_1")
    print("Литовская модель дообучена и сохранена.")

In [3]:
    # Дообучение литовской модели
    fine_tune_lithuanian_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/5427 [00:00<?, ? examples/s]

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0283,0.020916
2,0.0291,0.020386
3,0.0255,0.019649


Литовская модель дообучена и сохранена.


NameError: name 'trainer' is not defined

In [5]:
fine_tune_lithuanian_model()

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0285,0.02078
2,0.0288,0.020255
3,0.0247,0.01839


Литовская модель дообучена и сохранена.
