In [None]:
# Download the training data from the shared Dropbox link and store it as
# train.jsonl in the current working directory (-O sets the output file name).
!wget -O train.jsonl 'https://www.dropbox.com/scl/fi/m9bmjd8soszx92qrtxxh1/train.jsonl?rlkey=lq4gayi85gdw3u2197z27rjd0&st=xqteb89h&dl=1'

--2025-03-25 22:32:04--  https://www.dropbox.com/scl/fi/m9bmjd8soszx92qrtxxh1/train.jsonl?rlkey=lq4gayi85gdw3u2197z27rjd0&st=xqteb89h&dl=1


In [2]:
# Pack checkpoint-3675 of the mt5-2_4 run into mt5-e15.tar
# (-c create, -v list every archived file, -f archive name; no compression).
!tar -cvf mt5-e15.tar tuned/mt5-2_4/checkpoint-3675

tuned/mt5-2_4/checkpoint-3675/
tuned/mt5-2_4/checkpoint-3675/config.json
tuned/mt5-2_4/checkpoint-3675/training_args.bin
tuned/mt5-2_4/checkpoint-3675/rng_state.pth
tuned/mt5-2_4/checkpoint-3675/generation_config.json
tuned/mt5-2_4/checkpoint-3675/optimizer.pt
tuned/mt5-2_4/checkpoint-3675/scheduler.pt
tuned/mt5-2_4/checkpoint-3675/model.safetensors
tuned/mt5-2_4/checkpoint-3675/tokenizer_config.json
tuned/mt5-2_4/checkpoint-3675/special_tokens_map.json
tuned/mt5-2_4/checkpoint-3675/trainer_state.json
tuned/mt5-2_4/checkpoint-3675/spiece.model


In [1]:
# Pack checkpoint-1880 of the mbart2_3 run into mbart2_3.tar
# (-c create, -v list every archived file, -f archive name; no compression).
!tar -cvf mbart2_3.tar tuned/mbart2_3/checkpoint-1880

tuned/mbart2_3/checkpoint-1880/
tuned/mbart2_3/checkpoint-1880/config.json
tuned/mbart2_3/checkpoint-1880/training_args.bin
tuned/mbart2_3/checkpoint-1880/rng_state.pth
tuned/mbart2_3/checkpoint-1880/generation_config.json
tuned/mbart2_3/checkpoint-1880/sentencepiece.bpe.model
tuned/mbart2_3/checkpoint-1880/optimizer.pt
tuned/mbart2_3/checkpoint-1880/scheduler.pt
tuned/mbart2_3/checkpoint-1880/model.safetensors
tuned/mbart2_3/checkpoint-1880/tokenizer_config.json
tuned/mbart2_3/checkpoint-1880/special_tokens_map.json
tuned/mbart2_3/checkpoint-1880/trainer_state.json
tuned/mbart2_3/checkpoint-1880/tokenizer.json


In [None]:
import os
import torch
import wandb
from usecrets import WANDB_API_KEY


os.environ['WANDB_API_KEY'] = WANDB_API_KEY

from transformers import (
    AutoTokenizer,
    MBartForConditionalGeneration,
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from datasets import load_dataset, Dataset
from bert_score import score as bert_score_metric

import config


# Use the GPU when torch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Running on: {device}")
if device.type == "cuda":
    print("GPU Device Name:", torch.cuda.get_device_name(0))


# Start the Weights & Biases run and record the hyperparameters taken from `config`.
wandb.init(
    project=config.WANDB_PROJECT,
    name=config.WANDB_RUN_NAME,
    config=dict(
        num_train_epochs=config.NUM_EPOCHS,
        batch_size=config.BATCH_SIZE,
        warmup_steps=config.WARMUP_STEPS,
        weight_decay=config.WEIGHT_DECAY,
    ),
)

# Load the JSON-lines file and hold out 10% of it for validation (fixed seed).
raw_dataset = load_dataset("json", data_files="train.jsonl", field=None)["train"]
split_dataset = raw_dataset.train_test_split(test_size=0.1, seed=52)
train_raw, val_raw = split_dataset["train"], split_dataset["test"]

# Pretrained checkpoint that is fine-tuned below.
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)
# Dropout overrides that were tried and are currently disabled:
# model.config.dropout = 0.15
# model.config.attention_dropout = 0.10
# model.config.classifier_dropout = 0.15

# NOTE(review): "ru_RU" is an mBART-style language code; whether the MT5
# tokenizer reads these attributes is not visible here — confirm.
tokenizer.src_lang = tokenizer.tgt_lang = "ru_RU"

# Token budgets for the encoder input and the decoder target.
max_input_length, max_target_length = 1024, 128


def chunk_preprocess(example):
    """Tokenize one ``{"text", "summary"}`` record into fixed-length model features.

    Relies on the module-level ``tokenizer``, ``max_input_length`` and
    ``max_target_length``.

    Args:
        example: Mapping with a ``"text"`` string (source document) and a
            ``"summary"`` string (target).

    Returns:
        * If the tokenized text fits into ``max_input_length`` tokens: the
          tokenizer's encoding of the text with an added ``"labels"`` entry.
        * Otherwise: a list of dicts, one per consecutive
          ``max_input_length``-token window of the text, each holding
          ``"input_ids"``, ``"attention_mask"`` and ``"labels"``. Every window
          is paired with the same summary.

        In both cases inputs are padded to ``max_input_length`` and labels to
        ``max_target_length``.
    """
    text = example["text"]
    summary = example["summary"]

    # Encode the summary once, through the tokenizer itself, so that long
    # summaries are truncated the same way in both branches (a manual slice of
    # the encoded ids would simply cut the tail, end-of-sequence token included).
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    # NOTE(review): label padding is pad_token_id, not -100; the evaluation
    # callback decodes these labels directly, so any loss masking has to happen
    # at batch time rather than here.
    labels = tokenizer(
        text_target=summary, max_length=max_target_length, truncation=True, padding="max_length"
    )["input_ids"]

    input_ids = tokenizer.encode(text, add_special_tokens=True)

    if len(input_ids) <= max_input_length:
        # Short document: let the tokenizer build the padded encoding.
        encoded_inputs = tokenizer(
            text, max_length=max_input_length, truncation=True, padding="max_length"
        )
        encoded_inputs["labels"] = labels
        return encoded_inputs

    # Long document: cut the token sequence into consecutive windows and pad
    # the last (shorter) one by hand.
    chunk_examples = []
    for start in range(0, len(input_ids), max_input_length):
        chunk_ids = input_ids[start:start + max_input_length]
        pad_len = max_input_length - len(chunk_ids)
        chunk_examples.append({
            "input_ids": chunk_ids + [tokenizer.pad_token_id] * pad_len,
            "attention_mask": [1] * len(chunk_ids) + [0] * pad_len,
            # Copy so the chunks do not share one mutable label list.
            "labels": list(labels),
        })
    return chunk_examples

def process_dataset(dataset):
    """Run ``chunk_preprocess`` over every record and flatten the results.

    ``chunk_preprocess`` yields either a single feature mapping or a list of
    them (one per chunk); list results are spliced into the output so the
    returned list is always flat.

    Args:
        dataset: Iterable of raw records accepted by ``chunk_preprocess``.

    Returns:
        Flat list of feature mappings, in input order.
    """
    flattened = []
    for record in dataset:
        encoded = chunk_preprocess(record)
        flattened += encoded if isinstance(encoded, list) else [encoded]
    return flattened

# Tokenize/chunk both splits into flat lists of feature records (train first) ...
train_examples, val_examples = (
    process_dataset(split) for split in (train_raw, val_raw)
)

# ... and wrap each list in a `datasets.Dataset` for the Trainer.
train_dataset, val_dataset = (
    Dataset.from_list(rows) for rows in (train_examples, val_examples)
)

class MetricsLoggerCallback(TrainerCallback):
    """Trainer callback that scores generated summaries with BERTScore after each evaluation.

    On every ``on_evaluate`` event it generates summaries for the first
    ``max_eval_samples`` rows of ``eval_dataset``, compares them with the
    decoded labels via BERTScore, adds the mean precision/recall/F1 to the
    ``metrics`` dict under ``eval_bert_score_*`` keys, logs the dict to wandb
    and appends it to ``log_file``.

    Args:
        model: Model used for generation (must provide ``generate`` and ``device``).
        tokenizer: Tokenizer used to decode predictions and labels.
        eval_dataset: Dataset with ``input_ids``, ``attention_mask`` and
            ``labels`` columns; must support ``len``, ``select`` and slicing.
        log_file: Text file the metrics are appended to.
        max_eval_samples: Upper bound on the number of rows that are scored.
        max_gen_length: ``max_length`` passed to ``generate``.
        num_beams: Beam width passed to ``generate``.
    """

    def __init__(self, model, tokenizer, eval_dataset, log_file="training_log.txt", max_eval_samples=200,
                 max_gen_length=128, num_beams=4):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.eval_dataset = eval_dataset
        self.log_file = log_file
        self.max_eval_samples = max_eval_samples
        self.max_gen_length = max_gen_length
        self.num_beams = num_beams

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Compute BERTScore on a validation subset and log all metrics.

        ``metrics`` is mutated in place.
        NOTE(review): that in-place update is presumably how
        ``metric_for_best_model="eval_bert_score_f1"`` becomes visible to the
        Trainer — verify against the installed transformers version.
        """
        if metrics is None:
            metrics = {}

        # Use the eval dataset, model and tokenizer handed to the constructor.
        subset_size = min(len(self.eval_dataset), self.max_eval_samples)
        small_eval = self.eval_dataset.select(range(subset_size))
        batch_size = args.per_device_eval_batch_size
        pad_id = self.tokenizer.pad_token_id

        all_preds = []
        all_labels = []

        for start_idx in range(0, subset_size, batch_size):
            sub_eval = small_eval[start_idx : start_idx + batch_size]

            input_ids = torch.tensor(sub_eval["input_ids"], dtype=torch.long).to(self.model.device)
            attention_mask = torch.tensor(sub_eval["attention_mask"], dtype=torch.long).to(self.model.device)

            with torch.no_grad():
                generated_tokens = self.model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=self.max_gen_length,
                    num_beams=self.num_beams,
                    early_stopping=True
                )

            # Labels are only decoded, so they stay on the CPU as plain lists.
            # -100 (the loss "ignore" index) is not a valid token id; map it
            # back to padding in case the dataset masks its labels that way.
            label_ids = [
                [pad_id if token == -100 else token for token in row]
                for row in sub_eval["labels"]
            ]

            all_preds.extend(self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
            all_labels.extend(self.tokenizer.batch_decode(label_ids, skip_special_tokens=True))

        # With an empty evaluation subset there is nothing to score.
        if all_preds:
            P, R, F1 = bert_score_metric(
                all_preds,
                all_labels,
                lang="ru",
                model_type="google-bert/bert-base-multilingual-cased",
                num_layers=9,
                verbose=False
            )

            metrics["eval_bert_score_precision"] = float(torch.mean(P))
            metrics["eval_bert_score_recall"] = float(torch.mean(R))
            metrics["eval_bert_score_f1"] = float(torch.mean(F1))

        wandb.log(metrics)

        # A bare file name (such as the default) has no directory component,
        # and os.makedirs("") raises, so only create a directory when there is one.
        log_dir = os.path.dirname(self.log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)
        with open(self.log_file, "a", encoding="utf-8") as f:
            f.write(f"Epoch {state.epoch} evaluation metrics:\n")
            for k, v in metrics.items():
                f.write(f"{k}: {v}\n")
            f.write("\n")


optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY
)

# Metric used to select the best checkpoint. With a ReduceLROnPlateau
# scheduler the Trainer also steps the scheduler on this metric, so the
# plateau direction must match the metric's direction (F1: higher is better).
best_metric_name = "eval_bert_score_f1"
best_metric_greater_is_better = True
plateau_mode = "max" if best_metric_greater_is_better else "min"
if config.RLR_MODE != plateau_mode:
    print(
        f"config.RLR_MODE={config.RLR_MODE!r} does not match the direction of "
        f"{best_metric_name}; using mode={plateau_mode!r} for ReduceLROnPlateau"
    )

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode=plateau_mode,
    patience=config.RLR_PATIENCE,  # 1
    factor=config.RLR_FACTOR       # 0.8
)

# Explicit optimizer/scheduler pair handed to the Trainer.
optims = (optimizer, scheduler)

training_args = TrainingArguments(
    output_dir=f"./tuned/{config.WANDB_RUN_NAME}",
    num_train_epochs=config.NUM_EPOCHS,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    learning_rate=config.LEARNING_RATE,
    weight_decay=config.WEIGHT_DECAY,
    lr_scheduler_type='reduce_lr_on_plateau',
    lr_scheduler_kwargs={'mode': plateau_mode, 'patience': config.RLR_PATIENCE, 'factor': config.RLR_FACTOR},
    logging_steps=50,
    eval_strategy="epoch",
    save_total_limit=config.NUM_EPOCHS,
    save_strategy="epoch",
    report_to=["wandb"],
    overwrite_output_dir=True,
    load_best_model_at_end=True,
    metric_for_best_model=best_metric_name,
    greater_is_better=best_metric_greater_is_better
)

# TODO: ReduceLROnPlateau params to try: patience=1, factor=0.8
#       (mode has to follow the tracked metric, see plateau_mode above)
# TODO: AdamW params: lr = 1e-3, weight_decay=0.2


def collate_with_masked_label_padding(features):
    """Stack already-padded features into tensors and mask label padding.

    The datasets store labels padded with ``tokenizer.pad_token_id``; those
    positions are replaced by -100 here so they do not contribute to the loss.
    Masking at batch time leaves the stored labels decodable.
    Assumes every feature has equal-length ``input_ids`` / ``attention_mask`` /
    ``labels`` lists (the preprocessing pads to fixed lengths).
    """
    batch = {
        key: torch.tensor([feature[key] for feature in features], dtype=torch.long)
        for key in ("input_ids", "attention_mask", "labels")
    }
    batch["labels"][batch["labels"] == tokenizer.pad_token_id] = -100
    return batch


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
    processing_class=tokenizer,
    data_collator=collate_with_masked_label_padding,
    optimizers=optims,
)

trainer.add_callback(MetricsLoggerCallback(model, tokenizer, val_dataset, log_file=f"logs/{config.WANDB_RUN_NAME}/training_log.txt"))

wandb.watch(model, log="all")

try:
    trainer.train()

    # Save the fine-tuned weights and tokenizer side by side.
    final_dir = os.path.join(training_args.output_dir, "final_model")
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
finally:
    # Close the wandb run even when training or saving fails.
    wandb.finish()


Running on: cuda
GPU Device Name: NVIDIA A100 80GB PCIe


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mvdoninav[0m ([33mvdoninav-hse[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.
You are using the default legacy behaviour of the <class 'transformers.models.mt5.tokenization_mt5.MT5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_val

Epoch,Training Loss,Validation Loss
