In [5]:
import os
from transformers import AutoTokenizer, MBartForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
import torch


device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
dataset = load_dataset("json", data_files="train.jsonl", field=None)

tokenizer = AutoTokenizer.from_pretrained('./ru-mbart-large-summ')
model = MBartForConditionalGeneration.from_pretrained('./ru-mbart-large-summ')
tokenizer.src_lang = "ru_RU"
tokenizer.tgt_lang = "ru_RU"

def preprocess_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]
    
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding=True)
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(preprocess_function, batched=True)

training_args = TrainingArguments(
    output_dir="./tuned",     
    num_train_epochs=3,                      
    per_device_train_batch_size=2,          
    per_device_eval_batch_size=2,            
    warmup_steps=500,                       
    weight_decay=0.01,                       
    logging_dir="./logs",                    
    logging_steps=50,                        
    evaluation_strategy="steps",             
    eval_steps=500,                          
    save_total_limit=2,                      
    save_steps=500,                          
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    tokenizer=tokenizer,
)

trainer.train()

model.save_pretrained(os.path.join(training_args.output_dir, "final_model"))
tokenizer.save_pretrained(os.path.join(training_args.output_dir, "final_model"))




  0%|          | 0/72 [00:00<?, ?it/s]

{'loss': 3.022, 'grad_norm': 4.973212718963623, 'learning_rate': 5e-06, 'epoch': 2.08}




{'train_runtime': 96.0231, 'train_samples_per_second': 1.468, 'train_steps_per_second': 0.75, 'train_loss': 2.9450414975484214, 'epoch': 3.0}


('./tuned/final_model/tokenizer_config.json',
 './tuned/final_model/special_tokens_map.json',
 './tuned/final_model/sentencepiece.bpe.model',
 './tuned/final_model/added_tokens.json',
 './tuned/final_model/tokenizer.json')

In [13]:
%pip install --upgrade accelerate --break-system-packages

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.12 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [18]:
dataset['train'][:10]

{'text': ['–ü—Ä–∏–ª–æ–∂–µ–Ω–∏–µ \n\n–∫ –ø—Ä–∏–∫–∞–∑—É –ù–ò–£ –í–®–≠ \n–æ—Ç 30.07.2021 ‚Ññ 6.18.1-01/300721-5\n\n–£–¢–í–ï–†–ñ–î–ï–ù\n—É—á–µ–Ω—ã–º —Å–æ–≤–µ—Ç–æ–º –§–≠–ù\n–ø—Ä–æ—Ç–æ–∫–æ–ª ‚Ññ 13 –æ—Ç 29.06.2021\n\n\n\n\n–ü–æ—Ä—è–¥–æ–∫ –Ω–∞–∑–Ω–∞—á–µ–Ω–∏—è –∏ –≤—ã–ø–ª–∞—Ç—ã\n—Å—Ç–∏–ø–µ–Ω–¥–∏–π –∏–º–µ–Ω–∏ –õ.–õ. –õ—é–±–∏–º–æ–≤–∞ —Ñ–∞–∫—É–ª—å—Ç–µ—Ç–∞ —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏—Ö –Ω–∞—É–∫ \n–ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ –∏—Å—Å–ª–µ–¥–æ–≤–∞—Ç–µ–ª—å—Å–∫–æ–≥–æ —É–Ω–∏–≤–µ—Ä—Å–∏—Ç–µ—Ç–∞ ¬´–í—ã—Å—à–∞—è —à–∫–æ–ª–∞ —ç–∫–æ–Ω–æ–º–∏–∫–∏¬ª –¥–ª—è —Å—Ç—É–¥–µ–Ω—Ç–æ–≤ –∏ –∞—Å–ø–∏—Ä–∞–Ω—Ç–æ–≤, —É—á–∞—Å—Ç–≤—É—é—â–∏—Ö –≤ –¥–æ–ª–≥–æ—Å—Ä–æ—á–Ω—ã—Ö –ø—Ä–æ–≥—Ä–∞–º–º–∞—Ö –º–µ–∂–¥—É–Ω–∞—Ä–æ–¥–Ω–æ–π –∞–∫–∞–¥–µ–º–∏—á–µ—Å–∫–æ–π –º–æ–±–∏–ª—å–Ω–æ—Å—Ç–∏ –∏ –ø—Ä–æ–≥—Ä–∞–º–º–∞—Ö –¥–≤—É—Ö –¥–∏–ø–ª–æ–º–æ–≤\n\n\n–û–±—â–∏–µ –ø–æ–ª–æ–∂–µ–Ω–∏—è\n–ü–æ—Ä—è–¥–æ–∫ –Ω–∞–∑–Ω–∞—á–µ–Ω–∏—è –∏ –≤—ã–ø–ª–∞—Ç—ã —Å—Ç–∏–ø–µ–Ω–¥–∏–π –∏–º–µ–Ω–∏ –õ.–õ. –õ—é–±–∏–º–æ–≤–∞  —Ñ–∞–∫—É–ª—å—Ç–µ—Ç–∞ —ç–∫–æ–Ω–æ–º–∏—á–µ—Å–∫–∏—Ö –Ω–∞—É–∫ –ù–∞—Ü–∏–æ–Ω–∞–ª—å–Ω–æ–≥–æ 