In [1]:
import os
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    MT5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

csv_path = '/kaggle/input/dataset/Dataset - Sinhala.csv'
df = pd.read_csv(csv_path)
dataset = Dataset.from_pandas(df).train_test_split(test_size=0.2)

model_name = "/kaggle/input/google-mt5-small/transformers/default/1/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

def preprocess_function(examples):
    inputs = [
        f"summarize {intent}: {text}" 
        for text, intent in zip(examples["text"], examples["intent"])
    ]
    
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="longest"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"],
            max_length=150,
            truncation=True,
            padding="max_length"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5-summarizer",
    evaluation_strategy="epoch",
    save_steps=500,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    num_train_epochs=7,
    predict_with_generate=True,
    fp16=True,
    logging_dir="./logs",
    report_to="none",
    save_total_limit=2
)

class MT5Trainer(Seq2SeqTrainer):
    def save_model(self, output_dir=None, _internal_call=False):
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

trainer = MT5Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer
)

trainer.train()

trainer.save_model("./final-mt5-summarizer")

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map:   0%|          | 0/552 [00:00<?, ? examples/s]



Map:   0%|          | 0/138 [00:00<?, ? examples/s]

  trainer = MT5Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,2.822853
2,No log,1.614377
3,No log,1.419574
4,No log,1.351598
5,No log,1.337778
6,No log,1.321473
7,No log,1.324477


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


In [2]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path, download_file_name):
    os.chdir('/kaggle/working/')
    zip_name = f"/kaggle/working/{download_file_name}.zip"
    command = f"zip {zip_name} {path} -r"
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    display(FileLink(f'{download_file_name}.zip'))

In [None]:
download_file('/kaggle/working/mt5-summarizer/checkpoint-483', 'fine-tune-mt5-small')