In [1]:
!pip install transformers
!pip install datasets
!pip install SentencePiece
!pip install accelerate -U

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset

In [2]:
# Load the dataset
dataset = load_dataset('cnn_dailymail', '3.0.0', split='train[:100]')  # Load a subset of the "cnn_dailymail" dataset
# train_dataset = dataset['train']



In [3]:
# Load the mT5 tokenizer
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')

In [4]:
# Preprocessing function
def preprocess_function(examples):
    inputs = examples['article']
    targets = examples['highlights']
    inputs = [input_text for input_text in inputs]
    targets = [target_text for target_text in targets]
    inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(targets, padding="max_length", truncation=True, max_length=128)
    inputs['labels'] = targets['input_ids']
    return inputs

In [5]:
train_dataset = dataset.map(preprocess_function, batched=True)



In [6]:
train_dataset

Dataset({
    features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 100
})

In [7]:
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

In [9]:
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir='./output_dir',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=3,
)

In [10]:
# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8),
    tokenizer=tokenizer,
)

In [11]:
# Start training
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,34.930889
1,No log,33.855789
2,No log,33.307541


TrainOutput(global_step=18, training_loss=37.098666720920136, metrics={'train_runtime': 119.6251, 'train_samples_per_second': 2.508, 'train_steps_per_second': 0.15, 'total_flos': 152279910973440.0, 'train_loss': 37.098666720920136, 'epoch': 2.88})

In [12]:
model.save_pretrained('./exported_model')

In [13]:
tokenizer.save_pretrained('./exported_tokenizer')

('./exported_tokenizer/tokenizer_config.json',
 './exported_tokenizer/special_tokens_map.json',
 './exported_tokenizer/spiece.model',
 './exported_tokenizer/added_tokens.json')

In [14]:
 # Load the trained model
model = MT5ForConditionalGeneration.from_pretrained('/content/exported_model')

# Load the tokenizer
tokenizer = MT5Tokenizer.from_pretrained('/content/exported_tokenizer')