In [None]:
!pip install torch datasets pandas transformers sentencepiece accelerate

In [None]:
import torch
from datasets import Dataset
import pandas as pd
from transformers import (
    MT5ForConditionalGeneration,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)

def _load_training_frame(csv_path):
    """Read the training CSV and return a cleaned DataFrame, or None if unusable.

    Prints a human-readable error (instead of raising) when the file is
    missing, when the required columns are absent, or when no complete
    rows remain, so that ``main`` can simply stop.
    """
    try:
        df = pd.read_csv(csv_path, engine='python', on_bad_lines='skip')
    except FileNotFoundError:
        print(f"Error: '{csv_path}' not found.")
        print("Please make sure the dataset file is uploaded to your Colab environment and the name matches exactly.")
        return None

    required_columns = ["article", "highlights"]
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        print(f"Error: '{csv_path}' is missing required column(s): {missing_columns}")
        return None

    # Rows without an article or a summary cannot be used for training;
    # casting NaN to str would otherwise train the model on the text "nan".
    df = df.dropna(subset=required_columns).reset_index(drop=True)
    if df.empty:
        print(f"Error: '{csv_path}' contains no rows with both 'article' and 'highlights'.")
        return None
    return df


def _summarize(model, tokenizer, article, prefix, max_input_length, max_target_length):
    """Generate a beam-search summary of ``article`` and return it as text."""
    # Truncate exactly like the training inputs and keep the attention mask
    # so generate() receives the same kind of encoding the model was tuned on.
    encoded = tokenizer(
        prefix + article,
        return_tensors="pt",
        max_length=max_input_length,
        truncation=True,
    )
    outputs = model.generate(
        **encoded,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


def main():
    """Fine-tune ``google/mt5-base`` for summarization and run a sample inference.

    Steps:
      1. Load ``Summarization_dataset.csv`` (columns ``article`` and
         ``highlights``) from the working directory.
      2. Tokenize articles (with a ``"summarize: "`` prefix) and summaries.
      3. Train with ``Seq2SeqTrainer`` and save the model and tokenizer to
         ``./fine_tuned_mt5_summarization``.
      4. Reload the saved model and print a summary of a sample article.

    Returns None. If the dataset cannot be loaded, an error is printed and
    the function returns early without training.
    """
    df = _load_training_frame("Summarization_dataset.csv")
    if df is None:
        return

    # Convert the pandas DataFrame to a Hugging Face Dataset.
    dataset = Dataset.from_pandas(df, preserve_index=False)

    model_name = "google/mt5-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = MT5ForConditionalGeneration.from_pretrained(model_name)

    prefix = "summarize: "
    max_input_length = 512
    max_target_length = 150

    def preprocess_function(examples):
        """Tokenize a batch of articles and their reference summaries."""
        inputs = [prefix + str(article) for article in examples["article"]]
        targets = [str(summary) for summary in examples["highlights"]]

        # No padding here: DataCollatorForSeq2Seq pads each batch dynamically
        # and pads labels with -100, so padding positions are ignored by the
        # loss. Padding labels to max_length with the pad token id would make
        # the model learn to predict padding.
        model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
        # `text_target` replaces the deprecated `as_target_tokenizer()` context.
        labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        # Keep only the tokenized fields; the raw text columns are not model inputs.
        remove_columns=dataset.column_names,
    )

    training_args = Seq2SeqTrainingArguments(
        output_dir="./results_mt5_summarization",    # Directory to save the model and results
        num_train_epochs=50,                   # Total number of training epochs
        per_device_train_batch_size=2,         # Batch size per device during training
        per_device_eval_batch_size=2,          # Batch size for evaluation (if used)
        warmup_steps=50,                       # Number of warmup steps for learning rate scheduler
        weight_decay=0.01,                     # Strength of weight decay
        logging_dir='./logs_summarization',    # Directory for storing logs
        logging_steps=10,
        save_total_limit=2,                    # Only keep the last 2 saved models
        predict_with_generate=True,            # Whether to use generate to calculate generative metrics
        report_to="none",                      # Disable integration with Weights & Biases
    )

    # Data collator pads inputs and labels per batch.
    data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

    # The Trainer class handles the training loop.
    # NOTE(review): newer transformers releases rename `tokenizer=` to
    # `processing_class=` - confirm against the installed version.
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # --- Start Fine-Tuning ---
    print("Starting the fine-tuning process...")
    trainer.train()
    print("Fine-tuning complete.")

    # --- Save the Fine-Tuned Model ---
    final_model_path = "./fine_tuned_mt5_summarization"
    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    print(f"Model saved to {final_model_path}")

    # --- Check result ---
    print("\n--- Running Inference with the Fine-Tuned Model ---")

    # Reload from disk to confirm the saved artifacts are usable on their own.
    trained_model = MT5ForConditionalGeneration.from_pretrained(final_model_path)
    trained_tokenizer = AutoTokenizer.from_pretrained(final_model_path)

    # Define a new article to summarize
    article_to_summarize = "Scientists have discovered a new species of glowing frog in a remote rainforest. The frog, which emits a soft blue light, has unique bioluminescent properties that are not fully understood. Researchers believe this could lead to new advancements in medical imaging. The ecosystem where the frog was found is incredibly delicate and under threat from deforestation."

    print("Generating summary...")
    generated_text = _summarize(
        trained_model,
        trained_tokenizer,
        article_to_summarize,
        prefix,
        max_input_length,
        max_target_length,
    )

    print("\nOriginal Article:")
    print(article_to_summarize)
    print("\nGenerated Summary:")
    print(generated_text)

# Entry point: run the fine-tuning pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()