In [None]:
from datasets import load_dataset

# Load the CNN/DailyMail dataset (configuration "3.0.0").
dataset = load_dataset("cnn_dailymail", "3.0.0")

# Pull out the first training record and show it to check its fields.
first_train_record = dataset["train"][0]
print("Sample data:", first_train_record)  # Verify data structure

Sample data: {'article': 'LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won\'t cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don\'t plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don\'t think I\'ll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a pub or see the horror film "Hostel: Part II," currently six places below his number one movie on the UK bo

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused when the model is loaded.
model_name = "google/flan-t5-small"

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

def preprocess_function(examples, max_input_length=512, max_target_length=128):
    """Tokenize a batch of articles together with their reference summaries.

    Args:
        examples: Batch mapping that provides an "article" list (source
            texts) and a "highlights" list (target summaries).
        max_input_length: Maximum number of tokens kept for each prefixed
            article; longer inputs are truncated.
        max_target_length: Maximum number of tokens kept for each summary;
            longer targets are truncated.

    Returns:
        The tokenizer output for the prefixed articles, extended with a
        "labels" entry holding the token ids of the summaries.
    """
    # Every article gets the same "summarize: " instruction prefix.
    inputs = ["summarize: " + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target` sends the summaries through the tokenizer's target-side
    # path, the documented way to build labels for seq2seq models.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the preprocessing to the dataset in batches and keep the result.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import os

# Configuration
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging

# Upper bound on how many training examples this run uses.
MAX_TRAIN_SAMPLES = 150000

# No explicit `.to("cuda")`: the Trainer moves the model to the device it
# selects, so this cell no longer fails on a machine without a GPU.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Data collator for efficient batching (pads inputs and labels per batch)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8  # Pad sequence lengths up to a multiple of 8
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./flan-t5-summarizer",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    learning_rate=5e-5,
    warmup_steps=500,
    logging_steps=100,
    save_steps=500,
    # With fp16=True the logged training loss was 0.0 at every step, which is
    # consistent with fp16 overflow in T5 checkpoints. Train in full precision
    # instead; bf16=True is an alternative on GPUs that support it.
    fp16=False,
    gradient_accumulation_steps=2,
    # No evaluation runs during training, so `eval_steps` is not set (it has
    # no effect while eval_strategy is "no").
    eval_strategy="no",
    predict_with_generate=True,
    report_to="none"
)

# Clamp the subset size so the selection stays valid if the split is smaller.
train_split = tokenized_dataset["train"]
train_subset = train_split.select(
    range(min(MAX_TRAIN_SAMPLES, len(train_split)))
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    data_collator=data_collator
    )

# Start training
trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0


TrainOutput(global_step=625, training_loss=0.0, metrics={'train_runtime': 218.4607, 'train_samples_per_second': 22.887, 'train_steps_per_second': 2.861, 'total_flos': 929045888040960.0, 'train_loss': 0.0, 'epoch': 1.0})

In [None]:
article = """
NASA's Artemis program aims to return humans to the Moon by 2025.
The mission will include the first woman and person of color to land on the lunar surface.
"""

# The model may still be in training mode after fine-tuning; switch to eval
# mode so dropout is disabled while generating.
model.eval()

# Use the same "summarize: " prefix and 512-token cap as preprocessing, and
# place the tensors on whichever device the model is on.
inputs = tokenizer(
    "summarize: " + article,
    max_length=512,
    truncation=True,
    return_tensors="pt",
).to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print("Generated Summary:", tokenizer.decode(outputs[0], skip_special_tokens=True))

Generated Summary: The program will include the first woman and a person of color to land on the Moon by 2025.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Write the model first, then the tokenizer, into the same Drive folder.
export_dir = "/content/drive/MyDrive/flan-t5-summarizer"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)
print("Model saved to Google Drive!")

Mounted at /content/drive
Model saved to Google Drive!
