In [3]:
!pip install -q transformers datasets evaluate nltk rouge_score py7zr
!pip install -q accelerate
!pip install -q sentencepiece

In [4]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    get_scheduler,
    set_seed
)
from datasets import load_dataset
import evaluate
import nltk
from nltk.tokenize import sent_tokenize
import wandb
import gc
from tqdm.auto import tqdm

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
SEED = 42
set_seed(SEED)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")

Using cuda device


In [8]:
dataset = load_dataset("EdinburghNLP/xsum")

README.md:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

xsum.py:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/304M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/16.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/17.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/204045 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11332 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11334 [00:00<?, ? examples/s]

In [9]:
print(f"Dataset loaded: {dataset}")
print(f"Train set size: {len(dataset['train'])}")
print(f"Validation set size: {len(dataset['validation'])}")
print(f"Test set size: {len(dataset['test'])}")

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})
Train set size: 204045
Validation set size: 11332
Test set size: 11334


In [10]:
sample = dataset["train"][0]
print("\nSample document:")
print(sample["document"][:500] + "...\n")
print("Sample summary:")
print(sample["summary"])


Sample document:
The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town.
First Minister Nicola Sturgeon visited the area to inspect the damage.
The water...

Sample summary:
Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.


In [11]:
def analyze_dataset(dataset, split="train", num_samples=1000):
    """Analyze document and summary lengths in the dataset."""
    if num_samples > len(dataset[split]):
        num_samples = len(dataset[split])
    
    doc_lengths = []
    summary_lengths = []
    compression_ratios = []
    
    for i in range(num_samples):
        doc = dataset[split][i]["document"]
        summary = dataset[split][i]["summary"]
        
        doc_words = len(doc.split())
        summary_words = len(summary.split())
        
        doc_lengths.append(doc_words)
        summary_lengths.append(summary_words)
        
        if doc_words > 0:
            compression_ratios.append(summary_words / doc_words)
    
    return {
        "doc_lengths": {
            "mean": np.mean(doc_lengths),
            "median": np.median(doc_lengths),
            "min": np.min(doc_lengths),
            "max": np.max(doc_lengths),
        },
        "summary_lengths": {
            "mean": np.mean(summary_lengths),
            "median": np.median(summary_lengths),
            "min": np.min(summary_lengths),
            "max": np.max(summary_lengths),
        },
        "compression_ratio": {
            "mean": np.mean(compression_ratios),
            "median": np.median(compression_ratios),
        }
    }

In [12]:
analysis = analyze_dataset(dataset)
print("\nDataset Analysis:")
print(f"Document length (words): {analysis['doc_lengths']}")
print(f"Summary length (words): {analysis['summary_lengths']}")
print(f"Compression ratio: {analysis['compression_ratio']}")


Dataset Analysis:
Document length (words): {'mean': 361.979, 'median': 289.0, 'min': 11, 'max': 2694}
Summary length (words): {'mean': 21.101, 'median': 21.0, 'min': 1, 'max': 48}
Compression ratio: {'mean': 0.10330915428072975, 'median': 0.07246376811594203}


In [13]:
MODEL_NAME = "t5-base"  # Options: t5-small, t5-base, t5-large, t5-3b, t5-11b
MAX_SOURCE_LENGTH = 512 
MAX_TARGET_LENGTH = 64

In [14]:
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [15]:
print(f"Model: {MODEL_NAME}")
print(f"Vocabulary size: {tokenizer.vocab_size}")
print(f"Maximum source length: {MAX_SOURCE_LENGTH}")
print(f"Maximum target length: {MAX_TARGET_LENGTH}")

Model: t5-base
Vocabulary size: 32000
Maximum source length: 512
Maximum target length: 64


In [16]:
PREFIX = "summarize: "

In [17]:
def preprocess_function(examples):
    """
    Preprocess the dataset for T5 fine-tuning.
    T5 was trained with the prefix format, so we add "summarize: " before the input text.
    """
    # Add prefix to the inputs
    inputs = [PREFIX + doc for doc in examples["document"]]
    
    # Tokenize inputs and targets
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_SOURCE_LENGTH,
        padding="max_length",
        truncation=True,
    )
    
    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"],
            max_length=MAX_TARGET_LENGTH,
            padding="max_length",
            truncation=True,
        )
    
    model_inputs["labels"] = labels["input_ids"]
    
    # Replace padding token id with -100 so it's ignored in loss calculation
    for i in range(len(model_inputs["labels"])):
        model_inputs["labels"][i] = [
            -100 if token == tokenizer.pad_token_id else token 
            for token in model_inputs["labels"][i]
        ]
    
    return model_inputs

In [18]:
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    desc="Preprocessing dataset",
)

print("Dataset preprocessing completed.")
print(f"Columns in processed dataset: {tokenized_datasets['train'].column_names}")

Preprocessing dataset:   0%|          | 0/204045 [00:00<?, ? examples/s]



Preprocessing dataset:   0%|          | 0/11332 [00:00<?, ? examples/s]

Preprocessing dataset:   0%|          | 0/11334 [00:00<?, ? examples/s]

Dataset preprocessing completed.
Columns in processed dataset: ['input_ids', 'attention_mask', 'labels']


In [19]:
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
print(f"Model loaded: {MODEL_NAME}")
print(f"Number of parameters: {model.num_parameters():,}")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Model loaded: t5-base
Number of parameters: 222,903,552


In [20]:
model = model.to(device)

In [21]:
batch_size = 8
gradient_accumulation_steps = 4
effective_batch_size = batch_size * gradient_accumulation_steps

In [22]:
training_args = Seq2SeqTrainingArguments(
    output_dir=f"./results/{MODEL_NAME}-xsum",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,  # Mixed precision training
    gradient_accumulation_steps=gradient_accumulation_steps,
    generation_max_length=MAX_TARGET_LENGTH,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
    greater_is_better=True,
    warmup_steps=500,
    report_to="none",  # Set to "wandb" if using Weights & Biases
)

print(f"Effective batch size: {effective_batch_size}")

Effective batch size: 32


In [23]:
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [24]:
def compute_metrics(eval_preds):
    """Compute ROUGE metrics for evaluation."""
    preds, labels = eval_preds
    
    # Decode generated summaries
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    
    # Compute ROUGE scores
    result = rouge_metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        use_stemmer=True,
    )
    
    # Extract median scores
    result = {k: round(v * 100, 4) for k, v in result.items()}
    
    return result

In [25]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

In [26]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(


In [29]:
def custom_training_loop(model, tokenizer, train_dataset, val_dataset, num_epochs=3):
    """
    Custom training loop with linear scheduler with warmup.
    This demonstrates more control over the training process compared to using Trainer.
    """
    from torch.utils.data import DataLoader
    from torch.optim import AdamW
    
    # Prepare data loaders
    train_dataloader = DataLoader(
        train_dataset, 
        batch_size=batch_size, 
        shuffle=True, 
        collate_fn=data_collator
    )
    
    val_dataloader = DataLoader(
        val_dataset, 
        batch_size=batch_size, 
        collate_fn=data_collator
    )
    
    # Optimizer
    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
    
    # Learning rate scheduler
    total_steps = len(train_dataloader) * num_epochs
    warmup_steps = int(0.1 * total_steps)  # 10% of total steps for warmup
    
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )
    
    # Training loop
    progress_bar = tqdm(range(total_steps))
    global_step = 0
    best_rouge1 = 0
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        
        for batch in train_dataloader:
            # Move batch to device
            batch = {k: v.to(device) for k, v in batch.items()}
            
            # Forward pass
            outputs = model(**batch)
            loss = outputs.loss
            
            # Scale loss for gradient accumulation
            loss = loss / gradient_accumulation_steps
            loss.backward()
            
            if (global_step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            
            train_loss += loss.item()
            global_step += 1
            progress_bar.update(1)
            
            # Log training loss
            if global_step % 100 == 0:
                print(f"Step {global_step}: Loss = {train_loss / (global_step % len(train_dataloader) or 1)}")
        
        avg_train_loss = train_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{num_epochs}, Average training loss: {avg_train_loss}")
        
        # Evaluation
        model.eval()
        val_loss = 0
        all_preds = []
        all_labels = []
        original_references = []
        
        # Get actual reference summaries for proper evaluation
        for idx, batch in enumerate(val_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            
            with torch.no_grad():
                outputs = model(**batch)
                val_loss += outputs.loss.item()
                
                # Generate predictions
                generated_tokens = model.generate(
                    batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    max_length=MAX_TARGET_LENGTH,
                    num_beams=4,
                    length_penalty=0.6,
                    early_stopping=True,
                )
                
                # Decode generated tokens and labels
                decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
                all_preds.extend(decoded_preds)
                
                # Replace -100 in labels with pad token id for decoding
                labels = batch["labels"].detach().cpu().numpy()
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
                all_labels.extend(decoded_labels)
        
        # Process predictions and references for ROUGE
        processed_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in all_preds]
        processed_labels = ["\n".join(sent_tokenize(label.strip())) for label in all_labels]
        
        # Compute metrics directly with processed text
        rouge_result = rouge_metric.compute(
            predictions=processed_preds,
            references=processed_labels,
            use_stemmer=True,
        )
        
        # Extract and format scores
        metrics = {k: round(v * 100, 4) for k, v in rouge_result.items()}
        
        avg_val_loss = val_loss / len(val_dataloader)
        print(f"Validation loss: {avg_val_loss}")
        print(f"ROUGE scores: {metrics}")
        
        # Save best model
        if metrics["rouge1"] > best_rouge1:
            best_rouge1 = metrics["rouge1"]
            # Save model checkpoint
            torch.save(model.state_dict(), f"./checkpoint_epoch_{epoch+1}.pt")
            print(f"New best model saved with ROUGE-1: {best_rouge1}")
    
    return model

In [30]:
# Uncomment to use custom training loop instead of the Trainer
custom_model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME).to(device)
custom_model = custom_training_loop(
    custom_model, 
    tokenizer, 
    tokenized_datasets["train"], 
    tokenized_datasets["validation"], 
    num_epochs=1
)

  0%|          | 0/25506 [00:00<?, ?it/s]

Step 100: Loss = 0.846891377568245
Step 200: Loss = 0.8476318830251693
Step 300: Loss = 0.8447808194160461
Step 400: Loss = 0.8415350131690502
Step 500: Loss = 0.833954271197319
Step 600: Loss = 0.8264095644156139
Step 700: Loss = 0.8190177027668272
Step 800: Loss = 0.8100372513383627
Step 900: Loss = 0.8013554637961917
Step 1000: Loss = 0.7927718778848648
Step 1100: Loss = 0.7841565965522419
Step 1200: Loss = 0.7749715100228787
Step 1300: Loss = 0.7669051455534421
Step 1400: Loss = 0.7590381034782955
Step 1500: Loss = 0.7510470645427704
Step 1600: Loss = 0.7437663317471742
Step 1700: Loss = 0.7373195025850745
Step 1800: Loss = 0.7316151083509127
Step 1900: Loss = 0.7258688671808494
Step 2000: Loss = 0.7201008120030165
Step 2100: Loss = 0.7148372542432376
Step 2200: Loss = 0.710067094726996
Step 2300: Loss = 0.7054087737850521
Step 2400: Loss = 0.7008772186934948
Step 2500: Loss = 0.6966613448023796
Step 2600: Loss = 0.6927011335698458
Step 2700: Loss = 0.6891120425418571
Step 2800: Lo

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/opt/venv/nltk_data'
    - '/opt/venv/share/nltk_data'
    - '/opt/venv/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [31]:
def generate_summary(text, model, tokenizer, max_length=MAX_TARGET_LENGTH):
    """Generate a summary for the given text."""
    # Prepare input
    input_text = PREFIX + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=MAX_SOURCE_LENGTH, truncation=True).to(device)
    
    # Generate
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=4,
        length_penalty=0.6,
        early_stopping=True,
    )
    
    # Decode and return the summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

In [32]:
test_samples = dataset["test"].select(range(5))

In [33]:
for i, sample in enumerate(test_samples):
    document = sample["document"]
    reference_summary = sample["summary"]
    
    # Generate summary
    generated_summary = generate_summary(document, model, tokenizer)
    
    print(f"\nExample {i+1}:")
    print(f"Document (truncated): {document[:200]}...")
    print(f"Reference summary: {reference_summary}")
    print(f"Generated summary: {generated_summary}")
    
    # Calculate ROUGE for individual example
    rouge_result = rouge_metric.compute(
        predictions=[generated_summary],
        references=[reference_summary],
        use_stemmer=True,
    )
    
    print(f"ROUGE scores: {rouge_result}")


Example 1:
Document (truncated): Prison Link Cymru had 1,099 referrals in 2015-16 and said some ex-offenders were living rough for up to a year before finding suitable accommodation.
Workers at the charity claim investment in housing...
Reference summary: There is a "chronic" need for more housing for prison leavers in Wales, according to a charity.
Generated summary: prison link cymru says some ex-offenders are living rough for up to a year . charity says investment in housing would be cheaper than jailing homeless repeat offenders . the government says more people than ever are getting help to address housing problems .
ROUGE scores: {'rouge1': 0.27118644067796605, 'rouge2': 0.03508771929824562, 'rougeL': 0.13559322033898302, 'rougeLsum': 0.13559322033898302}

Example 2:
Document (truncated): Officers searched properties in the Waterfront Park and Colonsay View areas of the city on Wednesday.
Detectives said three firearms, ammunition and a five-figure sum of money were recovered.
A

In [34]:
def evaluate_on_test_set(model, test_dataset, tokenizer, batch_size=4):
    """Evaluate the model on the full test set."""
    # Preprocess test set
    preprocessed_test = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names,
        desc="Preprocessing test set",
    )
    
    # Create test dataloader
    test_dataloader = torch.utils.data.DataLoader(
        preprocessed_test,
        batch_size=batch_size,
        collate_fn=data_collator,
    )
    
    # Generate summaries for entire test set
    model.eval()
    all_generated_summaries = []
    all_reference_summaries = []
    
    # Original test data for reference summaries
    original_test_data = dataset["test"]
    
    for i, batch in enumerate(tqdm(test_dataloader, desc="Generating summaries")):
        batch = {k: v.to(device) for k, v in batch.items()}
        
        with torch.no_grad():
            generated_tokens = model.generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=MAX_TARGET_LENGTH,
                num_beams=4,
                length_penalty=0.6,
                early_stopping=True,
            )
            
            # Decode generated summaries
            decoded_summaries = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            all_generated_summaries.extend(decoded_summaries)
            
            # Get reference summaries from original dataset
            start_idx = i * batch_size
            end_idx = min(start_idx + batch_size, len(original_test_data))
            reference_summaries = [original_test_data[j]["summary"] for j in range(start_idx, end_idx)]
            all_reference_summaries.extend(reference_summaries)
    
    # Compute ROUGE scores
    rouge_results = rouge_metric.compute(
        predictions=all_generated_summaries,
        references=all_reference_summaries,
        use_stemmer=True,
    )
    
    # Format results for display
    formatted_results = {k: round(v * 100, 2) for k, v in rouge_results.items()}
    return formatted_results, all_generated_summaries, all_reference_summaries

In [35]:
print("Evaluating on test set...")
test_results, generated_summaries, reference_summaries = evaluate_on_test_set(
    model, dataset["test"], tokenizer
)

Evaluating on test set...


Preprocessing test set:   0%|          | 0/11334 [00:00<?, ? examples/s]



Generating summaries:   0%|          | 0/2834 [00:00<?, ?it/s]

In [36]:
print("Test set evaluation results:")
print(test_results)

Test set evaluation results:
{'rouge1': 20.45, 'rouge2': 3.09, 'rougeL': 13.87, 'rougeLsum': 13.87}


In [37]:
def compare_generation_strategies(text, model, tokenizer, max_length=MAX_TARGET_LENGTH):
    """Compare different generation strategies for the given text."""
    input_text = PREFIX + text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=MAX_SOURCE_LENGTH, truncation=True).to(device)
    
    # Strategy 1: Standard Beam Search
    outputs_beam = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=4,
        early_stopping=True,
    )
    
    # Strategy 2: Beam Search with Length Penalty
    outputs_length_penalty = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=4,
        length_penalty=0.6,  # < 1.0 favors shorter sequences
        early_stopping=True,
    )
    
    # Strategy 3: Diverse Beam Search
    outputs_diverse_beam = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        num_beams=4,
        num_beam_groups=4,
        diversity_penalty=0.5,  # Promotes diversity between groups
        early_stopping=True,
    )
    
    # Strategy 4: Top-p (Nucleus) Sampling
    outputs_top_p = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=max_length,
        do_sample=True,
        top_p=0.92,
        top_k=0,
        temperature=0.7,
    )
    
    # Decode
    summary_beam = tokenizer.decode(outputs_beam[0], skip_special_tokens=True)
    summary_length_penalty = tokenizer.decode(outputs_length_penalty[0], skip_special_tokens=True)
    summary_diverse_beam = tokenizer.decode(outputs_diverse_beam[0], skip_special_tokens=True)
    summary_top_p = tokenizer.decode(outputs_top_p[0], skip_special_tokens=True)
    
    return {
        "standard_beam": summary_beam,
        "length_penalty": summary_length_penalty,
        "diverse_beam": summary_diverse_beam,
        "top_p_sampling": summary_top_p
    }

In [38]:
sample_doc = dataset["test"][10]["document"]
reference = dataset["test"][10]["summary"]

In [39]:
print("\nComparing different generation strategies:")
print(f"Document (truncated): {sample_doc[:200]}...")
print(f"Reference summary: {reference}")


Comparing different generation strategies:
Document (truncated): The move is in response to an £8m cut in the subsidy received from the Department of Employment and Learning (DEL).
The cut in undergraduate places will come into effect from September 2015.
Job losse...
Reference summary: Queen's University Belfast is cutting 236 jobs and 290 student places due to a funding reduction.


In [40]:
generation_results = compare_generation_strategies(sample_doc, model, tokenizer)

In [41]:
for strategy, summary in generation_results.items():
    print(f"\n{strategy.upper()} strategy:")
    print(summary)
    
    # Calculate ROUGE for individual strategy
    rouge_result = rouge_metric.compute(
        predictions=[summary],
        references=[reference],
        use_stemmer=True,
    )
    
    formatted_rouge = {k: round(v * 100, 2) for k, v in rouge_result.items()}
    print(f"ROUGE scores: {formatted_rouge}")


STANDARD_BEAM strategy:
there are currently around 17,000 full-time undergraduate and postgraduate students at the university, and around 3,800 staff . the university aims to reduce the number of student places by 1,010 over the next three years . there are no immediate plans to close departments or courses .
ROUGE scores: {'rouge1': 15.38, 'rouge2': 3.17, 'rougeL': 15.38, 'rougeLsum': 15.38}

LENGTH_PENALTY strategy:
there are currently around 17,000 full-time undergraduate and postgraduate students at the university, and around 3,800 staff . the university aims to reduce the number of student places by 1,010 over the next three years . there are no immediate plans to close departments or courses .
ROUGE scores: {'rouge1': 15.38, 'rouge2': 3.17, 'rougeL': 15.38, 'rougeLsum': 15.38}

DIVERSE_BEAM strategy:
the move is in response to an £8m cut in the subsidy received from the department of employment and learning (DEL) there are currently around 17,000 full-time undergraduate and post

In [42]:
def compare_task_prefixes(text, model, tokenizer, max_length=MAX_TARGET_LENGTH):
    """Compare different task prefixes for T5 summarization."""
    prefixes = [
        "summarize: ",
        "generate summary: ",
        "tl;dr: ",
        "summarization: ",
    ]
    
    results = {}
    
    for prefix in prefixes:
        input_text = prefix + text
        inputs = tokenizer(input_text, return_tensors="pt", max_length=MAX_SOURCE_LENGTH, truncation=True).to(device)
        
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_beams=4,
            length_penalty=0.6,
            early_stopping=True,
        )
        
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        results[prefix] = summary
    
    return results

In [43]:
print("\nComparing different T5 task prefixes:")
prefix_results = compare_task_prefixes(sample_doc, model, tokenizer)

for prefix, summary in prefix_results.items():
    print(f"\nPrefix: '{prefix}'")
    print(f"Generated summary: {summary}")
    
    # Calculate ROUGE for individual prefix
    rouge_result = rouge_metric.compute(
        predictions=[summary],
        references=[reference],
        use_stemmer=True,
    )
    
    formatted_rouge = {k: round(v * 100, 2) for k, v in rouge_result.items()}
    print(f"ROUGE scores: {formatted_rouge}")


Comparing different T5 task prefixes:

Prefix: 'summarize: '
Generated summary: there are currently around 17,000 full-time undergraduate and postgraduate students at the university, and around 3,800 staff . the university aims to reduce the number of student places by 1,010 over the next three years . there are no immediate plans to close departments or courses .
ROUGE scores: {'rouge1': 15.38, 'rouge2': 3.17, 'rougeL': 15.38, 'rougeLsum': 15.38}

Prefix: 'generate summary: '
Generated summary: queen's vice-chancellor Patrick Johnston said the cuts had the potential to damage the reputation of the university . there are currently around 17,000 full-time undergraduate and postgraduate students at the university, and around 3,800 staff .
ROUGE scores: {'rouge1': 25.0, 'rouge2': 3.7, 'rougeL': 17.86, 'rougeLsum': 17.86}

Prefix: 'tl;dr: '
Generated summary: queen's vice-chancellor Patrick Johnston says the cuts have the potential to damage the reputation of the university . there are cu

In [44]:
!mkdir -p ./model_export

In [45]:
tokenizer.save_pretrained("./model_export/tokenizer")

('./model_export/tokenizer/tokenizer_config.json',
 './model_export/tokenizer/special_tokens_map.json',
 './model_export/tokenizer/spiece.model',
 './model_export/tokenizer/added_tokens.json')

In [46]:
model_path = "./model_export/model"
model.save_pretrained(model_path)