## Installing Packages (if needed)

In [None]:
# Use the %pip magic (not !pip) so the packages are installed into the
# environment of the kernel that is running this notebook.
%pip install -q transformers datasets accelerate sentencepiece evaluate rouge-score


## Loading Packages

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
import evaluate
import torch


Checking for GPU

In [None]:
# Pick the compute device: "cuda" when torch reports an available GPU,
# otherwise fall back to "cpu". The bare name at the end echoes the choice.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

## Loading Multi-News Dataset

In [None]:
# Load the Multi-News dataset by its repository id.
raw_datasets = load_dataset("Awesome075/multi_news_parquet")

# Show the dataset object first, then the feature schema of the train split.
print(raw_datasets, raw_datasets["train"].features, sep="\n")

### Looking at a sample from the dataset

In [None]:
# Inspect the first training example: the opening of the source text,
# a separator, and then the full reference summary.
sample = raw_datasets["train"][0]
document_preview = sample["document"][:1000]  # first 1000 chars
print(document_preview, "\n==== SUMMARY ====\n", sample["summary"], sep="\n")

## Loading BART Large CNN model (for text summarization)

In [None]:
# Checkpoint name shared by the tokenizer and the model.
model_name = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq weights, then move them onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)


Creating the preprocessing function (for tokenization)

In [None]:
max_source_length = 1024    # max number of tokens kept from each news article (feature)
max_target_length = 160     # max number of tokens kept from each summary (label)


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a "document" entry and a "summary" entry,
            as passed in by a batched ``Dataset.map`` call.

    Returns:
        The tokenizer output for the documents, with the token ids of the
        summaries stored under the "labels" key.

    Note:
        Sequences are truncated here but deliberately not padded. Padding the
        labels to a fixed length would leave pad-token ids inside ``labels``,
        and the training loss would then be computed on padding. Padding is
        left to the ``DataCollatorForSeq2Seq`` passed to the trainer, which
        pads each batch to its own longest sequence and fills label padding
        with the ignore index.
    """
    # Source side: truncate long articles, leave padding to the collator.
    model_inputs = tokenizer(
        examples["document"],
        max_length=max_source_length,
        truncation=True,
    )

    # Target side: tokenized via text_target so the tokenizer treats it as labels.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


Tokenize the training and validation subsets (evaluating on the validation set took way too long, though, so that part was mostly scrapped)

In [None]:
# Start with a subset: roughly 10k training and 1k validation examples.
train_size = 10_000
val_size   = 1_000


def _tokenize_split(split):
    """Apply ``preprocess_function`` to ``split`` in batches, dropping its original columns."""
    return split.map(
        preprocess_function,
        batched=True,
        remove_columns=split.column_names,
    )


# Take the leading rows of each split.
train_raw_big = raw_datasets["train"].select(range(train_size))
val_raw_big = raw_datasets["validation"].select(range(val_size))

tokenized_train = _tokenize_split(train_raw_big)
tokenized_val = _tokenize_split(val_raw_big)


In [None]:
import os

# Set WANDB_DISABLED for this process (presumably to keep Weights & Biases
# logging switched off during training).
os.environ.update(WANDB_DISABLED="true")


In [None]:
# Seq2seq batch collator built from the tokenizer and model loaded above.
# NOTE(review): the Seq2SeqTrainer cell further down constructs its own
# DataCollatorForSeq2Seq inline, so this instance is not the one it receives.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


## Fine-Tuning the Model with Multinews

In [None]:
# Per-device batch size and the number of batches whose gradients are
# accumulated before each optimizer step.
batch_size = 1
gradient_accumulation_steps = 8

training_args = Seq2SeqTrainingArguments(
    # --- output location, logging, evaluation and checkpoint cadence ---
    output_dir="/content/news_pulse_bart_multinews",
    report_to="none",
    logging_first_step=True,
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,

    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,

    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_steps=500,

    # --- generation settings used when predicting ---
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=4,

    # Half precision only when a CUDA device is available.
    fp16=torch.cuda.is_available(),
)

# Gather everything the trainer needs: model, arguments, the tokenized
# train/validation subsets, the tokenizer and a freshly built seq2seq collator.
# No metric function is supplied (compute_metrics stays None).
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model),
    compute_metrics=None,
)

trainer = Seq2SeqTrainer(**_trainer_kwargs)


In [None]:
# Run fine-tuning with the trainer configured above. The returned object is
# kept in train_result and echoed as the cell output by the bare name below.
train_result = trainer.train()
train_result

In [None]:
# Decide on a local folder for the fine-tuned model and tokenizer.
from pathlib import Path

# When the notebook runs from a "notebooks" folder, treat its parent as the
# project root; otherwise use the current working directory itself.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == "notebooks" else _cwd

save_dir = PROJECT_ROOT.joinpath("models", "bart_summarizer_finetuned")
save_dir.mkdir(parents=True, exist_ok=True)

# Write the model and the tokenizer side by side into the same folder.
_save_path = str(save_dir)
trainer.save_model(_save_path)
tokenizer.save_pretrained(_save_path)

print("Saved to:", save_dir)

### Example of generated summary VS hand written summary

In [None]:
# Same length limits as used for preprocessing.
max_source_length = 1024
max_target_length = 160

# Compare the model's output with the hand-written summary of the first test example.
test_example = raw_datasets["test"][0]

input_text = test_example["document"]
ref_summary = test_example["summary"]

print("REFERENCE SUMMARY:\n", ref_summary[:800], "...\n")

# Switch to inference mode before generating. This cell runs right after
# training, and generate() does not change the module's train/eval state, so
# dropout could otherwise still be active and make the summary noisy and
# non-deterministic.
model.eval()

# Tokenize the single document as a batch of one and move the tensors to the
# device the model lives on.
inputs = tokenizer(
    input_text,
    max_length=max_source_length,
    truncation=True,
    return_tensors="pt",
).to(model.device)

# Beam-search decoding with the same beam count as the training arguments.
summary_ids = model.generate(
    **inputs,
    max_length=max_target_length,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
    no_repeat_ngram_size=3,
)

generated = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("GENERATED SUMMARY:\n", generated)
