In [None]:
%pip install transformers datasets

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration

# Identifier of the pretrained checkpoint that both loaders below resolve.
model_name = "facebook/bart-large-cnn"

# Load the tokenizer first, then the model, from the same checkpoint so the
# vocabulary and the weights match.
tokenizer, model = (
    BartTokenizer.from_pretrained(model_name),
    BartForConditionalGeneration.from_pretrained(model_name),
)

In [None]:
from datasets import load_dataset

# Read the JSON-lines files into a dataset with "train" and "validation" splits.
dataset = load_dataset(
    "json",
    data_files=dict(
        train="data/train.jsonl",
        validation="data/val.jsonl",
    ),
)

# Tokenization
def tokenize(batch):
    """Tokenize a batch of source texts and reference summaries for training.

    Args:
        batch: Mapping with a "text" entry (model inputs) and a "summary"
            entry (targets), each a list of strings.

    Returns:
        The tokenizer output for the texts (padded/truncated to 1024 tokens)
        with an added "labels" entry: the summary token ids (padded/truncated
        to 512 tokens) in which every padding id has been replaced by -100.
    """
    inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=1024)
    targets = tokenizer(batch["summary"], padding="max_length", truncation=True, max_length=512)

    # Label positions equal to -100 are ignored by the loss. Leaving the pad id
    # in place would make the padded tail of every summary count as a target
    # and train the model to emit padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in summary_ids]
        for summary_ids in targets["input_ids"]
    ]
    return inputs

# Apply `tokenize` to every split, passing examples in batches rather than
# one at a time.
tokenized_dataset = dataset.map(
    function=tokenize,
    batched=True,
)

In [None]:
from transformers import TrainingArguments, Trainer

import inspect

import torch

# The evaluation-schedule keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. Pick whichever name the
# installed version accepts so the cell runs on both.
_eval_kwarg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./bart-html-finetuned",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    # Mixed precision is only valid on a CUDA device; enabling it
    # unconditionally makes this cell fail on CPU-only machines.
    fp16=torch.cuda.is_available(),
    **{_eval_kwarg: "epoch"},
)

# Likewise, `Trainer(tokenizer=...)` was superseded by `processing_class=...`.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    **{_tokenizer_kwarg: tokenizer},
)

In [None]:
# Start training with the `trainer` configured in the previous cell.
# The call is the last expression in the cell, so its return value is shown
# as the cell output.
trainer.train()

In [None]:
input_text = "The OWASP Top 10 for LLM Applications 2025 outlines major risks..."

# Switch to inference mode so dropout is disabled during generation.
model.eval()

# Tokenize, then move the tensors to the model's device: after training the
# model may be on a GPU while the tokenizer output is created on the CPU, and
# `generate` fails when the two differ.
inputs = tokenizer(input_text, return_tensors="pt", max_length=1024, truncation=True).to(model.device)

# Pass the attention mask explicitly alongside the input ids.
summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=512,
    min_length=50,
    do_sample=False,
)

# Decode the first (only) generated sequence, dropping special tokens.
html_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(html_summary)