In [1]:
import os
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

In [2]:
data = pd.read_csv("clean.csv")  # should contain 'text' and 'title' columns
data = data.dropna(subset=['text', 'title'])

# Optional: rename columns for clarity
data = data.rename(columns={"text": "article", "title": "headline"})

# Create HuggingFace dataset
raw_dataset = Dataset.from_pandas(data)
dataset = raw_dataset.train_test_split(test_size=0.1)

In [3]:
len(data)

3354

In [4]:
data.head()

Unnamed: 0,headline,article,url
0,Ioan Gruffudd dealt new blow in bitter divorce...,"A month ago, Ioan Gruffudd revealed shocking t...",https://www.dailymail.co.uk/tvshowbiz/article-...
1,I was on Love Is Blind UK and even got engaged...,A Love Is Blind UK star revealed that they got...,https://www.dailymail.co.uk/tvshowbiz/tv/artic...
2,Emotional Zoe Ball reveals how gardening helpe...,Zoe Ball has opened up about how gardening hel...,https://www.dailymail.co.uk/tvshowbiz/article-...
3,John Travolta honors late wife while Emily Rat...,John Travolta honored his late wife Kelly Pres...,https://www.dailymail.co.uk/tvshowbiz/tvshowbi...
4,Margot Robbie stuns in a corset mini dress as ...,Margot Robbie looked stunning as she joined he...,https://www.dailymail.co.uk/tvshowbiz/article-...


In [5]:
# ------------------------------------------
# Load Tokenizer and Model
# ------------------------------------------
model_checkpoint = "facebook/bart-base"  # or try "google/pegasus-xsum", "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


In [6]:
# ------------------------------------------
# Preprocessing
# ------------------------------------------
max_input_length = 512
max_target_length = 64

prefix = "generate clickbait headline: "

def preprocess(examples):
    inputs = [prefix + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(examples["headline"], max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/3018 [00:00<?, ? examples/s]

Map:   0%|          | 0/336 [00:00<?, ? examples/s]

In [7]:
# ------------------------------------------
# Metrics for Evaluation
# ------------------------------------------
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v, 4) for k, v in result.items()}


In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./clickbait_model",
    do_train=True,
    do_eval=True,

    # core
    learning_rate=4e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    predict_with_generate=True,
    save_total_limit=2,

    # logging (force first-step log and visible tqdm)
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    disable_tqdm=False,          # <-- show live loss from tqdm

    # evaluation (your version uses eval_steps directly)
    eval_steps=500,

    # safeguards that also fix several zero-loss logging cases
    remove_unused_columns=False,
    label_names=["labels"],

    # MPS nicety
    dataloader_pin_memory=False,

    # misc
    push_to_hub=False,
    seed=42,
)


In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
1,4.2051
10,3.916
20,2.9466
30,2.5997
40,2.6953
50,2.7629
60,2.5157
70,2.7383
80,2.4489
90,2.6908




TrainOutput(global_step=1134, training_loss=2.1531106088527294, metrics={'train_runtime': 1012.883, 'train_samples_per_second': 8.939, 'train_steps_per_second': 1.12, 'total_flos': 2760276946452480.0, 'train_loss': 2.1531106088527294, 'epoch': 3.0})

In [19]:
from torch.utils.data import DataLoader
import torch

model.train()
dl = DataLoader(tokenized_dataset["train"], batch_size=8,
                collate_fn=DataCollatorForSeq2Seq(tokenizer, model=model))
batch = next(iter(dl))
with torch.no_grad():
    out = model(**{k: v.to(model.device) for k, v in batch.items() if k in ["input_ids","attention_mask","labels","decoder_input_ids"]})
print("manual batch loss:", float(out.loss))


manual batch loss: nan


In [13]:
# 1) Peek a real batch and make sure labels aren’t all -100
from torch.utils.data import DataLoader
dl = DataLoader(tokenized_dataset["train"], batch_size=8, collate_fn=data_collator)
batch = next(iter(dl))
print({k: (v.shape, v.dtype) for k, v in batch.items()})
import torch
all_ignored_per_example = (batch["labels"] == -100).all(dim=1)
print("labels all -100 per example:", all_ignored_per_example.tolist())
print("fraction of tokens kept in labels:",
      float((batch["labels"] != -100).sum()) / batch["labels"].numel())


{'input_ids': (torch.Size([8, 512]), torch.int64), 'attention_mask': (torch.Size([8, 512]), torch.int64), 'labels': (torch.Size([8, 50]), torch.int64), 'decoder_input_ids': (torch.Size([8, 50]), torch.int64)}
labels all -100 per example: [False, False, False, False, False, False, False, False]
fraction of tokens kept in labels: 0.575


In [9]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
10,4.0258
20,2.8565
30,2.2718
40,2.5854
50,3.1282
60,2.5208
70,2.2527
80,1.9782
90,1.8287
100,1.963




TrainOutput(global_step=216, training_loss=2.168049344310054, metrics={'train_runtime': 192.0611, 'train_samples_per_second': 8.997, 'train_steps_per_second': 1.125, 'total_flos': 526812299919360.0, 'train_loss': 2.168049344310054, 'epoch': 3.0})

In [10]:
# ------------------------------------------
# Save Model and Tokenizer
# ------------------------------------------
trainer.save_model("./clickbait_model")
tokenizer.save_pretrained("./clickbait_model")


('./clickbait_model/tokenizer_config.json',
 './clickbait_model/special_tokens_map.json',
 './clickbait_model/vocab.json',
 './clickbait_model/merges.txt',
 './clickbait_model/added_tokens.json',
 './clickbait_model/tokenizer.json')

In [11]:
def generate_clickbait(article_text):
    input_text = prefix + article_text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)
    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    model.to("cpu")
    outputs = model.generate(**inputs, max_length=max_target_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [12]:
article = """Country Road is fighting for its future largely thanks to a cost-cutting decision the company made more than 20 years ago, an expert says. The once-beloved Aussie brand is in clear trouble, with Country Road Group reporting in March its earnings were down almost 72 per cent at $14.2million for the last half of 2024.

One of its longstanding flagship stores at Sydney CBD's Queen Victoria Building has shut up shop, as has sister brand Trenery in Mosman, on Sydney's affluent lower north shore. Another CBD store in Sydney's Pitt Street Mall is expected to close when its lease expires in three years' time.

The video player is currently playing an ad. You can skip the ad in 5 sec with a mouse or keyboard
Queensland University of Technology marketing expert Gary Mortimer said Country Road had lost its iconic Australian lifestyle brand status since Woolworths Holdings took a controlling stake in the late 90s.

A cost-cutting move to manufacture offshore gradually eroded its 'Made in Australia' appeal and weakened the brand's authenticity, Professor Mortimer said.

'Since its launch in the mid-1970s, Country Road clothing was primarily made in Australia, specifically, the iconic chambray shirt which I and nearly every other young man had during that time was made here using Australian cotton,' he said.

'The company emphasised its commitment to Australian manufacturing during that time. 

'Much of that production has shifted to Bangladesh, China, India and Pakistan under new ownership, essentially losing the essence of what Country Road stood for.' """



In [12]:
article

"Country Road is fighting for its future largely thanks to a cost-cutting decision the company made more than 20 years ago, an expert says. The once-beloved Aussie brand is in clear trouble, with Country Road Group reporting in March its earnings were down almost 72 per cent at $14.2million for the last half of 2024.\n\nOne of its longstanding flagship stores at Sydney CBD's Queen Victoria Building has shut up shop, as has sister brand Trenery in Mosman, on Sydney's affluent lower north shore. Another CBD store in Sydney's Pitt Street Mall is expected to close when its lease expires in three years' time.\n\nThe video player is currently playing an ad. You can skip the ad in 5 sec with a mouse or keyboard\nQueensland University of Technology marketing expert Gary Mortimer said Country Road had lost its iconic Australian lifestyle brand status since Woolworths Holdings took a controlling stake in the late 90s.\n\nA cost-cutting move to manufacture offshore gradually eroded its 'Made in A

In [13]:
print(generate_clickbait(article))

Country Road's future is in clear trouble thanks to cost-cutting move to manufacture overseas


In [25]:
print(generate_clickbait(article))

Country Road's iconic brand is in clear trouble thanks to cost-cutting


In [20]:
print(generate_clickbait(article))

'Made in Australia' brand is in trouble thanks to cost-cutting decision 20 years ago


In [36]:
print(generate_clickbait(article))

The truth about Country Road: How it has lost its Aussie identity... thanks to a cost-cutting decision 20 years ago


In [13]:
print(generate_clickbait(article))

Country Road is in clear trouble thanks to cost-cutting decision the company made 20 years ago


In [18]:
print(generate_clickbait_baseline(article))


clickbait title: Country Road is fighting for its future largely thanks to a cost-cutting decision the company made more than 20 years ago, an expert says. The once-beloved Aussie brand is in clear trouble, with Country Road Group reporting in March its earnings were down almost 72 per cent


In [14]:
test = """
"Jessie J is reportedly planning to mark her TV comeback with a special appearance on Strictly Come Dancing this September following her mastectomy.

The singer, 37, had surgery two weeks ago and confirmed to her fans that 'no cancer spread', after revealing she had been diagnosed with breast cancer in June.

According to The Sun, BBC bosses believe she will kick off the new series perfectly as they welcome her return.

A source said: 'Jessie is putting her health and recovery first but is already excited for getting back on stage.

'She’s been booked for a TV comeback on Strictly in September and bosses can’t wait to have her on the show.

'Right now she’s feeling very grateful for life and all the opportunities coming her way.'

Jessie J, 37, is reportedly planning to mark her TV comeback with a special appearance on Strictly Come Dancing this September following her mastectomy (pictured in June)

The singer had surgery two weeks ago and confirmed to her fans that 'no cancer spread', after revealing she had been diagnosed with breast cancer in June

MailOnline has contacted Strictly and Jessie J's representatives for comment.

As well as the reported Strictly appearance, Jessie is also booked to perform at Radio 2 In The Park in September in Chelmsford.

Earlier this month, Jessie shared an adorable clip of her two-year-old son Sky the night before the operation.

In the sweet clip, Jessie is heard encouraging her boy to say 'Mummy's going to be okay' as she prepared for the surgery to remove her breast.

In the caption, she penned: 'AND.. I AM OK. Results = I have NO cancer spread. Happy tears are real', followed by several crying emojis.

The Price Tag hitmaker continued: 'Thank YOU for the prayers, the love, the well wishes, the joy and all the positive energy.

'This video is from the night before my surgery. We called it baby boy. You are my biggest ray of light and with you in my life, the darkness will never win.

'Lots of healing to go and one more surgery to make these cousins look more like sisters, but for now it's gratitude time and I am changing my name to The LopJess monster.'

BBC bosses believe she will kick off the new series perfectly as they welcome her return (pictured hosts Claudia Winkleman and Tess Daly)

It comes after Jessie revealed she wished she said goodbye to her breast before undergoing a mastectomy.

In an update last month, she said that she is experiencing some 'delayed sadness' and felt 'disappointed' that she didn't say goodbye.

Taking to her Instagram stories, she shared a candid post with fans, bravely opening up about her feelings.

The performer admitted that she went into 'survival mode' when she first found out about her diagnosis and is now letting herself be angry and sad.

In her post, Jessie wrote: '2 weeks post surgery. Had my drain out 2 nights ago. She said breathe in and take a hard breathe out. She whipped that thang out so quick. Woii oii. Weirdest feeling. But so nice to have it out after 12 days.

'Now it’s just me and my wonky boobs trying to figure out how to dress until I match them up. The left one is looking at me like ""you ok babe?""'

She continued: 'Also my experience was when I was diagnosed I went into survival mode.

'There was so much going on with appointments and in my mind esp with a toddler I had just moved and was about to start this campaign after 8 years away. Mad.

Earlier this month, Jessie shared an adorable clip of her two-year-old son Sky the night before the operation

In the caption, she penned: 'AND.. I AM OK. Results = I have NO cancer spread. Happy tears are real', followed by several crying emojis

'I didn’t really have a lot of time to process what was happening or what was going to happen. So I’m currently experiencing some delayed sadness and frustration by having time to process what IS happening.

Jessie added: 'A little disappointed in myself I didn’t say goodbye to my old boob enough. Sounds silly but that’s where I’m at. Again that’s my journey. I’m sure others feel different.

'But for me I didn’t think beyond the surgery. I was just being strong. Well now I’m here and letting myself be angry and sad and all the things. Just for a few days.

'Then I will sew some padding in a bra to even them out order some t-shirts and crack tf on'."
"""

In [15]:
print(generate_clickbait(test))

Jessie J set to mark TV comeback following breast cancer diagnosis following 'no cancer spread'


In [26]:
print(generate_clickbait(test))

Jessie J 'will mark TV comeback with a special appearance on Strictly' after breast cancer diagnosis


In [22]:
print(generate_clickbait(test))

Jessie J, 37, plans to mark her TV comeback with a special appearance on Strictly Come Dancing following breast cancer diagnosis


In [37]:
print(generate_clickbait(test))

BBC bosses convinced Jessie J to stage dramatic TV comeback with a special Strictly appearance following her mastectomy


In [16]:
# Aug 21
print(generate_clickbait(test))

Jessie J 'is planning to mark her TV comeback with a special appearance on Strictly' following breast cancer diagnosis


In [None]:
Jessie J 'set for huge TV comeback' following successful mastectomy amid breast cancer diagnosis

In [17]:
print(generate_clickbait_baseline(test))


clickbait title: 0000000000000000"Jessie J is reportedly planning to mark her TV comeback with a special appearance on Strictly Come Dancing this September following her mastectomy.The singer, 37, had surgery two weeks ago and confirmed to her fans that 'no cancer spread', after revealing she had


In [16]:

# Load original (unfined) model
model_ckpt = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

def generate_clickbait_baseline(article_text):
    prefix = "clickbait title: "  # Only needed if your fine-tuned model used a prefix
    input_text = prefix + article_text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    outputs = model.generate(**inputs, max_length=64, num_beams=4)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example
test_article = "Kate Middleton appeared at Wimbledon with her daughter Charlotte and caught fans by surprise."
print(generate_clickbait_baseline(test_article))


clickbait title: Kate Middleton appeared at Wimbledon with her daughter Charlotte and caught fans by surprise.


In [None]:
print(generate_clickbait_baseline(test_article))
