In [1]:
import os

import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Load the article/headline pairs. The CSV must provide 'text' and 'title'
# columns; rows missing either one are unusable for training and are dropped.
data = (
    pd.read_csv("clean.csv")
    .dropna(subset=["text", "title"])
    # Rename so the rest of the notebook reads as article -> headline.
    .rename(columns={"text": "article", "title": "headline"})
)

# Create HuggingFace dataset.
# preserve_index=False: after dropna the frame no longer has a plain range
# index, and from_pandas would otherwise carry it along as an extra column.
raw_dataset = Dataset.from_pandas(data, preserve_index=False)

# Fixed seed so the 90/10 split (and therefore any reported metric) is
# reproducible across kernel restarts.
dataset = raw_dataset.train_test_split(test_size=0.1, seed=42)

In [3]:
# ------------------------------------------
# Load Tokenizer and Model
# ------------------------------------------
# Checkpoint to fine-tune. Alternatives worth trying: "google/pegasus-xsum", "t5-base".
model_checkpoint = "facebook/bart-base"

# Tokenizer and model are both resolved from the same checkpoint name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
)


In [4]:
# ------------------------------------------
# Preprocessing
# ------------------------------------------
# Token budgets: articles are truncated to 512 tokens, headlines to 64.
max_input_length = 512
max_target_length = 64

# Task prefix prepended to every article, both here and at inference time.
prefix = "clickbait title: "

def preprocess(examples):
    """Tokenize one batch of articles and headlines for seq2seq training.

    Args:
        examples: Batch mapping with an ``"article"`` list and a
            ``"headline"`` list (the shape ``Dataset.map(batched=True)``
            passes in).

    Returns:
        The tokenizer output for the prefixed articles, with the headline
        token ids added under ``"labels"``.
    """
    inputs = [prefix + article for article in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    # Headlines are targets, so send them through the tokenizer's target-side
    # path (`text_target`). Tokenizers that treat source and target differently
    # then still produce correct labels if the checkpoint above is swapped.
    labels = tokenizer(
        text_target=examples["headline"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize both splits in batches, dropping the original columns so that only
# the features returned by `preprocess` remain.
raw_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=raw_columns,
)


Map:   0%|          | 0/576 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [5]:
# ------------------------------------------
# Metrics for Evaluation
# ------------------------------------------
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    return {k: round(v, 4) for k, v in result.items()}


In [9]:
# ------------------------------------------
# Define Training Arguments
# ------------------------------------------
training_args = Seq2SeqTrainingArguments(
    output_dir="./clickbait_model",
    do_train=True,
    do_eval=True,
    eval_steps=500,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=10,
    push_to_hub=False,
)


In [10]:
# Collator that pads each batch dynamically for seq2seq training.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated on the trainer (it triggers the warning this
    # cell printed); `processing_class` is the replacement argument.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Seq2SeqTrainer(


Step,Training Loss
10,2.9349
20,2.7292
30,3.3706
40,2.6696
50,2.613
60,2.5135
70,2.3654
80,2.0993
90,2.0529
100,1.9352




TrainOutput(global_step=216, training_loss=2.195632497469584, metrics={'train_runtime': 197.6951, 'train_samples_per_second': 8.741, 'train_steps_per_second': 1.093, 'total_flos': 526812299919360.0, 'train_loss': 2.195632497469584, 'epoch': 3.0})

In [11]:
# ------------------------------------------
# Save Model and Tokenizer
# ------------------------------------------
# Write the fine-tuned model and the tokenizer files into the same directory.
save_dir = "./clickbait_model"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('./clickbait_model/tokenizer_config.json',
 './clickbait_model/special_tokens_map.json',
 './clickbait_model/vocab.json',
 './clickbait_model/merges.txt',
 './clickbait_model/added_tokens.json',
 './clickbait_model/tokenizer.json')

In [14]:
def generate_clickbait(article_text, device="cpu"):
    """Generate a headline for a single article with the fine-tuned model.

    Args:
        article_text: The article body as one string. It is prefixed with
            ``prefix`` and truncated to ``max_input_length`` tokens.
        device: Device to run generation on. Defaults to ``"cpu"``, which is
            what this function always used before the parameter existed.

    Returns:
        The decoded headline with special tokens removed.
    """
    input_text = prefix + article_text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_input_length, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    model.to(device)
    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled and the same article yields the same headline every time.
    model.eval()
    outputs = model.generate(**inputs, max_length=max_target_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [17]:
# Sample news article used to try out the fine-tuned model. The text is pasted
# verbatim, including a stray video-player notice picked up with the article.
article = """Country Road is fighting for its future largely thanks to a cost-cutting decision the company made more than 20 years ago, an expert says. The once-beloved Aussie brand is in clear trouble, with Country Road Group reporting in March its earnings were down almost 72 per cent at $14.2million for the last half of 2024.

One of its longstanding flagship stores at Sydney CBD's Queen Victoria Building has shut up shop, as has sister brand Trenery in Mosman, on Sydney's affluent lower north shore. Another CBD store in Sydney's Pitt Street Mall is expected to close when its lease expires in three years' time.

The video player is currently playing an ad. You can skip the ad in 5 sec with a mouse or keyboard
Queensland University of Technology marketing expert Gary Mortimer said Country Road had lost its iconic Australian lifestyle brand status since Woolworths Holdings took a controlling stake in the late 90s.

A cost-cutting move to manufacture offshore gradually eroded its 'Made in Australia' appeal and weakened the brand's authenticity, Professor Mortimer said.

'Since its launch in the mid-1970s, Country Road clothing was primarily made in Australia, specifically, the iconic chambray shirt which I and nearly every other young man had during that time was made here using Australian cotton,' he said.

'The company emphasised its commitment to Australian manufacturing during that time. 

'Much of that production has shifted to Bangladesh, China, India and Pakistan under new ownership, essentially losing the essence of what Country Road stood for.' """



In [18]:
# Echo the raw string; the repr makes the embedded newlines visible.
article

"Country Road is fighting for its future largely thanks to a cost-cutting decision the company made more than 20 years ago, an expert says. The once-beloved Aussie brand is in clear trouble, with Country Road Group reporting in March its earnings were down almost 72 per cent at $14.2million for the last half of 2024.\n\nOne of its longstanding flagship stores at Sydney CBD's Queen Victoria Building has shut up shop, as has sister brand Trenery in Mosman, on Sydney's affluent lower north shore. Another CBD store in Sydney's Pitt Street Mall is expected to close when its lease expires in three years' time.\n\nThe video player is currently playing an ad. You can skip the ad in 5 sec with a mouse or keyboard\nQueensland University of Technology marketing expert Gary Mortimer said Country Road had lost its iconic Australian lifestyle brand status since Woolworths Holdings took a controlling stake in the late 90s.\n\nA cost-cutting move to manufacture offshore gradually eroded its 'Made in A

In [19]:
# Run the fine-tuned model on the sample article and show the resulting headline.
generated_headline = generate_clickbait(article)
print(generated_headline)

The Country Road brand is in clear trouble thanks to cost-cutting decision made 20 years ago
