This code fine-tunes a pretrained BERT (bert-base-uncased) model to classify news headlines into four categories (World, Sports, Business, Sci/Tech) using the AG News dataset.

First, the AG News dataset is loaded and each news text is tokenized using the BERT tokenizer. The text is converted into input IDs and attention masks with padding and truncation so it can be processed by the model.

Next, the BERT model is fine-tuned using transfer learning. A classification head is added on top of BERT, and the model is trained on labeled news data using the Hugging Face Trainer API.

During training, the model is evaluated after every epoch using accuracy (the fraction of examples classified correctly) and the weighted F1-score (the per-class F1 scores averaged with weights proportional to the number of examples in each class).

Finally, the trained model is saved and deployed using Streamlit, allowing users to enter a news headline and get a predicted topic in real time.

In [None]:
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Load the AG News dataset by its "ag_news" identifier.
dataset = load_dataset("ag_news")

# Show the dataset object first, then the first record of the "train" split,
# each on its own line.
print(dataset, dataset["train"][0], sep="\n")


In [None]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint; it is
# used below to turn raw text into model inputs.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


In [None]:
def tokenize_function(example):
    """Tokenize the ``"text"`` entry of ``example`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, a fixed length of 128
    tokens.

    Args:
        example: Mapping with a ``"text"`` entry. The caller maps this
            function with ``batched=True``, so this is presumably a batch of
            texts rather than a single string.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    fixed_length_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **fixed_length_options)

# Tokenize in batches, drop the raw "text" column once it has been encoded,
# and rename "label" to "labels".
tokenized_datasets = (
    dataset.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label", "labels")
)

# Switch the output format to PyTorch (done in place).
tokenized_datasets.set_format("torch")


In [None]:
# Load the "bert-base-uncased" checkpoint as a sequence classifier with four
# output labels, one per AG News category (World, Sports, Business, Sci/Tech).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4
)


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 from one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are assumed to be
            a 2-D array of per-class scores (one row per example); the
            predicted class is the argmax along axis 1.

    Returns:
        A dict with keys ``"accuracy"`` and ``"f1"``; the F1 score uses
        ``average="weighted"``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=1)

    return {
        "accuracy": accuracy_score(labels, predicted_classes),
        "f1": f1_score(labels, predicted_classes, average="weighted"),
    }


In [None]:
training_args = TrainingArguments(
    # Output locations for checkpoints and logs.
    output_dir="./bert-ag-news",
    logging_dir="./logs",
    # Optimisation hyperparameters.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best "f1" metric when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)


In [None]:
# Bundle the model, training configuration, tokenized splits, tokenizer and
# metric function into a single Trainer.
# NOTE(review): the "test" split is used as the evaluation set, and
# training_args reloads the best checkpoint by "f1" on that same split, so the
# reported test metrics also drive model selection. Consider holding out a
# separate validation split.
# NOTE(review): newer transformers releases appear to prefer
# `processing_class=` over `tokenizer=`; confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


In [None]:
# Run fine-tuning with the configuration given to the Trainer above.
trainer.train()


In [None]:
# Evaluate on the Trainer's eval_dataset (the "test" split) and print the
# returned metrics.
results = trainer.evaluate()
print(results)


In [None]:
# Save the model and the tokenizer into the same directory so they can be
# reloaded together; the Streamlit cell loads both from "news_bert_model".
model.save_pretrained("news_bert_model")
tokenizer.save_pretrained("news_bert_model")


In [None]:
# NOTE(review): this cell repeats the evaluation cell above. Running it
# performs another evaluation pass over the eval_dataset; it looks like an
# accidental duplicate.
results = trainer.evaluate()
print(results)



In [None]:
# NOTE(review): this cell repeats the save cell above and writes to the same
# "news_bert_model" directory again; it looks like an accidental duplicate.
model.save_pretrained("news_bert_model")
tokenizer.save_pretrained("news_bert_model")


In [None]:
import streamlit as st
from transformers import pipeline

# Streamlit front end for the fine-tuned AG News classifier.
# NOTE(review): Streamlit apps are normally started with
# `streamlit run <script>.py`, so this cell probably belongs in its own file.

# The original literal was a mis-decoded (mojibake) newspaper emoji.
st.title("📰 News Topic Classifier")


@st.cache_resource
def load_classifier():
    """Build the text-classification pipeline from the saved model directory.

    Streamlit re-executes the whole script on every widget interaction.
    Caching the pipeline as a resource avoids reloading the model weights
    from disk on each rerun.

    Returns:
        The pipeline loaded from the "news_bert_model" directory.
    """
    return pipeline(
        "text-classification",
        model="news_bert_model",
        tokenizer="news_bert_model",
    )


classifier = load_classifier()

# The model was saved without custom label names, so the pipeline reports
# generic "LABEL_<id>" strings; map them to the AG News category names.
label_map = {
    "LABEL_0": "World",
    "LABEL_1": "Sports",
    "LABEL_2": "Business",
    "LABEL_3": "Sci/Tech",
}

text = st.text_input("Enter news headline")

if st.button("Predict"):
    headline = text.strip()
    if not headline:
        # Do not classify an empty string; ask for input instead.
        st.warning("Please enter a news headline first.")
    else:
        # truncation=True keeps very long input within the model's maximum
        # sequence length instead of raising at inference time.
        result = classifier(headline, truncation=True)[0]
        # Fall back to the raw label if the model reports an unexpected name.
        category = label_map.get(result["label"], result["label"])
        st.success(f"Category: {category}")
        st.write(f"Confidence: {result['score']:.2f}")
