<a href="https://colab.research.google.com/github/umerakbar013/News-Topic-Classifier-Using-BERT-/blob/main/News_Topic_Classifier_Using_BERT_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets evaluate gradio -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
import pandas as pd

# Load your CSV files, skipping the header row
train_df = pd.read_csv("/content/train.csv", header=None, names=["label", "title", "description"], skiprows=1)
test_df = pd.read_csv("/content/test.csv", header=None, names=["label", "title", "description"], skiprows=1)

# Combine title and description into one text field
train_df["text"] = train_df["title"] + " " + train_df["description"]
test_df["text"] = test_df["title"] + " " + test_df["description"]

# Convert labels to int and subtract 1 (BERT expects 0-indexed labels)
train_df["label"] = pd.to_numeric(train_df["label"]) - 1
test_df["label"] = pd.to_numeric(test_df["label"]) - 1

# Reduce dataset for quick training
train_df = train_df.sample(n=1000, random_state=42).reset_index(drop=True)
test_df = test_df.sample(n=500, random_state=42).reset_index(drop=True)


In [16]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class NewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=128)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = NewsDataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = NewsDataset(test_df["text"].tolist(), test_df["label"].tolist())


In [17]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted"),
    }

    training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,   # Only 1 epoch for fast training
    weight_decay=0.01,
    logging_steps=10,
    save_strategy="no"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.552931,0.882,0.88123
2,No log,0.41277,0.884,0.8833


TrainOutput(global_step=126, training_loss=0.6925161452520461, metrics={'train_runtime': 3387.9944, 'train_samples_per_second': 0.59, 'train_steps_per_second': 0.037, 'total_flos': 131557890048000.0, 'train_loss': 0.6925161452520461, 'epoch': 2.0})

In [18]:
eval_results = trainer.evaluate()
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"F1 Score: {eval_results['eval_f1']:.4f}")


Accuracy: 0.8840
F1 Score: 0.8833


In [19]:
import gradio as gr

labels = ["World", "Sports", "Business", "Sci/Tech"]

def predict_news(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()
    return labels[predicted_label]

gr.Interface(fn=predict_news, inputs="text", outputs="text", title="AG News Classifier").launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f57a5ae8f2e8358b8b.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




SyntaxError: invalid syntax (ipython-input-20-1837532599.py, line 1)