In [12]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
import os

# 1. Load Data (AG News)
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

# Preprocessing: the CSV labels run 1-4; subtract 1 so they become 0-3,
# matching the num_labels=4 classifier configured below.
train_df['Class Index'] = train_df['Class Index'].sub(1)
test_df['Class Index'] = test_df['Class Index'].sub(1)

# 2. Tokenization
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(texts, max_length=128):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    Args:
        texts: The texts to encode (this notebook passes a list of strings).
        max_length: Length every sequence is padded and truncated to.
            Defaults to 128, the value that used to be hard-coded here.

    Returns:
        The object returned by calling ``tokenizer`` on ``texts``.
    """
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length)

# Only the 'Title' column is tokenized (not the article body) to keep this
# university-project run fast.
train_encodings, test_encodings = (
    tokenize_function(split_df['Title'].tolist())
    for split_df in (train_df, test_df)
)

# 3. Torch Dataset Class
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; items are read as ``encodings[key][idx]``.
        labels: Sequence of labels, one per example.

    Raises:
        ValueError: If any encoding field does not contain exactly
            ``len(labels)`` examples.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs: otherwise a short field raises
        # IndexError mid-training and a long one is silently truncated,
        # because __len__ is driven by the labels alone.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field {key!r} has {len(values)} examples "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

train_dataset = NewsDataset(
    encodings=train_encodings,
    labels=train_df['Class Index'].tolist(),
)
test_dataset = NewsDataset(
    encodings=test_encodings,
    labels=test_df['Class Index'].tolist(),
)

# 4. Model & Metric Definition
# num_labels=4 matches the 0-3 label range produced by the preprocessing step.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; argmax over the last axis
            gives the predicted class).

    Returns:
        Dict with keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# 5. Training configuration
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    eval_strategy="epoch",   # evaluate once per epoch
    save_strategy="epoch",   # checkpoint on the same schedule as evaluation
    report_to="none",        # no external reporting, which avoids the wandb login prompt
)

# 6. Trainer: evaluates on the test split and reports accuracy / F1 via compute_metrics
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# 7. Fine-tune, then store the model and the tokenizer in the same directory
#    so the demo apps can load both from one path.
trainer.train()
trainer.save_model("./news_classifier_model")
tokenizer.save_pretrained("./news_classifier_model")

print("Training finished! Model saved to ./news_classifier_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3165,0.294833,0.901974,0.901842


Training finished! Model saved to ./news_classifier_model


In [13]:
import torch

# 1. Define the Dataset class to handle BERT inputs
# NOTE(review): this re-declares the NewsDataset class already defined in the
# training cell; the later definition shadows the earlier one. Consider
# keeping a single definition.
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence of
            values; items are read as ``encodings[key][idx]``.
        labels: Sequence of labels, one per example.

    Raises:
        ValueError: If any encoding field does not contain exactly
            ``len(labels)`` examples.
    """

    def __init__(self, encodings, labels):
        # Fail fast on misaligned inputs: otherwise a short field raises
        # IndexError mid-training and a long one is silently truncated,
        # because __len__ is driven by the labels alone.
        expected = len(labels)
        for key, values in encodings.items():
            if len(values) != expected:
                raise ValueError(
                    f"Encoding field {key!r} has {len(values)} examples "
                    f"but {expected} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        # Convert each feature to a torch tensor
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# 2. Create the Train and Test objects
# Reuses train_encodings / test_encodings and the label columns prepared in
# the training cell, so that cell must have been run first.
train_dataset, test_dataset = (
    NewsDataset(split_encodings, split_df['Class Index'].tolist())
    for split_encodings, split_df in (
        (train_encodings, train_df),
        (test_encodings, test_df),
    )
)

# 3. Load BERT with a classification head sized for 4 labels
# The "newly initialized" warning printed here refers to the classifier
# weights, which are not part of the bert-base-uncased checkpoint.
# NOTE(review): this rebinds `model` to a freshly loaded, not fine-tuned
# network, replacing the object the training cell passed to the Trainer.
# The demo cells load from ./news_classifier_model on disk and are unaffected.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=4)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
%%writefile app.py
import streamlit as st
from transformers import pipeline
import os

# Page header. The emoji was previously stored as mis-decoded bytes (mojibake).
st.title("🗞️ News Topic Classifier (BERT)")
st.write("Deployed on Google Colab")

# Directory the training cell saves the fine-tuned model and tokenizer to.
model_path = "./news_classifier_model"

@st.cache_resource
def load_bert_model():
    """Build the text-classification pipeline from ``model_path``.

    Both the model and the tokenizer are loaded from the same directory.
    Decorated with ``st.cache_resource`` so the pipeline object is cached
    by Streamlit instead of being rebuilt on each call.
    """
    bert_pipeline = pipeline(
        "text-classification",
        model=model_path,
        tokenizer=model_path,
    )
    return bert_pipeline

# Without a saved model there is nothing to serve, so report that and stop.
if not os.path.exists(model_path):
    st.error("Model not found! Run the training cell first.")
else:
    classifier = load_bert_model()
    user_input = st.text_input("Enter Headline:")
    # Only classify once the user has typed something.
    if user_input:
        # Pipeline label ids -> category names
        labels = {
            "LABEL_0": "World",
            "LABEL_1": "Sports",
            "LABEL_2": "Business",
            "LABEL_3": "Sci/Tech",
        }
        top_prediction = classifier(user_input)[0]
        result = labels.get(top_prediction['label'], "Unknown")
        st.success(f"Category: {result}")

Writing app.py


In [23]:
!pip install gradio



In [25]:
import gradio as gr
from transformers import pipeline
import os

# 1. Path to your trained model
model_path = "./news_classifier_model"

# Mapping pipeline labels back to human-readable categories.
# Built once here instead of on every request.
LABEL_MAP = {
    "LABEL_0": "World News",
    "LABEL_1": "Sports News",
    "LABEL_2": "Business News",
    "LABEL_3": "Sci/Tech News",
}


def format_prediction(prediction):
    """Turn the pipeline output into the text shown in the UI.

    Args:
        prediction: Sequence whose first element is a dict with a 'label'
            and a 'score' entry.

    Returns:
        A two-line string with the category name ("Unknown" for a label
        missing from LABEL_MAP) and the score formatted as a percentage.
    """
    top = prediction[0]
    result_label = LABEL_MAP.get(top['label'], "Unknown")
    confidence = top['score']
    return f"Category: {result_label}\nConfidence: {confidence:.2%}"


# 3. Define the prediction function
def predict_news_topic(headline):
    """Classify one headline with the loaded ``classifier`` pipeline.

    Args:
        headline: Text entered by the user.

    Returns:
        The formatted category/confidence text, or a prompt asking for input
        when ``headline`` is empty or whitespace-only (the model is not
        called in that case).
    """
    if headline is None or not headline.strip():
        return "Please enter a news headline."
    return format_prediction(classifier(headline))


if not os.path.exists(model_path):
    print("Error: Model not found! Please make sure you ran the training code first.")
else:
    # 2. Load your BERT model using the pipeline
    classifier = pipeline("text-classification", model=model_path, tokenizer=model_path)

    # 4. Create the Gradio Interface
    interface = gr.Interface(
        fn=predict_news_topic,
        inputs=gr.Textbox(lines=2, placeholder="Enter a news headline here...", label="News Headline"),
        outputs=gr.Textbox(label="BERT Prediction"),
        title="🗞️ Huzaifa's News Topic Classifier",
        description="Type a headline to classify it using your fine-tuned BERT model."
    )

    # 5. Launch (share=True creates the public link automatically!)
    # Anyone with the link can reach the app while it is running.
    interface.launch(share=True)

Device set to use cuda:0


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://44faa2f664a83e741d.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
