In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from torch.cuda.amp import autocast, GradScaler  # Mixed Precision Training

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set save directory
save_path = "D:/App/vscode1/news"
os.makedirs(save_path, exist_ok=True)  # Ensure directory exists

In [3]:
# Load Dataset
dataset = load_dataset("ag_news")

# Extract texts and labels
texts = [item["text"] for item in dataset["train"]]
labels = [item["label"] for item in dataset["train"]]


In [4]:
# Stratified Sampling for Small Dataset
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
for train_idx, test_idx in splitter.split(texts, labels):
    small_train_texts = [texts[i] for i in train_idx[:10000]]
    small_train_labels = [labels[i] for i in train_idx[:10000]]
    small_test_texts = [texts[i] for i in test_idx[:5000]]
    small_test_labels = [labels[i] for i in test_idx[:5000]]


In [11]:
model_name = "prajjwal1/bert-tiny"  # A small and fast alternative

from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Encode Labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(small_train_labels)
y_test = label_encoder.transform(small_test_labels)

In [12]:
# Tokenize Inputs
def tokenize(texts, tokenizer, max_length=512):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt")

train_encodings = tokenize(small_train_texts, tokenizer)
test_encodings = tokenize(small_test_texts, tokenizer)

In [15]:
# Convert to PyTorch Dataset
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, self.labels[idx]

train_dataset = NewsDataset(train_encodings, y_train)
test_dataset = NewsDataset(test_encodings, y_test)

In [16]:
# DataLoader
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [17]:
# Set Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-1

In [18]:
# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Training Loop
def train_model(model, train_loader, criterion, optimizer, epochs=3):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch, labels in train_loader:
            batch = {key: val.to(device) for key, val in batch.items()}
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(**batch).logits
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader)}")
        torch.save(model.state_dict(), f"D:\\App\\vscode1\\news\\bert_tiny_news_classifier_epoch{epoch+1}.pth")
        print(f"Model saved for epoch {epoch+1}")

# Train the Model
train_model(model, train_loader, criterion, optimizer)


Epoch 1, Loss: 1.1438976067323654
Model saved for epoch 1
Epoch 2, Loss: 0.6783305663651171
Model saved for epoch 2
Epoch 3, Loss: 0.46302506365715124
Model saved for epoch 3


In [19]:
# Save Final Model
torch.save(model.state_dict(), "D:\\App\\vscode1\\news\\bert_tiny_news_classifier_final.pth")
print("TinyBERT model trained and saved successfully!")

TinyBERT model trained and saved successfully!


In [20]:
from sklearn.metrics import classification_report

# Evaluate Model
model.eval()
predictions, true_labels = [], []

for batch, labels in test_loader:
    batch = {key: val.to(device) for key, val in batch.items()}
    labels = labels.to(device)
    with torch.no_grad():
        outputs = model(**batch).logits
    preds = torch.argmax(outputs, dim=1)
    predictions.extend(preds.cpu().numpy())
    true_labels.extend(labels.cpu().numpy())

# Print Classification Report
print(classification_report(true_labels, predictions))


              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1267
           1       0.95      0.97      0.96      1240
           2       0.85      0.80      0.83      1252
           3       0.82      0.90      0.86      1241

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [None]:
#  Define Label Mapping
label_mapping = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Science/Tech"
}

#  Prediction Function
def predict_label(model, tokenizer, text):
    model.eval()  # Ensure evaluation mode
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)
    
    with torch.no_grad():
        output = model(**inputs).logits
        pred_label = torch.argmax(output, dim=1).cpu().item()
    
    return label_mapping[pred_label]  # Map numeric label to category

#  Example Usage
sample_texts = [
    "Stock markets are experiencing a huge crash this week.",
    "The Mars rover has discovered signs of ancient life.",
    "Argentina wins FIFA World Cup 2026 after an intense match!",
    "New AI technology is revolutionizing the tech industry."
]

for text in sample_texts:
    predicted_label = predict_label(model, tokenizer, text)
    print(f'📌 Input: {text}\n🔹 Predicted Class: {predicted_label}\n')


📌 Input: Stock markets are experiencing a huge crash this week.
🔹 Predicted Class: Business

📌 Input: The Mars rover has discovered signs of ancient life.
🔹 Predicted Class: Science/Tech

📌 Input: Argentina wins FIFA World Cup 2026 after an intense match!
🔹 Predicted Class: Sports

📌 Input: New AI technology is revolutionizing the tech industry.
🔹 Predicted Class: Science/Tech

