Importing required dependency

In [3]:
import os
import json
import torch
import numpy as np
import wandb
import psutil
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from dataclasses import dataclass
from typing import Optional, List, Dict, Tuple
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, AutoModel, get_linear_schedule_with_warmup
)
from datasets import load_dataset
import torch.nn as nn
from torch.optim import AdamW

Define Configurations

In [4]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class Config:
    model_name: str = "bert-base-uncased"
    num_labels: int = 4
    max_length: int = 128
    train_batch_size: int = 64
    eval_batch_size: int = 32
    learning_rate: float = 2e-5
    weight_decay: float = 0.01
    num_epochs: int = 4
    warmup_steps: int = 500
    gradient_accumulation_steps: int = 2
    train_size: float = 0.8
    random_seed: int = 42
    num_workers: int = 4
    data_dir: str = "data"
    model_dir: str = "models"
    output_dir: str = "outputs"
    patience: int = 3
    min_delta: float = 1e-4
    device: Optional[str] = None
    log_interval: int = 100
    eval_interval: int = 500
    fp16: bool = False
    max_grad_norm: float = 1.0

config = Config()


Data processing

In [5]:
def clean_text(text: str) -> str:
    return text.strip().replace("\\n", " ").replace("\\", "")

In [6]:
class AGNewsDataset(Dataset):
    def __init__(self, texts: List[str], labels: List[int], tokenizer, max_length: int):
        self.texts = [clean_text(t) for t in texts]
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def load_and_process_data(config) -> Tuple[DataLoader, DataLoader, DataLoader]:
    dataset = load_dataset("ag_news")
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    train_val = dataset["train"].train_test_split(test_size=0.2, seed=config.random_seed)
    test = dataset["test"]

    train_set = AGNewsDataset(train_val["train"]["text"], train_val["train"]["label"], tokenizer, config.max_length)
    val_set = AGNewsDataset(train_val["test"]["text"], train_val["test"]["label"], tokenizer, config.max_length)
    test_set = AGNewsDataset(test["text"], test["label"], tokenizer, config.max_length)

    return (
        DataLoader(train_set, batch_size=config.train_batch_size, shuffle=True),
        DataLoader(val_set, batch_size=config.eval_batch_size),
        DataLoader(test_set, batch_size=config.eval_batch_size)
    )


Define Model

In [7]:
class BERTClassifier(nn.Module):
    def __init__(self, model_name: str, num_labels: int):
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])
        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}

    def save_pretrained(self, path):
        self.bert.save_pretrained(path)
        torch.save(self.classifier.state_dict(), f"{path}/classifier.pt")

    @classmethod
    def from_pretrained(cls, path, num_labels):
        model = cls(path, num_labels)
        model.classifier.load_state_dict(torch.load(f"{path}/classifier.pt"))
        return model

Define training architecture

In [8]:
def train(config, model_name):
    wandb.init(project="ag-news-bert-finetune", config=config.__dict__)
    config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.manual_seed(config.random_seed)
    np.random.seed(config.random_seed)

    train_loader, val_loader, test_loader = load_and_process_data(config)
    model = BERTClassifier(model_name, config.num_labels).to(config.device)

    optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    total_steps = len(train_loader) * config.num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, config.warmup_steps, total_steps)

    best_val_loss = float('inf')
    best_test_loss = float('inf')  
    patience_counter = 0
    train_losses, val_losses = [], []

    for epoch in range(config.num_epochs):
        model.train()
        total_train_loss, train_steps = 0, 0
        for step, batch in enumerate(tqdm(train_loader, desc=f"Epoch {epoch+1}")):
            batch = {k: v.to(config.device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs['loss'] / config.gradient_accumulation_steps
            loss.backward()

            if (step + 1) % config.gradient_accumulation_steps == 0:
                nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_train_loss += loss.item() * config.gradient_accumulation_steps
            train_steps += 1

            if step % config.log_interval == 0:
                wandb.log({
                    "train_loss": loss.item(),
                    "learning_rate": scheduler.get_last_lr()[0],
                    "memory_usage_mb": get_memory_usage()['rss']
                })

            if step % config.eval_interval == 0:
                test_loss, test_metrics = evaluate(model, test_loader, config)
                wandb.log({
                    "test_loss": test_loss,
                    "test_accuracy": test_metrics['accuracy'],
                    "test_f1": test_metrics['f1']
                })
                if test_loss < best_test_loss:
                    best_test_loss = test_loss
                    os.makedirs(config.model_dir, exist_ok=True)
                    model.save_pretrained(f"{config.model_dir}/best_model")

        avg_train_loss = total_train_loss / train_steps
        train_losses.append(avg_train_loss)

        val_loss, val_metrics = evaluate(model, val_loader, config)
        val_losses.append(val_loss)
        
        wandb.log({"epoch": epoch + 1, "train_loss_epoch": avg_train_loss, "val_loss": val_loss, **val_metrics})

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            os.makedirs(config.model_dir, exist_ok=True)
            model.save_pretrained(f"{config.model_dir}/best_model")
        else:
            patience_counter += 1
            if patience_counter >= config.patience:
                print("Early stopping.")
                break

    return model, train_losses, val_losses, test_loader



Evaluation

In [9]:
def compute_metrics(preds, labels):
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average='weighted'),
        "confusion_matrix": confusion_matrix(labels, preds)
    }
def get_memory_usage():
    process = psutil.Process(os.getpid())
    return {'rss': process.memory_info().rss // (1024 * 1024)}
    
def evaluate(model, dataloader, config):
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(config.device) for k, v in batch.items()}
            outputs = model(**batch)
            total_loss += outputs['loss'].item()
            preds = torch.argmax(outputs['logits'], dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(batch['labels'].cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    metrics = compute_metrics(all_preds, all_labels)
    return avg_loss, metrics


In [10]:
def final_evaluation(config):
    model = BERTClassifier.from_pretrained(f"{config.model_dir}/best_model", config.num_labels).to(config.device)
    _, _, test_loader = load_and_process_data(config)
    test_loss, test_metrics = evaluate(model, test_loader, config)

    print("Test Loss:", test_loss)
    print("Test Accuracy:", test_metrics['accuracy'])
    print("Test F1:", test_metrics['f1'])

In [11]:
import wandb

wandb.login(key="ab2237bc8b2d7af0b2465cbd0eb84a45a2ebc941")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mtavdevinit44[0m ([33mtavdevinit44-thinkbiz-technology-pvt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

Start Training

In [12]:
# Train the model
trained_model, train_losses, val_losses, test_loader = train(config,model_name="bert-base-uncased")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

2025-04-16 06:20:43.554807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744784443.741557      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744784443.799098      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 1500/1500 [34:18<00:00,  1.37s/it]  
Epoch 2: 100%|██████████| 1500/1500 [34:26<00:00,  1.38s/it]  
Epoch 3: 100%|██████████| 1500/1500 [34:25<00:00,  1.38s/it]  
Epoch 4: 100%|██████████| 1500/1500 [34:23<00:00,  1.38s/it]  


Train Distilbert for comparision

In [None]:
# trained_model1, train_losses1, val_losses1, test_loader1 = train(config,model_name="distilbert-base-uncased")

Final evaluation output

In [13]:
# Final evaluation
final_evaluation(config)

  model.classifier.load_state_dict(torch.load(f"{path}/classifier.pt"))


Test Loss: 0.16443919879579996
Test Accuracy: 0.9426315789473684
Test F1: 0.9426630045475902
