In [46]:
import torch.nn as nn
import torchvision.models as models

class ResNet50(nn.Module):
    """ResNet-50 classifier whose final layer is resized to ``num_classes`` outputs.

    The torchvision backbone is created with ``weights=None``, i.e. no
    pretrained weights are requested.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.resnet50(weights=None)
        # Swap the stock classification head for one with the requested number
        # of outputs, keeping the input width the backbone already feeds into it.
        head_inputs = backbone.fc.in_features
        backbone.fc = nn.Linear(head_inputs, num_classes)
        # The attribute stays named ``model`` so state-dict keys are unchanged.
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped torchvision model and return its output."""
        logits = self.model(x)
        return logits

In [47]:
class Params:
    """Hyperparameters shared by the LR finder and the training run.

    Instances compare equal when all of their attributes are equal, which is
    what the resume logic uses to check that a checkpoint was produced with
    the same settings.
    """

    def __init__(self):
        self.batch_size = 32            # samples per training batch (DataLoader batch_size)
        self.name = "resnet_50_sgd1"    # run name; used for the checkpoint and TensorBoard directories
        self.workers = 4                # DataLoader worker processes
        self.lr = 0.1                   # initial SGD learning rate
        self.momentum = 0.9             # SGD momentum
        self.weight_decay = 1e-4        # SGD weight decay
        self.lr_step_size = 30          # StepLR: epochs between learning-rate decays
        self.lr_gamma = 0.1             # StepLR: multiplicative decay factor

    def __repr__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(self.__dict__)

    def __eq__(self, other):
        """Compare attribute-by-attribute with another ``Params``.

        Returns ``NotImplemented`` for any other type so that comparisons such
        as ``params == None`` evaluate to ``False`` instead of raising
        ``AttributeError`` on the missing ``__dict__``.
        """
        if not isinstance(other, Params):
            return NotImplemented
        return self.__dict__ == other.__dict__

In [None]:
from torch_lr_finder import LRFinder
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from datetime import datetime
import os

def find_lr(start_lr=1e-7, end_lr=10, num_iter=100, output_dir='lr_finder_plots'):
    """Run a learning-rate range test on ResNet-50 and save the resulting plot.

    Builds the training ``ImageFolder`` loader from the hard-coded ImageNet
    training directory, sweeps the SGD learning rate exponentially from
    ``start_lr`` to ``end_lr`` over ``num_iter`` iterations with
    ``torch_lr_finder.LRFinder``, and writes the loss-vs-LR plot as a PNG.

    Args:
        start_lr: Learning rate at the start of the sweep.
        end_lr: Learning rate at the end of the sweep.
        num_iter: Number of iterations in the sweep.
        output_dir: Directory the PNG is written to (created if missing).

    Returns:
        None. The path of the saved plot is printed.
    """
    params = Params()
    print(f"Find LR with params: Start_lr: {start_lr}, End_lr: {end_lr}, Num_iter: {num_iter}")
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")

    training_folder_name = '/content/data/imagenet/train'
    train_transformation = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomResizedCrop(224, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
        transforms.RandomHorizontalFlip(0.5),
        # Standard ImageNet channel statistics; the green mean is 0.456
        # (it was previously mistyped as 0.485).
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=training_folder_name,
        transform=train_transformation
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.workers,
        pin_memory=True
    )

    model = ResNet50(num_classes=len(train_dataset.classes)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=start_lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    try:
        lr_finder.range_test(train_loader, start_lr=start_lr, end_lr=end_lr, num_iter=num_iter, step_mode="exp")

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Generate filename with timestamp and parameters
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'lr_finder_{timestamp}_start{start_lr}_end{end_lr}_iter{num_iter}.png'
        filepath = os.path.join(output_dir, filename)

        # Plot and save on an explicit figure so exactly that figure is closed,
        # even if plotting or saving fails.
        fig, ax = plt.subplots()
        try:
            lr_finder.plot(ax=ax)
            ax.set_title(f'Learning Rate Finder (iter: {num_iter})')
            fig.savefig(filepath, dpi=300, bbox_inches='tight')
        finally:
            plt.close(fig)

        print(f"Plot saved to: {filepath}")
    finally:
        # Always reset the finder, including when the sweep or plotting raises.
        lr_finder.reset()

In [49]:
# Sweep the learning rate from 1e-5 to 10 over 100 iterations; the plot is
# written to find_lr's default output directory (lr_finder_plots).
find_lr(start_lr=1e-5, end_lr=10, num_iter=100)

Find LR with params: Start_lr: 1e-05, End_lr: 10, Num_iter: 100
Using cuda device


  0%|          | 0/100 [00:00<?, ?it/s]

Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 6.14E-01
Plot saved to: lr_finder_plots/lr_finder_20251016_172027_start1e-05_end10_iter100.png


In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import time
from math import sqrt

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Set Hyperparameters
class Params:
    """Hyperparameters for the training run.

    Instances compare equal when all of their attributes are equal; the resume
    logic relies on this to check that a checkpoint was produced with the same
    settings.
    """

    def __init__(self):
        self.batch_size = 32            # samples per training batch (DataLoader batch_size)
        self.name = "resnet_50_sgd1"    # run name; used for the checkpoint and TensorBoard directories
        self.workers = 4                # DataLoader worker processes
        self.lr = 0.1                   # initial SGD learning rate
        self.momentum = 0.9             # SGD momentum
        self.weight_decay = 1e-4        # SGD weight decay
        self.lr_step_size = 30          # StepLR: epochs between learning-rate decays
        self.lr_gamma = 0.1             # StepLR: multiplicative decay factor

    def __repr__(self):
        """Return the attribute dictionary rendered as a string."""
        return str(self.__dict__)

    def __eq__(self, other):
        """Compare attribute-by-attribute with another ``Params``.

        Returns ``NotImplemented`` for any other type so that comparisons such
        as ``params == None`` evaluate to ``False`` instead of raising
        ``AttributeError`` on the missing ``__dict__``.
        """
        if not isinstance(other, Params):
            return NotImplemented
        return self.__dict__ == other.__dict__


# Verbose train and test functions that report progress with tqdm
from tqdm import tqdm  # Generic tqdm progress bar (the notebook-widget variant lives in tqdm.notebook)
import logging
import time

# Configure logging for Jupyter. force=True removes any handlers already
# attached to the root logger, so re-running this cell re-applies the format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

def train(dataloader, model, loss_fn, optimizer, epoch, writer):
    """Train ``model`` for one epoch over ``dataloader``.

    Args:
        dataloader: Loader yielding ``(inputs, targets)`` batches.
        model: Module to optimise; batches are moved to the device its
            parameters live on.
        loss_fn: Callable computing the loss from ``(predictions, targets)``.
        optimizer: Optimizer updating ``model``'s parameters.
        epoch: Zero-based epoch index (displayed as ``epoch + 1``).
        writer: TensorBoard ``SummaryWriter``-like object, or ``None`` to skip
            scalar logging.

    Every 100 batches the current loss is shown on the progress bar, logged,
    and (if ``writer`` is given) written as ``'training loss'`` at a global
    step equal to the number of samples processed so far across epochs.
    """
    size = len(dataloader.dataset)
    # Use the model's own device instead of relying on a notebook-level
    # ``device`` global that may not be defined when this function is called.
    device = next(model.parameters()).device
    model.train()
    start0 = time.time()
    samples_seen = 0  # running count; stays correct when the last batch is short

    # Use tqdm for progress visualization
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}")

    for batch, (X, y) in progress_bar:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation. Gradients are cleared first so anything left over
        # from earlier use of the model cannot leak into this step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        samples_seen += len(X)

        # Update tqdm description and writer
        if batch % 100 == 0:
            current_loss = loss.item()
            progress_bar.set_postfix({"loss": current_loss, "progress": f"{samples_seen}/{size}"})
            if writer is not None:
                writer.add_scalar('training loss', current_loss, epoch * size + samples_seen)
            logger.info(f"Batch {batch+1}: loss={current_loss:.6f}, progress={samples_seen}/{size}")

    epoch_time = time.time() - start0
    logger.info(f"Epoch {epoch+1} completed in {epoch_time:.2f} seconds")


def test(dataloader, model, loss_fn, epoch, writer, train_dataloader, calc_acc5=False):
    """Evaluate ``model`` on ``dataloader`` and log loss and accuracy.

    Args:
        dataloader: Loader yielding ``(inputs, targets)`` evaluation batches.
        model: Module to evaluate; batches are moved to the device its
            parameters live on.
        loss_fn: Callable computing the loss from ``(predictions, targets)``.
        epoch: Epoch value used for the log label (``epoch + 1``) and for the
            TensorBoard step (``epoch * len(train_dataloader.dataset)``).
        writer: TensorBoard ``SummaryWriter``-like object, or ``None`` to skip
            scalar logging.
        train_dataloader: Training loader; only its dataset length is used, to
            put test scalars on the same sample-count axis as training.
        calc_acc5: When true, also compute and log top-5 accuracy.

    The average loss is the mean of per-batch losses. If the loader or dataset
    is empty the corresponding metric is reported as NaN instead of raising
    ``ZeroDivisionError``.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    # Use the model's own device instead of relying on a notebook-level
    # ``device`` global that may not be defined when this function is called.
    device = next(model.parameters()).device
    model.eval()
    test_loss, correct, correct_top5 = 0, 0, 0

    # Use tqdm for progress visualization
    progress_bar = tqdm(dataloader, desc=f"Testing Epoch {epoch+1}")

    with torch.no_grad():
        for X, y in progress_bar:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).sum().item()

            if calc_acc5:
                # Clamp k so models with fewer than five classes do not crash topk.
                k = min(5, pred.size(1))
                pred_top5 = pred.topk(k, 1, largest=True, sorted=True).indices
                correct_top5 += (pred_top5 == y.view(-1, 1)).sum().item()

    test_loss = test_loss / num_batches if num_batches else float("nan")
    accuracy = 100 * correct / size if size else float("nan")
    if calc_acc5:
        top5_accuracy = 100 * correct_top5 / size if size else float("nan")
    else:
        top5_accuracy = None

    step = epoch * len(train_dataloader.dataset)
    if writer is not None:
        writer.add_scalar('test loss', test_loss, step)
        writer.add_scalar('test accuracy', accuracy, step)
        if calc_acc5:
            writer.add_scalar('test accuracy5', top5_accuracy, step)

    logger.info(f"Test Results - Epoch {epoch+1}: Accuracy={accuracy:.2f}%, Avg loss={test_loss:.6f}")
    if calc_acc5:
        logger.info(f"Top-5 Accuracy={top5_accuracy:.2f}%")

if __name__ == "__main__":
    # End-to-end training script: build the ImageNet loaders, create the model,
    # optionally resume from the latest checkpoint, then train and evaluate,
    # writing a checkpoint after every epoch.
    params = Params()
    print(params, params.batch_size)

    training_folder_name = '/content/data/imagenet/train'
    val_folder_name = '/content/data/imagenet/val'

    # Standard ImageNet per-channel statistics. The green mean is 0.456; it was
    # previously mistyped as 0.485 in both the train and validation transforms.
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    train_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.RandomResizedCrop(224, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
            transforms.RandomHorizontalFlip(0.5),
            # Normalize the pixel values (in R, G, and B channels)
            transforms.Normalize(mean=imagenet_mean, std=imagenet_std)
        ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=training_folder_name,
        transform=train_transformation
    )
    train_sampler = torch.utils.data.RandomSampler(train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=params.batch_size,
        sampler=train_sampler,
        num_workers = params.workers,
        pin_memory=True,
    )

    val_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize(size=256, antialias=True),
            transforms.CenterCrop(224),
            # Normalize the pixel values (in R, G, and B channels)
            transforms.Normalize(mean=imagenet_mean, std=imagenet_std)
        ])
    val_dataset = torchvision.datasets.ImageFolder(
        root=val_folder_name,
        transform=val_transformation
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=32,
        num_workers=params.workers,
        shuffle=False,
        pin_memory=True
    )

    # device
    print("Libraries imported - ready to use PyTorch", torch.__version__)
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")

    ## Testing with pre-trained model : only to be done once
    ## testing a pretrained model to validate correctness of our dataset, transform and metrics code
    # pretrained_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT').to(device)
    # start = time.time()
    # loss_fn = nn.CrossEntropyLoss()
    # test(val_loader, pretrained_model, loss_fn, epoch=0, writer=None, train_dataloader=train_loader, calc_acc5=True)
    # print("Elapsed: ", time.time() - start)

    # resume training options
    resume_training = True

    num_classes = len(train_dataset.classes)
    model = ResNet50(num_classes=num_classes)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.lr_step_size, gamma=params.lr_gamma)

    ## Current State of Training
    start_epoch = 0
    checkpoint_path = os.path.join("checkpoints", params.name, "checkpoint.pth")
    print(checkpoint_path)
    if resume_training and os.path.exists(checkpoint_path):
        print("Resuming training from checkpoint")
        print(checkpoint_path)
        # The checkpoint stores a pickled Params instance, which the
        # weights_only=True default of recent PyTorch releases refuses to load.
        # NOTE: weights_only=False runs the full unpickler -- only load
        # checkpoints this notebook produced itself, never untrusted files.
        # map_location lets a checkpoint saved on one device resume on another.
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        model.load_state_dict(checkpoint["model"])
        start_epoch = checkpoint["epoch"] + 1
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        assert params == checkpoint["params"]

    from torch.utils.tensorboard import SummaryWriter
    from pathlib import Path
    Path(os.path.join("checkpoints", params.name)).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter('runs/' + params.name)
    num_epochs = 10
    try:
        # Baseline evaluation. Logged at start_epoch so that, when resuming, the
        # point lands where training left off instead of at step 0.
        test(val_loader, model, loss_fn, epoch=start_epoch, writer=writer, train_dataloader=train_loader, calc_acc5=True)
        print("Starting training")
        for epoch in range(start_epoch, num_epochs):
            print(f"Epoch {epoch}")
            train(train_loader, model, loss_fn, optimizer, epoch=epoch, writer=writer)
            # Advance the schedule before checkpointing so the saved optimizer and
            # scheduler state already describe the next epoch; otherwise a resumed
            # run lags the schedule by one epoch.
            lr_scheduler.step()
            checkpoint = {
                "model": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "epoch": epoch,
                "params": params
            }
            torch.save(checkpoint, os.path.join("checkpoints", params.name, f"model_{epoch}.pth"))
            torch.save(checkpoint, os.path.join("checkpoints", params.name, "checkpoint.pth"))
            test(val_loader, model, loss_fn, epoch + 1, writer, train_dataloader=train_loader, calc_acc5=True)
    finally:
        # Flush and close the TensorBoard event file even if the run is interrupted.
        writer.close()

{'batch_size': 32, 'name': 'resnet_50_sgd1', 'workers': 4, 'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001, 'lr_step_size': 30, 'lr_gamma': 0.1} 32
Libraries imported - ready to use PyTorch 2.8.0+cu126
Using cuda device
checkpoints/resnet_50_sgd1/checkpoint.pth


Testing Epoch 1: 100%|██████████| 123/123 [00:36<00:00,  3.33it/s]
2025-10-16 17:24:57,093 - INFO - Test Results - Epoch 1: Accuracy=0.10%, Avg loss=42.227212
2025-10-16 17:24:57,094 - INFO - Top-5 Accuracy=0.48%


Starting training
Epoch 0


Epoch 1:   0%|          | 0/1086 [00:01<?, ?it/s, loss=7, progress=32/34745]2025-10-16 17:24:58,440 - INFO - Batch 1: loss=6.999109, progress=32/34745
Epoch 1:   9%|▉         | 100/1086 [00:33<05:35,  2.94it/s, loss=6.89, progress=3232/34745]2025-10-16 17:25:30,946 - INFO - Batch 101: loss=6.892321, progress=3232/34745
Epoch 1:  18%|█▊        | 200/1086 [01:06<04:53,  3.02it/s, loss=6.87, progress=6432/34745]2025-10-16 17:26:04,050 - INFO - Batch 201: loss=6.865868, progress=6432/34745
Epoch 1:  28%|██▊       | 300/1086 [01:40<04:33,  2.88it/s, loss=6.87, progress=9632/34745]2025-10-16 17:26:37,504 - INFO - Batch 301: loss=6.869095, progress=9632/34745
Epoch 1:  37%|███▋      | 400/1086 [02:14<03:53,  2.94it/s, loss=6.87, progress=12832/34745]2025-10-16 17:27:11,384 - INFO - Batch 401: loss=6.865733, progress=12832/34745
Epoch 1:  46%|████▌     | 500/1086 [02:48<03:15,  3.00it/s, loss=6.85, progress=16032/34745]2025-10-16 17:27:45,588 - INFO - Batch 501: loss=6.850127, progress=16032/3

Epoch 1


Epoch 2:   0%|          | 0/1086 [00:01<?, ?it/s, loss=6.73, progress=32/34745]2025-10-16 17:31:50,370 - INFO - Batch 1: loss=6.727271, progress=32/34745
Epoch 2:   9%|▉         | 100/1086 [00:36<05:38,  2.91it/s, loss=6.94, progress=3232/34745]2025-10-16 17:32:25,411 - INFO - Batch 101: loss=6.936485, progress=3232/34745
Epoch 2:  18%|█▊        | 200/1086 [01:11<05:08,  2.87it/s, loss=6.8, progress=6432/34745] 2025-10-16 17:33:00,285 - INFO - Batch 201: loss=6.803489, progress=6432/34745
Epoch 2:  28%|██▊       | 300/1086 [01:46<04:33,  2.87it/s, loss=6.92, progress=9632/34745]2025-10-16 17:33:35,387 - INFO - Batch 301: loss=6.921869, progress=9632/34745
Epoch 2:  37%|███▋      | 400/1086 [02:21<03:56,  2.90it/s, loss=6.78, progress=12832/34745]2025-10-16 17:34:10,580 - INFO - Batch 401: loss=6.779819, progress=12832/34745
Epoch 2:  46%|████▌     | 500/1086 [02:56<03:31,  2.77it/s, loss=6.77, progress=16032/34745]2025-10-16 17:34:45,698 - INFO - Batch 501: loss=6.768330, progress=1603

KeyboardInterrupt: 