In [46]:
import torch.nn as nn
import torchvision.models as models

class ResNet50(nn.Module):
    def __init__(self, num_classes):
        super(ResNet50, self).__init__()
        self.model = models.resnet50(weights=None)
        self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

    def forward(self, x):
        return self.model(x)

In [44]:
import os
import shutil

source_path = "/kaggle/input/imagenetmini-1000"
destination_path = "/content/imagenetmini-1000"

# Create the destination directory if it doesn't exist
os.makedirs(destination_path, exist_ok=True)

# Copy the contents of the source directory to the destination directory
# Use rsync for potentially faster copying of large directories
if os.path.exists(source_path):
  !rsync -a "{source_path}/" "{destination_path}/"
  print(f"Dataset copied from {source_path} to {destination_path}")
else:
  print(f"Source directory {source_path} does not exist.")

Dataset copied from /kaggle/input/imagenetmini-1000 to /content/imagenetmini-1000


In [47]:
class Params:
    def __init__(self):
        self.batch_size = 32
        self.name = "resnet_50_sgd1"
        self.workers = 4
        self.lr = 0.1
        self.momentum = 0.9
        self.weight_decay = 1e-4
        self.lr_step_size = 30
        self.lr_gamma = 0.1

    def __repr__(self):
        return str(self.__dict__)

    def __eq__(self, other):
        return self.__dict__ == other.__dict__

In [48]:
from torch_lr_finder import LRFinder
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from datetime import datetime
import os

def find_lr(start_lr=1e-7, end_lr=10, num_iter=100, output_dir='lr_finder_plots'):
    params = Params()
    print(f"Find LR with params: Start_lr: {start_lr}, End_lr: {end_lr}, Num_iter: {num_iter}")
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")

    training_folder_name = '/content/imagenetmini-1000/imagenet-mini/train'
    train_transformation = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomResizedCrop(224, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
        transforms.RandomHorizontalFlip(0.5),
        transforms.Normalize(mean=[0.485, 0.485, 0.406], std=[0.229, 0.224, 0.225])
    ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=training_folder_name,
        transform=train_transformation
    )

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=params.batch_size,
        shuffle=True,
        num_workers=params.workers,
        pin_memory=True
    )

    model = ResNet50(num_classes=len(train_dataset.classes)).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=start_lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    lr_finder.range_test(train_loader, start_lr=start_lr, end_lr=end_lr, num_iter=num_iter, step_mode="exp")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate filename with timestamp and parameters
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'lr_finder_{timestamp}_start{start_lr}_end{end_lr}_iter{num_iter}.png'
    filepath = os.path.join(output_dir, filename)

    # Plot and save
    fig, ax = plt.subplots()
    lr_finder.plot(ax=ax)
    plt.title(f'Learning Rate Finder (iter: {num_iter})')
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Plot saved to: {filepath}")
    lr_finder.reset()

In [49]:
find_lr(start_lr=1e-5, end_lr=10, num_iter=100)

Find LR with params: Start_lr: 1e-05, End_lr: 10, Num_iter: 100
Using cuda device


  0%|          | 0/100 [00:00<?, ?it/s]

Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 6.14E-01
Plot saved to: lr_finder_plots/lr_finder_20251016_172027_start1e-05_end10_iter100.png


In [51]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import time
from math import sqrt

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Set Hyperparameters
class Params:
    def __init__(self):
        self.batch_size = 32
        self.name = "resnet_50_sgd1"
        self.workers = 4
        self.lr = 0.1
        self.momentum = 0.9
        self.weight_decay = 1e-4
        self.lr_step_size = 30
        self.lr_gamma = 0.1

    def __repr__(self):
        return str(self.__dict__)

    def __eq__(self, other):
        return self.__dict__ == other.__dict__


#Updating with verbose tqdm train and test functions
from tqdm import tqdm  # For Jupyter-specific progress bar
import logging
import time

# Configure logging for Jupyter
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

def train(dataloader, model, loss_fn, optimizer, epoch, writer):
    size = len(dataloader.dataset)
    model.train()
    start0 = time.time()

    # Use tqdm for progress visualization
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}")

    for batch, (X, y) in progress_bar:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_size = len(X)
        step = epoch * size + (batch + 1) * batch_size

        # Update tqdm description and writer
        if batch % 100 == 0:
            current_loss = loss.item()
            progress_bar.set_postfix({"loss": current_loss, "progress": f"{(batch+1)*batch_size}/{size}"})
            if writer is not None:
                writer.add_scalar('training loss', current_loss, step)
            logger.info(f"Batch {batch+1}: loss={current_loss:.6f}, progress={(batch+1)*batch_size}/{size}")

    epoch_time = time.time() - start0
    logger.info(f"Epoch {epoch+1} completed in {epoch_time:.2f} seconds")


def test(dataloader, model, loss_fn, epoch, writer, train_dataloader, calc_acc5=False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, correct_top5 = 0, 0, 0

    # Use tqdm for progress visualization
    progress_bar = tqdm(dataloader, desc=f"Testing Epoch {epoch+1}")

    with torch.no_grad():
        for X, y in progress_bar:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

            if calc_acc5:
                _, pred_top5 = pred.topk(5, 1, largest=True, sorted=True)
                correct_top5 += pred_top5.eq(y.view(-1, 1).expand_as(pred_top5)).sum().item()

    test_loss /= num_batches
    accuracy = 100 * correct / size
    top5_accuracy = 100 * correct_top5 / size if calc_acc5 else None

    step = epoch * len(train_dataloader.dataset)
    if writer is not None:
        writer.add_scalar('test loss', test_loss, step)
        writer.add_scalar('test accuracy', accuracy, step)
        if calc_acc5:
            writer.add_scalar('test accuracy5', top5_accuracy, step)

    logger.info(f"Test Results - Epoch {epoch+1}: Accuracy={accuracy:.2f}%, Avg loss={test_loss:.6f}")
    if calc_acc5:
        logger.info(f"Top-5 Accuracy={top5_accuracy:.2f}%")

if __name__ == "__main__":
    params = Params()
    print(params, params.batch_size)

    training_folder_name = '/content/imagenetmini-1000/imagenet-mini/train'
    val_folder_name = '/content/imagenetmini-1000/imagenet-mini/val'

    train_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.RandomResizedCrop(224, interpolation=transforms.InterpolationMode.BILINEAR, antialias=True),
            transforms.RandomHorizontalFlip(0.5),
            # Normalize the pixel values (in R, G, and B channels)
            transforms.Normalize(mean=[0.485, 0.485, 0.406], std=[0.229, 0.224, 0.225])
        ])

    train_dataset = torchvision.datasets.ImageFolder(
        root=training_folder_name,
        transform=train_transformation
    )
    train_sampler = torch.utils.data.RandomSampler(train_dataset)

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=params.batch_size,
        sampler=train_sampler,
        num_workers = params.workers,
        pin_memory=True,
    )

    val_transformation = transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize(size=256, antialias=True),
            transforms.CenterCrop(224),
            # Normalize the pixel values (in R, G, and B channels)
            transforms.Normalize(mean=[0.485, 0.485, 0.406], std=[0.229, 0.224, 0.225])
        ])
    val_dataset = torchvision.datasets.ImageFolder(
        root=val_folder_name,
        transform=val_transformation
    )

    val_loader = torch.utils.data.DataLoader(
        val_dataset,
        batch_size=32,
        num_workers=params.workers,
        shuffle=False,
        pin_memory=True
    )

    # device
    print("Libraries imported - ready to use PyTorch", torch.__version__)
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")

    ## Testing with pre-trained model : only to be done once
    ## testing a pretrained model to validate correctness of our dataset, transform and metrics code
    # pretrained_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT').to(device)
    # start = time.time()
    # loss_fn = nn.CrossEntropyLoss()
    # test(val_loader, pretrained_model, loss_fn, epoch=0, writer=None, train_dataloader=train_loader, calc_acc5=True)
    # print("Elapsed: ", time.time() - start)

    # resume training options
    resume_training = True

    num_classes = len(train_dataset.classes)
    model = ResNet50(num_classes=num_classes)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.lr_step_size, gamma=params.lr_gamma)

    ## Current State of Training
    start_epoch = 0
    checkpoint_path = os.path.join("checkpoints", params.name, f"checkpoint.pth")
    print(checkpoint_path)
    if resume_training and os.path.exists(checkpoint_path):
        print("Resuming training from checkpoint")
        print(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model"])
        start_epoch = checkpoint["epoch"] + 1
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        assert params == checkpoint["params"]

    from torch.utils.tensorboard import SummaryWriter
    from pathlib import Path
    Path(os.path.join("checkpoints", params.name)).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter('runs/' + params.name)
    test(val_loader, model, loss_fn, epoch=0, writer=writer, train_dataloader=train_loader, calc_acc5=True)
    print("Starting training")
    for epoch in range(start_epoch, 10):
        print(f"Epoch {epoch}")
        train(train_loader, model, loss_fn, optimizer, epoch=epoch, writer=writer)
        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch,
            "params": params
        }
        torch.save(checkpoint, os.path.join("checkpoints", params.name, f"model_{epoch}.pth"))
        torch.save(checkpoint, os.path.join("checkpoints", params.name, f"checkpoint.pth"))
        lr_scheduler.step()
        test(val_loader, model, loss_fn, epoch + 1, writer, train_dataloader=train_loader, calc_acc5=True)

{'batch_size': 32, 'name': 'resnet_50_sgd1', 'workers': 4, 'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001, 'lr_step_size': 30, 'lr_gamma': 0.1} 32
Libraries imported - ready to use PyTorch 2.8.0+cu126
Using cuda device
checkpoints/resnet_50_sgd1/checkpoint.pth


Testing Epoch 1: 100%|██████████| 123/123 [00:36<00:00,  3.33it/s]
2025-10-16 17:24:57,093 - INFO - Test Results - Epoch 1: Accuracy=0.10%, Avg loss=42.227212
2025-10-16 17:24:57,094 - INFO - Top-5 Accuracy=0.48%


Starting training
Epoch 0


Epoch 1:   0%|          | 0/1086 [00:01<?, ?it/s, loss=7, progress=32/34745]2025-10-16 17:24:58,440 - INFO - Batch 1: loss=6.999109, progress=32/34745
Epoch 1:   9%|▉         | 100/1086 [00:33<05:35,  2.94it/s, loss=6.89, progress=3232/34745]2025-10-16 17:25:30,946 - INFO - Batch 101: loss=6.892321, progress=3232/34745
Epoch 1:  18%|█▊        | 200/1086 [01:06<04:53,  3.02it/s, loss=6.87, progress=6432/34745]2025-10-16 17:26:04,050 - INFO - Batch 201: loss=6.865868, progress=6432/34745
Epoch 1:  28%|██▊       | 300/1086 [01:40<04:33,  2.88it/s, loss=6.87, progress=9632/34745]2025-10-16 17:26:37,504 - INFO - Batch 301: loss=6.869095, progress=9632/34745
Epoch 1:  37%|███▋      | 400/1086 [02:14<03:53,  2.94it/s, loss=6.87, progress=12832/34745]2025-10-16 17:27:11,384 - INFO - Batch 401: loss=6.865733, progress=12832/34745
Epoch 1:  46%|████▌     | 500/1086 [02:48<03:15,  3.00it/s, loss=6.85, progress=16032/34745]2025-10-16 17:27:45,588 - INFO - Batch 501: loss=6.850127, progress=16032/3

Epoch 1


Epoch 2:   0%|          | 0/1086 [00:01<?, ?it/s, loss=6.73, progress=32/34745]2025-10-16 17:31:50,370 - INFO - Batch 1: loss=6.727271, progress=32/34745
Epoch 2:   9%|▉         | 100/1086 [00:36<05:38,  2.91it/s, loss=6.94, progress=3232/34745]2025-10-16 17:32:25,411 - INFO - Batch 101: loss=6.936485, progress=3232/34745
Epoch 2:  18%|█▊        | 200/1086 [01:11<05:08,  2.87it/s, loss=6.8, progress=6432/34745] 2025-10-16 17:33:00,285 - INFO - Batch 201: loss=6.803489, progress=6432/34745
Epoch 2:  28%|██▊       | 300/1086 [01:46<04:33,  2.87it/s, loss=6.92, progress=9632/34745]2025-10-16 17:33:35,387 - INFO - Batch 301: loss=6.921869, progress=9632/34745
Epoch 2:  37%|███▋      | 400/1086 [02:21<03:56,  2.90it/s, loss=6.78, progress=12832/34745]2025-10-16 17:34:10,580 - INFO - Batch 401: loss=6.779819, progress=12832/34745
Epoch 2:  46%|████▌     | 500/1086 [02:56<03:31,  2.77it/s, loss=6.77, progress=16032/34745]2025-10-16 17:34:45,698 - INFO - Batch 501: loss=6.768330, progress=1603

KeyboardInterrupt: 

In [36]:
import torch, random, numpy as np
seed = 42
random.seed(seed); np.random.seed(seed); torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [37]:
from torchvision import transforms

IMG_SIZE = 84  # canonical for mini-ImageNet

train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),   # <- make all images same HxW
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),   # deterministic for eval
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

class MiniImageNetDataset(Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        label = item["label"]
        # ensure RGB for consistent channel count
        image = image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

batch_size = 64

ds = load_dataset("timm/mini-imagenet")

trainset = MiniImageNetDataset(ds["train"], transform=train_transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=False, num_workers=0)

testset = MiniImageNetDataset(ds["validation"], transform=test_transform)
testloader = DataLoader(testset, batch_size=batch_size*2, shuffle=False, num_workers=0)

In [38]:
from torch_lr_finder import LRFinder
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from datetime import datetime
import os
from datasets import load_dataset

from torchvision import transforms

def find_lr(start_lr=1e-7, end_lr=10, num_iter=100, output_dir='lr_finder_plots'):
    params = Params()
    print(f"Find LR with params: Start_lr: {start_lr}, End_lr: {end_lr}, Num_iter: {num_iter}")
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")


    model = ResNet50(num_classes=100).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=start_lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    lr_finder.range_test(trainloader, start_lr=start_lr, end_lr=end_lr, num_iter=num_iter, step_mode="exp")

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Generate filename with timestamp and parameters
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'lr_finder_{timestamp}_start{start_lr}_end{end_lr}_iter{num_iter}.png'
    filepath = os.path.join(output_dir, filename)

    # Plot and save
    fig, ax = plt.subplots()
    lr_finder.plot(ax=ax)
    plt.title(f'Learning Rate Finder (iter: {num_iter})')
    plt.savefig(filepath, dpi=300, bbox_inches='tight')
    plt.close()

    print(f"Plot saved to: {filepath}")
    lr_finder.reset()

In [40]:
find_lr(start_lr=1e-5, end_lr=10, num_iter=100)

Find LR with params: Start_lr: 1e-05, End_lr: 10, Num_iter: 100
Using cuda device


  0%|          | 0/100 [00:00<?, ?it/s]

Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 4.98E-02
Plot saved to: lr_finder_plots/lr_finder_20251016_165754_start1e-05_end10_iter100.png


In [42]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import time
from math import sqrt

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Set Hyperparameters
class Params:
    def __init__(self):
        self.batch_size = 32
        self.name = "resnet_50_sgd1"
        self.workers = 4
        self.lr = 0.0498
        self.momentum = 0.9
        self.weight_decay = 1e-4
        self.lr_step_size = 30
        self.lr_gamma = 0.1

    def __repr__(self):
        return str(self.__dict__)

    def __eq__(self, other):
        return self.__dict__ == other.__dict__


#Updating with verbose tqdm train and test functions
from tqdm import tqdm  # For Jupyter-specific progress bar
import logging
import time

# Configure logging for Jupyter
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)
logger = logging.getLogger(__name__)

def train(dataloader, model, loss_fn, optimizer, epoch, writer):
    size = len(dataloader.dataset)
    model.train()
    start0 = time.time()

    # Use tqdm for progress visualization
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}")

    for batch, (X, y) in progress_bar:
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        batch_size = len(X)
        step = epoch * size + (batch + 1) * batch_size

        # Update tqdm description and writer
        if batch % 100 == 0:
            current_loss = loss.item()
            progress_bar.set_postfix({"loss": current_loss, "progress": f"{(batch+1)*batch_size}/{size}"})
            if writer is not None:
                writer.add_scalar('training loss', current_loss, step)
            logger.info(f"Batch {batch+1}: loss={current_loss:.6f}, progress={(batch+1)*batch_size}/{size}")

    epoch_time = time.time() - start0
    logger.info(f"Epoch {epoch+1} completed in {epoch_time:.2f} seconds")


def test(dataloader, model, loss_fn, epoch, writer, train_dataloader, calc_acc5=False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct, correct_top5 = 0, 0, 0

    # Use tqdm for progress visualization
    progress_bar = tqdm(dataloader, desc=f"Testing Epoch {epoch+1}")

    with torch.no_grad():
        for X, y in progress_bar:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

            if calc_acc5:
                _, pred_top5 = pred.topk(5, 1, largest=True, sorted=True)
                correct_top5 += pred_top5.eq(y.view(-1, 1).expand_as(pred_top5)).sum().item()

    test_loss /= num_batches
    accuracy = 100 * correct / size
    top5_accuracy = 100 * correct_top5 / size if calc_acc5 else None

    step = epoch * len(train_dataloader.dataset)
    if writer is not None:
        writer.add_scalar('test loss', test_loss, step)
        writer.add_scalar('test accuracy', accuracy, step)
        if calc_acc5:
            writer.add_scalar('test accuracy5', top5_accuracy, step)

    logger.info(f"Test Results - Epoch {epoch+1}: Accuracy={accuracy:.2f}%, Avg loss={test_loss:.6f}")
    if calc_acc5:
        logger.info(f"Top-5 Accuracy={top5_accuracy:.2f}%")

if __name__ == "__main__":
    params = Params()
    print(params, params.batch_size)

    # device
    print("Libraries imported - ready to use PyTorch", torch.__version__)
    device = (
        "cuda"
        if torch.cuda.is_available()
        else "mps"
        if torch.backends.mps.is_available()
        else "cpu"
    )

    print(f"Using {device} device")

    ## Testing with pre-trained model : only to be done once
    ## testing a pretrained model to validate correctness of our dataset, transform and metrics code
    # pretrained_model = torchvision.models.resnet18(weights='ResNet18_Weights.DEFAULT').to(device)
    # start = time.time()
    # loss_fn = nn.CrossEntropyLoss()
    # test(val_loader, pretrained_model, loss_fn, epoch=0, writer=None, train_dataloader=train_loader, calc_acc5=True)
    # print("Elapsed: ", time.time() - start)

    # resume training options
    resume_training = True

    num_classes = 100
    model = ResNet50(num_classes=num_classes)
    model.to(device)

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=params.lr, momentum=params.momentum, weight_decay=params.weight_decay)

    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=params.lr_step_size, gamma=params.lr_gamma)

    ## Current State of Training
    start_epoch = 0
    checkpoint_path = os.path.join("checkpoints", params.name, f"checkpoint.pth")
    print(checkpoint_path)
    if resume_training and os.path.exists(checkpoint_path):
        print("Resuming training from checkpoint")
        print(checkpoint_path)
        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint["model"])
        start_epoch = checkpoint["epoch"] + 1
        optimizer.load_state_dict(checkpoint["optimizer"])
        lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        assert params == checkpoint["params"]

    from torch.utils.tensorboard import SummaryWriter
    from pathlib import Path
    Path(os.path.join("checkpoints", params.name)).mkdir(parents=True, exist_ok=True)
    writer = SummaryWriter('runs/' + params.name)
    test(testloader, model, loss_fn, epoch=0, writer=writer, train_dataloader=trainloader, calc_acc5=True)
    print("Starting training")
    for epoch in range(start_epoch, 10):
        print(f"Epoch {epoch}")
        train(trainloader, model, loss_fn, optimizer, epoch=epoch, writer=writer)
        checkpoint = {
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
            "lr_scheduler": lr_scheduler.state_dict(),
            "epoch": epoch,
            "params": params
        }
        torch.save(checkpoint, os.path.join("checkpoints", params.name, f"model_{epoch}.pth"))
        torch.save(checkpoint, os.path.join("checkpoints", params.name, f"checkpoint.pth"))
        lr_scheduler.step()
        test(testloader, model, loss_fn, epoch + 1, writer, train_dataloader=trainloader, calc_acc5=True)

{'batch_size': 32, 'name': 'resnet_50_sgd1', 'workers': 4, 'lr': 0.0498, 'momentum': 0.9, 'weight_decay': 0.0001, 'lr_step_size': 30, 'lr_gamma': 0.1} 32
Libraries imported - ready to use PyTorch 2.8.0+cu126
Using cuda device
checkpoints/resnet_50_sgd1/checkpoint.pth


Testing Epoch 1: 100%|██████████| 79/79 [00:50<00:00,  1.56it/s]
2025-10-16 17:06:44,993 - INFO - Test Results - Epoch 1: Accuracy=1.00%, Avg loss=12.525382
2025-10-16 17:06:44,994 - INFO - Top-5 Accuracy=4.95%


Starting training
Epoch 0


Epoch 1:   0%|          | 0/782 [00:00<?, ?it/s, loss=5.1, progress=64/50000]2025-10-16 17:06:45,427 - INFO - Batch 1: loss=5.103751, progress=64/50000
Epoch 1:   4%|▍         | 31/782 [00:09<03:58,  3.15it/s, loss=5.1, progress=64/50000]


KeyboardInterrupt: 

In [31]:
find_lr()

Find LR with params: Start_lr: 1e-07, End_lr: 10, Num_iter: 100
Using cuda device


  0%|          | 0/100 [00:00<?, ?it/s]

Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 3.94E+00
Plot saved to: lr_finder_plots/lr_finder_20251016_163038_start1e-07_end10_iter100.png


In [32]:
find_lr(start_lr=1e-2, end_lr=10, num_iter=100)

Find LR with params: Start_lr: 0.01, End_lr: 10, Num_iter: 100
Using cuda device


  0%|          | 0/100 [00:00<?, ?it/s]

Learning rate search finished. See the graph with {finder_name}.plot()
LR suggestion: steepest gradient
Suggested LR: 1.75E-01
Plot saved to: lr_finder_plots/lr_finder_20251016_164501_start0.01_end10_iter100.png


In [1]:
from torch_lr_finder import LRFinder

  from tqdm.autonotebook import tqdm


In [None]:

model = ...
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay=1e-2)
lr_finder = LRFinder(model, optimizer, criterion, device="cuda")
lr_finder.range_test(trainloader, val_loader=val_loader, end_lr=1, num_iter=100, step_mode="linear")
lr_finder.plot(log_lr=False)
lr_finder.reset()

In [1]:
!pip install torch-lr-finder -v --global-option="apex"

Using pip 24.1.2 from /usr/local/lib/python3.12/dist-packages/pip (python 3.12)
[33mDEPRECATION: --build-option and --global-option are deprecated. pip 24.2 will enforce this behaviour change. A possible replacement is to use --config-settings. Discussion can be found at https://github.com/pypa/pip/issues/11859[0m[33m
[0mCollecting torch-lr-finder
  Downloading torch_lr_finder-0.2.2.tar.gz (17 kB)
  Running command python setup.py egg_info
  running egg_info
  creating /tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info
  writing /tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info/PKG-INFO
  writing dependency_links to /tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info/dependency_links.txt
  writing requirements to /tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info/requires.txt
  writing top-level names to /tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info/top_level.txt
  writing manifest file '/tmp/pip-pip-egg-info-ekda8hz2/torch_lr_finder.egg-info/SOURCES.t

In [2]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import os
import time

# Import PyTorch libraries
import torch
import torchvision
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

print("Libraries imported - ready to use PyTorch", torch.__version__)

def show_image(image, label):
    image = image.permute(1, 2, 0)
    plt.imshow(image.squeeze())
    plt.title(f'Label: {label}')
    plt.show()

Libraries imported - ready to use PyTorch 2.8.0+cu126


In [3]:
# device
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)

In [4]:
print(f"Using {device} device")

# resume training options
resume_training = True

class Params:
    def __init__(self):
        self.batch_size = 16
        self.name = "resnet_152_sgd1"
        self.workers = 4
        self.lr = 0.1
        self.momentum = 0.9
        self.weight_decay = 1e-4
        self.lr_step_size = 30
        self.lr_gamma = 0.1

    def __repr__(self):
        return str(self.__dict__)

    def __eq__(self, other):
        return self.__dict__ == other.__dict__

device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
params = Params()
params, params.batch_size

Using cuda device


({'batch_size': 16, 'name': 'resnet_152_sgd1', 'workers': 4, 'lr': 0.1, 'momentum': 0.9, 'weight_decay': 0.0001, 'lr_step_size': 30, 'lr_gamma': 0.1},
 16)

In [5]:
from datasets import load_dataset

ds = load_dataset("timm/mini-imagenet")

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00013.parquet:   0%|          | 0.00/485M [00:00<?, ?B/s]

data/train-00001-of-00013.parquet:   0%|          | 0.00/361M [00:00<?, ?B/s]

data/train-00002-of-00013.parquet:   0%|          | 0.00/410M [00:00<?, ?B/s]

data/train-00003-of-00013.parquet:   0%|          | 0.00/452M [00:00<?, ?B/s]

data/train-00004-of-00013.parquet:   0%|          | 0.00/395M [00:00<?, ?B/s]

data/train-00005-of-00013.parquet:   0%|          | 0.00/481M [00:00<?, ?B/s]

data/train-00006-of-00013.parquet:   0%|          | 0.00/412M [00:00<?, ?B/s]

data/train-00007-of-00013.parquet:   0%|          | 0.00/368M [00:00<?, ?B/s]

data/train-00008-of-00013.parquet:   0%|          | 0.00/396M [00:00<?, ?B/s]

data/train-00009-of-00013.parquet:   0%|          | 0.00/447M [00:00<?, ?B/s]

data/train-00010-of-00013.parquet:   0%|          | 0.00/495M [00:00<?, ?B/s]

data/train-00011-of-00013.parquet:   0%|          | 0.00/438M [00:00<?, ?B/s]

data/train-00012-of-00013.parquet:   0%|          | 0.00/505M [00:00<?, ?B/s]

data/validation-00000-of-00003.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

data/validation-00001-of-00003.parquet:   0%|          | 0.00/344M [00:00<?, ?B/s]

data/validation-00002-of-00003.parquet:   0%|          | 0.00/405M [00:00<?, ?B/s]

data/test-00000-of-00002.parquet:   0%|          | 0.00/326M [00:00<?, ?B/s]

data/test-00001-of-00002.parquet:   0%|          | 0.00/336M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [6]:
ds

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['image', 'label'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 5000
    })
})

In [7]:
from torchvision import transforms

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

In [8]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class MiniImageNetDataset(Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        label = item["label"]

        # Convert PIL image or np.array to tensor
        if self.transform:
            image = self.transform(image)

        return image, label


In [9]:
batch_size = 64  # or whatever you want

trainset = MiniImageNetDataset(ds["train"], transform=transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)

testset = MiniImageNetDataset(ds["validation"], transform=transform)
testloader = DataLoader(testset, batch_size=batch_size * 2, shuffle=False, num_workers=0)

In [10]:
from torchvision import transforms

IMG_SIZE = 84  # canonical for mini-ImageNet

train_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),   # <- make all images same HxW
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

test_transform = transforms.Compose([
    transforms.Resize((IMG_SIZE, IMG_SIZE)),   # deterministic for eval
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

class MiniImageNetDataset(Dataset):
    def __init__(self, hf_dataset, transform=None):
        self.dataset = hf_dataset
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        item = self.dataset[idx]
        image = item["image"]
        label = item["label"]
        # ensure RGB for consistent channel count
        image = image.convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label

batch_size = 64
trainset = MiniImageNetDataset(ds["train"], transform=train_transform)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=0)

testset = MiniImageNetDataset(ds["validation"], transform=test_transform)
testloader = DataLoader(testset, batch_size=batch_size*2, shuffle=False, num_workers=0)

In [11]:
images, labels = next(iter(trainloader))
print(images.shape, labels.shape)

torch.Size([64, 3, 84, 84]) torch.Size([64])


In [15]:
!pip install pytorch-lightning>=2.0.0

In [16]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torchvision.models as models
from torchmetrics import Accuracy
from torch.optim.lr_scheduler import CosineAnnealingLR

class ImageNetModule(pl.LightningModule):
    def __init__(
        self,
        num_classes: int = 1000,
        learning_rate: float = 0.1,
        weight_decay: float = 1e-4,
        epochs: int = 100,
        batch_size: int = 256
    ):
        super().__init__()
        self.save_hyperparameters()

        # Initialize ResNet50 model
        self.model = models.resnet50(weights=None, num_classes=num_classes)

        # Initialize weights using He initialization
        for m in self.model.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Metrics
        self.train_acc = Accuracy(task='multiclass', num_classes=num_classes, top_k=1)
        self.val_acc = Accuracy(task='multiclass', num_classes=num_classes, top_k=1)

        # Track training speed
        self.batch_start_time = None

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        self.batch_start_time = torch.cuda.Event(enable_timing=True)
        self.batch_end_time = torch.cuda.Event(enable_timing=True)

        self.batch_start_time.record()

        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)

        self.batch_end_time.record()
        torch.cuda.synchronize()
        batch_time = self.batch_start_time.elapsed_time(self.batch_end_time)

        # Calculate images per second
        images_per_second = self.hparams.batch_size / (batch_time / 1000.0)

        # Log metrics
        self.log('train_loss', loss, on_step=True, on_epoch=True)
        self.log('train_acc', self.train_acc(logits, y), on_step=True, on_epoch=True)
        self.log('train_images_per_second', images_per_second, on_step=True)
        self.log('gpu_memory_usage', torch.cuda.memory_allocated() / 1024**2, on_step=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = nn.functional.cross_entropy(logits, y)

        self.log('val_loss', loss, on_epoch=True, sync_dist=True)
        self.log('val_acc', self.val_acc(logits, y), on_epoch=True, sync_dist=True)

        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.SGD(
            self.parameters(),
            lr=self.hparams.learning_rate,
            momentum=0.9,
            weight_decay=self.hparams.weight_decay
        )

        scheduler = CosineAnnealingLR(
            optimizer,
            T_max=self.hparams.epochs,
            eta_min=0
        )

        return {
            'optimizer': optimizer,
            'lr_scheduler': {
                'scheduler': scheduler,
                'interval': 'epoch',
                'monitor': 'val_loss'
            }
        }

In [None]:
model = ImageNetModule(
        learning_rate=args.learning_rate,
        weight_decay=args.weight_decay,
        epochs=args.epochs,
        batch_size=args.batch_size
    )

In [17]:
model=ImageNetModel()

NameError: name 'ImageNetModel' is not defined

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-7, weight_decay=1e-2)
lr_finder = LRFinder(model, optimizer, criterion, device="cuda")

In [10]:
images, labels = next(iter(trainloader))
print(images.shape, labels.shape)

RuntimeError: stack expects each tensor to be equal size, but got [3, 377, 500] at entry 0 and [3, 500, 333] at entry 1

In [8]:
!pip install openml

Collecting openml
  Downloading openml-0.15.1-py3-none-any.whl.metadata (10 kB)
Collecting liac-arff>=2.4.0 (from openml)
  Downloading liac-arff-2.5.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting xmltodict (from openml)
  Downloading xmltodict-1.0.2-py3-none-any.whl.metadata (15 kB)
Collecting minio (from openml)
  Downloading minio-7.2.18-py3-none-any.whl.metadata (6.5 kB)
Collecting pycryptodome (from minio->openml)
  Downloading pycryptodome-3.23.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Downloading openml-0.15.1-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.4/160.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading minio-7.2.18-py3-none-any.whl (93 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-1.0.2-py3-none-any.whl (13 kB)
Downloading pycryptodome-3.23

In [7]:
!python downloader.py \
  -data_root /content/imagenet_subset \
  -number_of_classes 10 \
  -images_per_class 100

python3: can't open file '/content/downloader.py': [Errno 2] No such file or directory


In [6]:
training_folder_name = '../imagenet/ILSVRC/Data/CLS-LOC/train'
val_folder_name = '../imagenet/ILSVRC/Data/CLS-LOC/val'
os.listdir(training_folder_name)

FileNotFoundError: [Errno 2] No such file or directory: '../imagenet/ILSVRC/Data/CLS-LOC/train'