In [2]:
# import time
# time.sleep(5000)

### Import packages

In [3]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import optuna
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
from datetime import datetime
import os
import json

from data_loader import get_cifar10_dataloaders, get_test_dataloader, get_kaggle_test_dataloader
from helper import optimizer_map, scheduler_map, num_params, update_study_details
from models import BaseResNet, EfficientNetB0, SmallResNet0
from trainer import train_model
from run import single_run

  from .autonotebook import tqdm as notebook_tqdm


Configure the device

In [4]:
# Device configuration
# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Using device:", device)

# Build the network once up front so its size can be reported.
# (`num_params` is a project helper; per the printed label it presumably
# counts trainable parameters — confirm in helper.py.)
model = SmallResNet0()
print("Trainable parameters:", num_params(model))

Using device: cuda
Trainable parameters: 2998402


In [13]:
# Define the objective function for Optuna
def objective(trial):
    """Train one model with Optuna-suggested hyperparameters and score it.

    Samples the batch size, optimizer, LR scheduler and their settings from
    ``trial``, trains a ``SmallResNet0`` with ``train_model`` and records the
    trial's configuration and result through ``update_study_details`` under
    ``studies/<study name>/``.

    Args:
        trial: The Optuna trial that supplies hyperparameter suggestions. It is
            also forwarded to ``train_model`` (presumably for intermediate
            reporting / pruning — confirm in trainer.py).

    Returns:
        The value returned by ``train_model`` (called ``best_val_accuracy``
        here). The studies in this notebook are created with
        ``direction="maximize"``.
    """
    study_dir = f"studies/{trial.study.study_name}"
    os.makedirs(study_dir, exist_ok=True)  # Per-study folder for details and checkpoints

    # Suggest hyperparameters
    num_epochs = 150  # Fallback only; every scheduler branch below picks its own range
    batch_size = trial.suggest_categorical("batch_size", [64, 128])  # Removed: 256
    optimizer_type = trial.suggest_categorical("optimizer_type", ["Adam", "SGD"])  # Removed: RMSprop
    scheduler_type = trial.suggest_categorical(
        "scheduler_type", ["CosineAnnealingLR", "ReduceLROnPlateau", "OneCycleLR"]
    )  # Removed: StepLR

    # Weight decay is fixed per optimizer rather than searched. It used to be
    # suggested and then unconditionally overwritten, which logged a value in
    # the trial that was never the one actually used.
    if optimizer_type == "SGD":
        optimizer_params = {
            "lr": trial.suggest_float("learning_rate", 0.01, 0.05, log=True),
            "momentum": 0.9,  # trial.suggest_float("momentum", 0.8, 0.9)
            "nesterov": True,  # bool(trial.suggest_categorical("nesterov", [0, 1]))
            "weight_decay": 5e-4,
        }
    else:
        optimizer_params = {
            "lr": trial.suggest_float("learning_rate", 0.0001, 0.001, log=True),
            "weight_decay": 1e-4,
        }

    train_transform = transforms.Compose([
        # Random Cropping with Padding (Mimics Albumentations `PadIfNeeded`)
        transforms.RandomCrop(32, padding=4, fill=0),

        # Random Horizontal Flip
        transforms.RandomHorizontalFlip(p=0.5),

        # Random Rotation (Limited to ±15 degrees)
        transforms.RandomRotation(degrees=15),

        # Color Jitter (Brightness, Contrast, Saturation, Hue)
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),

        # Random Affine (Mimics `ShiftScaleRotate`)
        transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),

        transforms.ToTensor(),
        # Random Erasing (Mimics `Cutout`)
        transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    # The loaders are built before the scheduler because OneCycleLR needs the
    # number of batches per epoch.
    train_loader, valid_loader = get_cifar10_dataloaders(
        train_transform,
        subset_percent=1,
        valid_size=0.1,
        batch_size=batch_size,
        num_workers=4,
        use_kaggle=False
    )

    scheduler_params = {}
    if scheduler_type == "StepLR":
        # Not reachable while "StepLR" is removed from the choices above; kept
        # so it can be re-enabled by adding it back to the list.
        num_epochs = trial.suggest_int("num_epochs", 150, 200)
        scheduler_params["step_size"] = trial.suggest_int("step_size", 5, 20)
        scheduler_params["gamma"] = trial.suggest_float("gamma", 0.1, 0.9)

    elif scheduler_type == "CosineAnnealingLR":
        num_epochs = trial.suggest_int("num_epochs", 100, 150)
        scheduler_params["T_max"] = num_epochs  # trial.suggest_int("T_max", 10, 50)
        scheduler_params["eta_min"] = trial.suggest_float("eta_min", 0.0, 1e-6)

    elif scheduler_type == "ReduceLROnPlateau":
        num_epochs = trial.suggest_int("num_epochs", 75, 125)
        scheduler_params["factor"] = trial.suggest_float("factor", 0.1, 0.9)
        scheduler_params["patience"] = trial.suggest_int("patience", 2, 10)
        # NOTE(review): "min" is right only if train_model steps the scheduler
        # with a loss; if it steps with validation accuracy this should be
        # "max" — confirm in trainer.py.
        scheduler_params["mode"] = "min"

    elif scheduler_type == "OneCycleLR":
        num_epochs = trial.suggest_int("num_epochs", 50, 75)
        # NOTE(review): max_lr is fixed at 0.1 for both optimizers, far above
        # the Adam range sampled for "learning_rate" — confirm this is intended.
        scheduler_params["max_lr"] = 0.1
        scheduler_params["steps_per_epoch"] = len(train_loader)
        scheduler_params["epochs"] = num_epochs

    # Define model. SmallResNet0 is the architecture imported by this notebook
    # (the previous `ResNet18()` call referenced a name that is never imported).
    model = SmallResNet0()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    trial_details = trial.params.copy()
    # Label the record with the class actually instantiated.
    trial_details["model_name"] = type(model).__name__
    trial_details["trainable_parameters"] = num_params(model)
    # Not a searched parameter, so record the effective value explicitly.
    trial_details["weight_decay"] = optimizer_params["weight_decay"]

    # Print trial details
    print("-" * 50)
    print(f"{trial.number=}")
    for param, val in trial_details.items():
        print(f"{param}: {val}")
    print("- " * 25)
    update_study_details(study_dir, trial.number, trial_details)

    optimizer = optimizer_map[optimizer_type](model.parameters(), **optimizer_params)
    scheduler = scheduler_map[scheduler_type](optimizer, **scheduler_params)
    criterion = nn.CrossEntropyLoss()

    # Checkpoint path handed to train_model. The accuracy cannot be part of the
    # file name because it is only known once training has finished; it is
    # stored next to the path in the study details instead.
    chkpt_filename = f"trial_{trial.number}.pth"
    chkpt_fp = os.path.join(study_dir, chkpt_filename)

    # Training
    best_val_accuracy = train_model(
        model, train_loader, criterion, optimizer,
        valid_loader=valid_loader, num_epochs=num_epochs,
        device=device, scheduler=scheduler, trial=trial, chkpt_fp=chkpt_fp
    )

    trial_details["best_val_accuracy"] = best_val_accuracy
    trial_details["checkpoint_path"] = chkpt_fp
    update_study_details(study_dir, trial.number, trial_details)

    return best_val_accuracy

### Start new study

In [1]:
# Name the study after its launch time so every run gets a distinct entry in
# the shared SQLite storage.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
study_name = "study_" + timestamp

study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///study.db",
    study_name=study_name,
    load_if_exists=True,
)

# Run the first batch of trials.
study.optimize(objective, n_trials=5)

# Summarise the best trial found so far.
for label, value in (
    ("Best trial:", study.best_trial.number),
    ("Best hyperparameters:", study.best_params),
    ("Best validation accuracy:", study.best_value),
):
    print(label, value)

### Resume study
Resuming the study lets us run more trials, since we only have a 4-hour time limit.

In [2]:
# Load and continue running trials
# NOTE: this reuses `study_name` from the "Start new study" cell; after a
# kernel restart, assign `study_name` to the study you want to resume first.
study = optuna.create_study(
    direction="maximize",
    storage="sqlite:///study.db",
    study_name=study_name,
    load_if_exists=True,
)

study.optimize(objective, n_trials=5)  # Run another batch

print("Continued Study:")
for label, value in (
    ("Best trial:", study.best_trial.number),
    ("Best hyperparameters:", study.best_params),
    ("Best validation accuracy:", study.best_value),
):
    print(label, value)

### Single Run

In [14]:
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(0.5),
#     transforms.RandomRotation(15),
#     transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))  # Normalize with mean and std of CIFAR-10
# ])

# Aggressive Augmentation
# Geometric and colour augmentations are applied before ToTensor; random
# erasing and normalization are applied after it, on the tensor.
train_transform = transforms.Compose(
    [
        # Pad by 4 pixels (zero fill) and take a random 32x32 crop.
        transforms.RandomCrop(size=32, padding=4, fill=0),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),
        transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),
        transforms.ToTensor(),
        # Random Erasing (Mimics `Cutout`)
        transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
        # Per-channel mean / std used for normalization.
        transforms.Normalize(
            mean=(0.4914, 0.4822, 0.4465),
            std=(0.2023, 0.1994, 0.2010),
        ),
    ]
)

In [None]:
# Train one fixed configuration (no hyperparameter search).
num_epochs = 30
model = SmallResNet0()

single_run(
    model,
    train_transform,
    num_epochs=num_epochs,
    batch_size=128,
    optimizer_type="SGD",
    optimizer_params=dict(lr=0.01, weight_decay=5e-4, momentum=0.9),
    scheduler_type="CosineAnnealingLR",
    # T_max is tied to the epoch count so both change together.
    scheduler_params=dict(T_max=num_epochs),
)

  Epoch [1/30], Batch [350/352], Train Acc: 0.2524 Loss: 1.8914
  Validation Accuracy after Epoch 1: 0.3226
  Epoch [2/30], Batch [350/352], Train Acc: 0.3479 Loss: 1.5866
  Validation Accuracy after Epoch 2: 0.3732
  Epoch [3/30], Batch [350/352], Train Acc: 0.4121 Loss: 1.5375
  Validation Accuracy after Epoch 3: 0.4736
  Epoch [4/30], Batch [350/352], Train Acc: 0.4751 Loss: 1.7242
  Validation Accuracy after Epoch 4: 0.4838
  Epoch [5/30], Batch [350/352], Train Acc: 0.5137 Loss: 1.3794
  Validation Accuracy after Epoch 5: 0.5136
  Epoch [6/30], Batch [350/352], Train Acc: 0.5491 Loss: 1.2662
  Validation Accuracy after Epoch 6: 0.5108
  Epoch [7/30], Batch [350/352], Train Acc: 0.5721 Loss: 1.1457
  Validation Accuracy after Epoch 7: 0.5318
  Epoch [8/30], Batch [350/352], Train Acc: 0.5942 Loss: 1.1658
  Validation Accuracy after Epoch 8: 0.4618
  Epoch [9/30], Batch [350/352], Train Acc: 0.6137 Loss: 1.0630
  Validation Accuracy after Epoch 9: 0.6036
  Epoch [10/30], Batch [350/

### Load Checkpoint

In [5]:
model = SmallResNet0()
model.to(device)

# Checkpoint to restore. Set this to "" (or None) to look up the best trial's
# checkpoint from the current study's details file instead.
best_checkpoint_fp = "checkpoints_study_2025-03-10_19-00-59/model_trial_0_val_acc_0.8604.pth"

if not best_checkpoint_fp:
    # Use the same per-study directory that `objective` writes to
    # (`studies/<study name>`); needs `study_name` and `study` from the
    # study cells above.
    checkpoint_dir = f"studies/{study_name}"
    with open(os.path.join(checkpoint_dir, "study_details.json"), "r") as f:
        study_details = json.load(f)
    best_checkpoint_fp = study_details[str(study.best_trial.number)]["checkpoint_path"]

# Load the checkpoint. `map_location` remaps the stored tensors onto the
# current device, so a checkpoint saved on a GPU also loads on a CPU-only
# machine.
# NOTE: torch.load unpickles the file — only load checkpoints you trust
# (consider `weights_only=True` on PyTorch versions that support it).
checkpoint = torch.load(best_checkpoint_fp, map_location=device)
model.load_state_dict(checkpoint)

<All keys matched successfully>

#### Test on test data

In [6]:
from trainer import evaluate_model
from data_loader import get_test_dataloader

# Evaluate the restored `model` on the loader returned with use_kaggle=False
# (presumably the standard CIFAR-10 test split — confirm in data_loader.py).
test_loader = get_test_dataloader(use_kaggle=False)
# `acc` is whatever evaluate_model returns; it is printed as the accuracy.
acc = evaluate_model(model, test_loader, device)
print("Acc:", acc)

Acc: 0.9128


#### Test on CIFAR-10.1 subset

In [7]:
from cifar10_1_dataloader import get_dataloader_10_1
# Loader for the CIFAR-10.1 data, built with the helper's default arguments.
dataloader_10_1 = get_dataloader_10_1()

# Reuses `evaluate_model`, `model` and `device` from the cells above, and
# overwrites the `acc` computed for the regular test set.
acc = evaluate_model(model, dataloader_10_1, device)
print("Acc:", acc)

Acc: 0.835


### Run model on Kaggle test data

In [31]:
from data_loader import get_kaggle_test_dataloader

In [33]:
# Generate submission file with test data
kaggle_test_loader = get_kaggle_test_dataloader()

predictions = []
model.eval()

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for batch in kaggle_test_loader:
        # Each batch is a one-element sequence holding just the images.
        (images,) = batch
        logits = model(images.to(device))
        # Index of the largest output along dim 1 is the predicted label.
        predicted = logits.max(dim=1).indices
        predictions.extend(predicted.cpu().numpy())

# One row per prediction; IDs are consecutive integers starting at 0.
submission = pd.DataFrame({'ID': np.arange(len(predictions)), 'Labels': predictions})
submission.to_csv('submission.csv', index=False)
print("submission file saved.")

submission file saved.


In [35]:
# import kaggle
# kaggle.api.competition_submit(
#     file_name="submission.csv",
#     message="0.9365",
#     competition="deep-learning-spring-2025-project-1"
# )

100%|██████████| 67.3k/67.3k [00:00<00:00, 330kB/s]


Successfully submitted to Deep Learning Spring 2025: CIFAR 10 classification