In [1]:
# import time
# time.sleep(5000)

### Import packages

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import optuna
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
from datetime import datetime
import os
import json

from data_loader import get_cifar10_dataloaders, get_test_dataloader, get_kaggle_test_dataloader
from helper import optimizer_map, scheduler_map, num_params, update_study_details
from models import BaseResNet, EfficientNetB0, SmallResNet0, LargeResNet0
from trainer import train_model
from run import single_run

  from .autonotebook import tqdm as notebook_tqdm


Configure the device

In [2]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [3]:
# Define the objective function for Optuna
def objective(trial):
    study_dir = f"studies/{trial.study.study_name}"
    os.makedirs(study_dir, exist_ok=True) # Create a directory for checkpoints if it doesn't exist

    # Suggest hyperparameters
    num_epochs = 250 # trial.suggest_int("num_epochs", 20, 35)
    model_type = trial.suggest_categorical("model_type", ["smallresnet", "efficientnet"])
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256])
    optimizer_type = trial.suggest_categorical("optimizer_type", ["Adam", "SGD"]) # Rmed: RMSprop
    scheduler_type = trial.suggest_categorical("scheduler_type", ["CosineAnnealingLR", "ReduceLROnPlateau", "OneCycleLR"]) # Rmed: StepLR
    
    optimizer_params = {
        "weight_decay": trial.suggest_categorical("weight_decay", [1e-4, 5e-4])
    }
    
    if optimizer_type == "SGD":
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
        optimizer_params["momentum"] = 0.9 # trial.suggest_float("momentum", 0.8, 0.9)
        optimizer_params["nesterov"] = True #bool(trial.suggest_categorical("nesterov", [0, 1]))
        optimizer_params["weight_decay"] = 5e-4
    else:
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 0.0001, 0.001, log=True)
        optimizer_params["weight_decay"] = 1e-4
        
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4, fill=0),  
        transforms.RandomHorizontalFlip(p=0.5),  
        transforms.RandomRotation(degrees=15),  
        transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  
        transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),  
        transforms.ToTensor(),
        transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])
    
    train_loader, valid_loader = get_cifar10_dataloaders(
        train_transform,
        subset_percent=1, 
        valid_size=0.1,
        batch_size=batch_size,
        num_workers=8,
        use_kaggle=True
    )

    scheduler_params = {}
    if scheduler_type == "StepLR":
        # num_epochs = trial.suggest_int("num_epochs", 150, 200)
        scheduler_params["step_size"] = trial.suggest_int("step_size", 5, 20)
        scheduler_params["gamma"] = trial.suggest_float("gamma", 0.1, 0.9)
        
    elif scheduler_type == "CosineAnnealingLR":
        # num_epochs = trial.suggest_int("num_epochs", 100, 150)
        scheduler_params["T_max"] = num_epochs
        
    elif scheduler_type == "ReduceLROnPlateau":
        # num_epochs = trial.suggest_int("num_epochs", 75, 125)
        scheduler_params["factor"] = trial.suggest_float("factor", 0.1, 0.9)
        scheduler_params["patience"] = trial.suggest_int("patience", 2, 10)
        scheduler_params["mode"] = "min"
        
    elif scheduler_type == "OneCycleLR":
        # num_epochs = trial.suggest_int("num_epochs", 50, 75)
        scheduler_params["max_lr"] = 0.1
        scheduler_params["steps_per_epoch"] = len(train_loader)
        scheduler_params["epochs"] = num_epochs
            
    # Select Model
    if model_type == "smallresnet":
        model = SmallResNet0()
    elif model_type == "efficientnet":
        model = EfficientNetB0()
    else:
        model = BaseResNet()
        
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    trial_details = trial.params.copy()
    trial_details["model_name"] = model.__class__.__name__
    trial_details["trainable_parameters"] = num_params(model)
    
    # Print trial details
    print("-" * 50)
    print(f"{trial.number=}")
    for param, val in trial_details.items():
        print(f"{param}: {val}")
    print("- " * 25)
    update_study_details(study_dir, trial.number, trial_details)

    optimizer = optimizer_map[optimizer_type](model.parameters(), **optimizer_params)
    scheduler = scheduler_map[scheduler_type](optimizer, **scheduler_params)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Checkpoint the model with the best validation accuracy
    chkpt_dir = os.path.join(study_dir, "checkpoint")
    plot_dir = os.path.join(study_dir, "plots")
    os.makedirs(chkpt_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    # Training
    best_val_accuracy = train_model(
        model, train_loader, criterion, optimizer, valid_loader=valid_loader, num_epochs=num_epochs, 
        device=device, scheduler=scheduler, trial=trial, chkpt_dir=chkpt_dir, plot_dir=plot_dir
    )
    
    trial_details["best_val_accuracy"] = best_val_accuracy
    update_study_details(study_dir, trial.number, trial_details)
    return best_val_accuracy

### Start new study

In [None]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
study_name = f"study_{timestamp}"

study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///study.db",
    direction="maximize",
    load_if_exists=True,
)

study.optimize(objective, n_trials=5)

print("Best trial:", study.best_trial.number)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2025-03-11 01:51:42,621] A new study created in RDB with name: study_2025-03-11_01-51-41


--------------------------------------------------
trial.number=0
model_type: smallresnet
batch_size: 128
optimizer_type: Adam
scheduler_type: ReduceLROnPlateau
weight_decay: 0.0005
learning_rate: 0.00032214137835438034
num_epochs: 115
factor: 0.3825196441327009
patience: 6
model_name: SmallResNet
trainable_parameters: 2998402
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/115], Batch [350/352], Train Acc: 29.2232 Loss: 1.8798
  Validation Accuracy after Epoch 1: 37.3200
  Epoch [2/115], Batch [350/352], Train Acc: 41.5246 Loss: 1.6083
  Validation Accuracy after Epoch 2: 44.4000
  Epoch [3/115], Batch [350/352], Train Acc: 48.6205 Loss: 1.5436
  Validation Accuracy after Epoch 3: 47.8200
  Epoch [4/115], Batch [350/352], Train Acc: 53.0580 Loss: 1.6191
  Validation Accuracy after Epoch 4: 51.8200
  Epoch [5/115], Batch [350/352], Train Acc: 56.9330 Loss: 1.5187
  Validation Accuracy after Epoch 5: 54.1400
  Epoch [6/115], Batch [350/352], Train Acc: 60.1250 Loss: 1.2948

[I 2025-03-11 02:15:35,733] Trial 0 finished with value: 86.7 and parameters: {'model_type': 'smallresnet', 'batch_size': 128, 'optimizer_type': 'Adam', 'scheduler_type': 'ReduceLROnPlateau', 'weight_decay': 0.0005, 'learning_rate': 0.00032214137835438034, 'num_epochs': 115, 'factor': 0.3825196441327009, 'patience': 6}. Best is trial 0 with value: 86.7.


--------------------------------------------------
trial.number=1
model_type: efficientnet
batch_size: 256
optimizer_type: Adam
scheduler_type: ReduceLROnPlateau
weight_decay: 0.0005
learning_rate: 0.0003941830586218311
num_epochs: 92
factor: 0.12924857724297942
patience: 6
model_name: EfficientNet
trainable_parameters: 3599686
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/92], Batch [170/176], Train Acc: 20.1333 Loss: 2.0302
  Validation Accuracy after Epoch 1: 26.9000
  Epoch [2/92], Batch [170/176], Train Acc: 29.9563 Loss: 1.8351
  Validation Accuracy after Epoch 2: 33.2200
  Epoch [3/92], Batch [170/176], Train Acc: 35.9283 Loss: 1.8577
  Validation Accuracy after Epoch 3: 38.8200
  Epoch [4/92], Batch [170/176], Train Acc: 39.8966 Loss: 1.7842
  Validation Accuracy after Epoch 4: 41.0800
  Epoch [5/92], Batch [170/176], Train Acc: 43.2950 Loss: 1.6887
  Validation Accuracy after Epoch 5: 44.2200
  Epoch [6/92], Batch [170/176], Train Acc: 45.8019 Loss: 1.5933
  Va

### Resume study
Helps run more studies since we only have 4 hour time limit

In [9]:
study_name = "study_2025-03-11_01-51-41"

# Load and continue running trials
study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///study.db",
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=5)  # Run another batch
print("Continued Study:")
print("Best trial:", study.best_trial.number)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2025-03-11 08:27:39,067] Using an existing study with name 'study_2025-03-11_01-51-41' instead of creating a new one.


--------------------------------------------------
trial.number=5
model_type: efficientnet
batch_size: 64
optimizer_type: Adam
scheduler_type: ReduceLROnPlateau
weight_decay: 0.0001
learning_rate: 0.0005113906292921991
factor: 0.13060806437027034
patience: 9
model_name: EfficientNet
trainable_parameters: 3599686
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/250], Batch [700/704], Train Acc: 25.2210 Loss: 1.8978
  Validation Accuracy after Epoch 1: 32.8600
  Epoch [2/250], Batch [700/704], Train Acc: 36.2232 Loss: 1.5889
  Validation Accuracy after Epoch 2: 40.1200
  Epoch [3/250], Batch [700/704], Train Acc: 42.4107 Loss: 1.6440
  Validation Accuracy after Epoch 3: 46.6800
  Epoch [4/250], Batch [700/704], Train Acc: 47.7969 Loss: 1.4218
  Validation Accuracy after Epoch 4: 50.4800
  Epoch [5/250], Batch [700/704], Train Acc: 52.0335 Loss: 1.5002
  Validation Accuracy after Epoch 5: 52.7600
  Epoch [6/250], Batch [700/704], Train Acc: 54.8326 Loss: 1.2450
  Validation A

[I 2025-03-11 09:13:30,198] Trial 5 pruned. 


--------------------------------------------------
trial.number=6
model_type: efficientnet
batch_size: 128
optimizer_type: Adam
scheduler_type: OneCycleLR
weight_decay: 0.0005
learning_rate: 0.0008843166259581984
model_name: EfficientNet
trainable_parameters: 3599686
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/250], Batch [350/352], Train Acc: 23.2478 Loss: 1.8883
  Validation Accuracy after Epoch 1: 31.0800
  Epoch [2/250], Batch [350/352], Train Acc: 34.7188 Loss: 1.5569
  Validation Accuracy after Epoch 2: 39.1000
  Epoch [3/250], Batch [350/352], Train Acc: 43.9152 Loss: 1.7054
  Validation Accuracy after Epoch 3: 43.8200
  Epoch [4/250], Batch [350/352], Train Acc: 49.5246 Loss: 1.4229
  Validation Accuracy after Epoch 4: 49.6200
  Epoch [5/250], Batch [350/352], Train Acc: 53.7679 Loss: 1.5889
  Validation Accuracy after Epoch 5: 54.0200
  Epoch [6/250], Batch [350/352], Train Acc: 56.6987 Loss: 1.3183
  Validation Accuracy after Epoch 6: 56.6200
  Epoch [7/250]

[I 2025-03-11 09:21:18,173] Trial 6 pruned. 


--------------------------------------------------
trial.number=7
model_type: smallresnet
batch_size: 256
optimizer_type: Adam
scheduler_type: OneCycleLR
weight_decay: 0.0001
learning_rate: 0.0005112796186151655
model_name: SmallResNet
trainable_parameters: 2998402
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/250], Batch [170/176], Train Acc: 23.2996 Loss: 1.9870
  Validation Accuracy after Epoch 1: 27.2000
  Trial pruned due to no improvement.


[I 2025-03-11 09:21:31,567] Trial 7 pruned. 


--------------------------------------------------
trial.number=8
model_type: efficientnet
batch_size: 128
optimizer_type: Adam
scheduler_type: CosineAnnealingLR
weight_decay: 0.0001
learning_rate: 0.00025033043537248264
model_name: EfficientNet
trainable_parameters: 3599686
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/250], Batch [350/352], Train Acc: 17.6138 Loss: 2.1723
  Validation Accuracy after Epoch 1: 20.6200
  Trial pruned due to no improvement.


[I 2025-03-11 09:21:50,389] Trial 8 pruned. 


--------------------------------------------------
trial.number=9
model_type: efficientnet
batch_size: 64
optimizer_type: SGD
scheduler_type: OneCycleLR
weight_decay: 0.0001
learning_rate: 0.01570247106382465
model_name: EfficientNet
trainable_parameters: 3599686
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/250], Batch [700/704], Train Acc: 27.4844 Loss: 1.8903
  Validation Accuracy after Epoch 1: 35.0600
  Epoch [2/250], Batch [700/704], Train Acc: 38.6808 Loss: 1.6047
  Validation Accuracy after Epoch 2: 39.4200
  Epoch [3/250], Batch [700/704], Train Acc: 45.3504 Loss: 1.7497
  Validation Accuracy after Epoch 3: 48.9600
  Epoch [4/250], Batch [700/704], Train Acc: 50.2254 Loss: 1.3604
  Validation Accuracy after Epoch 4: 50.9200
  Epoch [5/250], Batch [700/704], Train Acc: 53.3839 Loss: 1.4715
  Validation Accuracy after Epoch 5: 55.7600
  Epoch [6/250], Batch [700/704], Train Acc: 56.3504 Loss: 1.2240
  Validation Accuracy after Epoch 6: 56.9800
  Epoch [7/250], Ba

[I 2025-03-11 09:30:30,570] Trial 9 pruned. 


Continued Study:
Best trial: 0
Best hyperparameters: {'model_type': 'smallresnet', 'batch_size': 128, 'optimizer_type': 'Adam', 'scheduler_type': 'ReduceLROnPlateau', 'weight_decay': 0.0005, 'learning_rate': 0.00032214137835438034, 'num_epochs': 115, 'factor': 0.3825196441327009, 'patience': 6}
Best validation accuracy: 86.7


### Single Run

In [4]:
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(0.5),
#     transforms.RandomRotation(15),
#     transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))  # Normalize with mean and std of CIFAR-10
# ])

# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
# ])

# Aggressive Augmentation
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4, fill=0),  
    transforms.RandomHorizontalFlip(p=0.5),  
    transforms.RandomRotation(degrees=15),  
    transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  
    transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),  
    transforms.ToTensor(),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)), # Random Erasing (Mimics `Cutout`)
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

In [11]:
model = LargeResNet0()
num_epochs = 30
lr_min = 1e-6
lr_max = 1e-2
epochs = 30
len_train_dataset = 50_000 * 0.9
step_size = (len_train_dataset/64) // 2

single_run(
    model,
    train_transform,
    num_epochs=num_epochs,
    batch_size=128,
    optimizer_type="SGD",
    optimizer_params={"lr": lr_min, "momentum": 0.9, "nesterov": True},
    scheduler_type="CyclicLR",
    scheduler_params={
        "base_lr": lr_min, "max_lr": lr_max, "step_size_up": step_size, 
        "step_size_down": step_size, "gamma": 0.9999, "mode": "exp_range", "cycle_momentum": False
    }
)

NameError: name 'step_size_up' is not defined

### Load Checkpoint

In [5]:
model = SmallResNet0()
model.to(device)

# best_checkpoint_fp = "checkpoints_study_2025-03-10_19-00-59/model_trial_0_val_acc_0.8604.pth"
best_checkpoint_fp = "studies/study_2025-03-11_01-51-41/checkpoint/trial_0_val_acc_SmallResNet_86.7000_2025-03-11_02-15-35.pth"

if not best_checkpoint_fp:
    checkpoint_dir = f"checkpoints_{study_name}"
    with open(os.path.join(checkpoint_dir, "study_details.json"), "r") as f:
        study_details = json.load(f)
    best_checkpoint_fp = study_details[str(study.best_trial.number)]["checkpoint_path"]

# Load the latest checkpoint
checkpoint = torch.load(best_checkpoint_fp)
model.load_state_dict(checkpoint)

<All keys matched successfully>

#### Test on test data

In [6]:
from trainer import evaluate_model
from data_loader import get_test_dataloader

test_loader = get_test_dataloader(use_kaggle=True)
acc, _ = evaluate_model(model, test_loader, device=device)
print("Acc:", acc)

Acc: 77.65


#### Test on cifar10.1 subset

In [7]:
from cifar10_1_dataloader import get_dataloader_10_1
dataloader_10_1 = get_dataloader_10_1()

acc, _ = evaluate_model(model, dataloader_10_1, device)
print("Acc:", acc)

Acc: 65.6


### Run model on Kaggle test data

In [31]:
from data_loader import get_kaggle_test_dataloader

In [33]:
# Generate submission file with test data
kaggle_test_loader = get_kaggle_test_dataloader()

model.eval()
predictions = []

with torch.no_grad():
    for images, in kaggle_test_loader:
        images = images.to(device)
        outputs = model(images) 
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())

submission = pd.DataFrame({'ID': np.arange(len(predictions)), 'Labels': predictions})
submission.to_csv('submission.csv', index=False)
print("submission file saved.")

submission file saved.


In [35]:
# import kaggle
# kaggle.api.competition_submit(
#     file_name="submission.csv",
#     message="0.9365",
#     competition="deep-learning-spring-2025-project-1"
# )

100%|██████████| 67.3k/67.3k [00:00<00:00, 330kB/s]


Successfully submitted to Deep Learning Spring 2025: CIFAR 10 classification