In [25]:
# import time
# time.sleep(5000)

### Import packages

In [1]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import optuna
from torch.optim import lr_scheduler
import numpy as np
import pandas as pd
from datetime import datetime
import os
import json

from data_loader import get_cifar10_dataloaders, get_test_dataloader, get_kaggle_test_dataloader
from helper import optimizer_map, scheduler_map, num_params, update_study_details
from models import BaseResNet, EfficientNetB0, SmallResNet0, LargeResNet0, SmallResNet1
from trainer import train_model
from run import single_run

  from .autonotebook import tqdm as notebook_tqdm


Configure the device

In [2]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [5]:
# Define the objective function for Optuna
def objective(trial):
    study_dir = f"studies/{trial.study.study_name}"
    os.makedirs(study_dir, exist_ok=True) # Create a directory for checkpoints if it doesn't exist

    # Suggest hyperparameters
    num_epochs = trial.suggest_int("num_epochs", 50, 200)
    model_type = trial.suggest_categorical("model_type", ["smallresnet", "efficientnet", "largeresnet"])
    batch_size = trial.suggest_categorical("batch_size", [64, 128, 256, 512])
    optimizer_type = trial.suggest_categorical("optimizer_type", ["Adam", "SGD", "AdamW"])
    scheduler_type = trial.suggest_categorical("scheduler_type", ["CosineAnnealingLR", "OneCycleLR", "ReduceLROnPlateau"])
    
    optimizer_params = {}
    if optimizer_type == "SGD":
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 0.001, 0.1, log=True)
        optimizer_params["momentum"] = trial.suggest_float("momentum", 0.8, 0.99)
        optimizer_params["weight_decay"] = trial.suggest_float("weight_decay", 1e-5, 5e-4, log=True)
        scheduler_type = trial.suggest_categorical("scheduler_type", ["CosineAnnealingLR", "OneCycleLR", "ReduceLROnPlateau"])
        optimizer_params["nesterov"] = scheduler_type != "ReduceLROnPlateau"
    
    elif optimizer_type == "Adam":
        optimizer_params["betas"] = (
            trial.suggest_float("beta1", 0.85, 0.95), 
            trial.suggest_float("beta2", 0.99, 0.999)
        )
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
        optimizer_params["weight_decay"] = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)
        if scheduler_type == "OneCycleLR":
            scheduler_type = None

    elif optimizer_type == "AdamW":
        optimizer_params["betas"] = (
            trial.suggest_float("beta1", 0.85, 0.95), 
            trial.suggest_float("beta2", 0.99, 0.999)
        )
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 1e-5, 5e-3, log=True)
        optimizer_params["weight_decay"] = trial.suggest_float("weight_decay", 1e-3, 1e-1, log=True)
        scheduler_type = trial.suggest_categorical("scheduler_type", ["CosineAnnealingLR", "OneCycleLR", "ReduceLROnPlateau"])

    
    # train_transform = transforms.Compose([
    #     transforms.RandomCrop(32, padding=4),
    #     transforms.RandomHorizontalFlip(0.5),
    #     transforms.RandomRotation(15),
    #     transforms.ToTensor(),
    #     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    # ])

    # Realistic tranformation for better generalization
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.05),  # Mild color variations
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
        
    train_loader, valid_loader = get_cifar10_dataloaders(
        train_transform,
        subset_percent=1, 
        valid_size=0.1,
        batch_size=batch_size,
        num_workers=8,
        use_kaggle=True
    )

    scheduler_params = {}
    if scheduler_type == "CosineAnnealingLR":
        # num_epochs = trial.suggest_int("num_epochs", 100, 150)
        scheduler_params["T_max"] = num_epochs
        scheduler_params["eta_min"] = trial.suggest_float("eta_min", 1e-6, 1e-3, log=True)
        
    elif scheduler_type == "ReduceLROnPlateau":
        # num_epochs = trial.suggest_int("num_epochs", 75, 125)
        scheduler_params["factor"] = trial.suggest_float("factor", 0.1, 0.5)
        scheduler_params["patience"] = trial.suggest_int("patience", 5, 20)
        scheduler_params["threshold"] = trial.suggest_float("factor", 0.01, 0.1)
        scheduler_params["mode"] = "min"
        
    elif scheduler_type == "OneCycleLR":
        # num_epochs = trial.suggest_int("num_epochs", 50, 75)
        if optimizer_type == "SGD":
            scheduler_params["max_lr"] = trial.suggest_float("factor", 0.01, 0.3)
        else: # AdamW
            scheduler_params["max_lr"] = trial.suggest_float("factor", 0.001, 0.01)
        scheduler_params["steps_per_epoch"] = len(train_loader)
        scheduler_params["epochs"] = num_epochs
        scheduler_params["anneal_strategy"] = "cos"

                
    # Select Model
    if model_type == "smallresnet":
        model = SmallResNet0()
    elif model_type == "efficientnet":
        model = EfficientNetB0()
    elif model_type == "largeresnet":
        model = LargeResNet0()
    else:
        model = BaseResNet()
        
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    trial_details = trial.params.copy()
    trial_details["trainable_parameters"] = num_params(model)
    
    # Print trial details
    print("-" * 50)
    print(f"{trial.number=}")
    for param, val in trial_details.items():
        print(f"{param}: {val}")
    print("- " * 25)
    update_study_details(study_dir, trial.number, trial_details)

    optimizer = optimizer_map[optimizer_type](model.parameters(), **optimizer_params)
    scheduler = scheduler_map[scheduler_type](optimizer, **scheduler_params) if scheduler_type else None
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Checkpoint the model with the best validation accuracy
    chkpt_dir = os.path.join(study_dir, "checkpoint")
    plot_dir = os.path.join(study_dir, "plots")
    os.makedirs(chkpt_dir, exist_ok=True)
    os.makedirs(plot_dir, exist_ok=True)

    # Training
    best_val_accuracy = train_model(
        model, train_loader, criterion, optimizer, valid_loader=valid_loader, num_epochs=num_epochs, 
        device=device, scheduler=scheduler, trial=trial, chkpt_dir=chkpt_dir, plot_dir=plot_dir
    )
    
    trial_details["best_val_accuracy"] = best_val_accuracy
    update_study_details(study_dir, trial.number, trial_details)
    return best_val_accuracy

### Start new study

In [None]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
study_name = f"study_{timestamp}"

study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///study.db",
    direction="maximize",
    load_if_exists=True,
)

study.optimize(objective, n_trials=25)

print("Best trial:", study.best_trial.number)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

### Resume study
Helps run more studies since we only have 4 hour time limit

In [None]:
study_name = "study_2025-03-11_16-30-52"

# Load and continue running trials
study = optuna.create_study(
    study_name=study_name,
    storage="sqlite:///study.db",
    direction="maximize",
    load_if_exists=True
)

study.optimize(objective, n_trials=25)  # Run another batch
print("Continued Study:")
print("Best trial:", study.best_trial.number)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2025-03-11 19:25:36,469] Using an existing study with name 'study_2025-03-11_16-30-52' instead of creating a new one.


--------------------------------------------------
trial.number=39
num_epochs: 107
model_type: largeresnet
batch_size: 128
optimizer_type: Adam
scheduler_type: OneCycleLR
beta1: 0.9351259225328429
beta2: 0.9977488472985936
learning_rate: 0.0005225672559860701
weight_decay: 7.013916683605919e-06
trainable_parameters: 4903242
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/107], Batch [350/352], Train Acc: 45.6004 Loss: 1.4767
  Validation Accuracy after Epoch 1: 56.3200
  Cidar10.1 Accuracy: 46.25
  Epoch [2/107], Batch [350/352], Train Acc: 64.2545 Loss: 1.3031
  Validation Accuracy after Epoch 2: 64.8800
  Cidar10.1 Accuracy: 53.4
  Epoch [3/107], Batch [350/352], Train Acc: 72.0357 Loss: 1.2326
  Validation Accuracy after Epoch 3: 73.7800
  Cidar10.1 Accuracy: 61.75
  Epoch [4/107], Batch [350/352], Train Acc: 76.4955 Loss: 1.0721
  Validation Accuracy after Epoch 4: 78.2200
  Cidar10.1 Accuracy: 64.15
  Epoch [5/107], Batch [350/352], Train Acc: 79.2054 Loss: 0.9299
  

### Single Run

In [12]:
# Min Augmentation
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
# ])


# # Medium Augmentation
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(0.5),
#     transforms.RandomRotation(15),
#     # transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
#     # transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))  # Normalize with mean and std of CIFAR-10
# ])

# Realistic tranformation for better generalization
train_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.05),  # Mild color variations
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])


# # Aggressive Augmentation
# train_transform = transforms.Compose([
#     transforms.RandomCrop(32, padding=4, fill=0),  
#     transforms.RandomHorizontalFlip(p=0.5),  
#     transforms.RandomRotation(degrees=15),  
#     transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1),  
#     transforms.RandomAffine(degrees=10, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),  
#     transforms.ToTensor(),
#     transforms.RandomErasing(p=0.5, scale=(0.02, 0.2), ratio=(0.3, 3.3)), # Random Erasing (Mimics `Cutout`)
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
# ])

In [29]:
model = SmallResNet1()
num_epochs = 50

single_run(
    model,
    train_transform,
    num_epochs=num_epochs,
    batch_size=128,
    optimizer_type="SGD",
    optimizer_params={"lr": 0.01, "weight_decay": 5e-4, "momentum": 0.9, "nesterov": True},
    scheduler_type="CosineAnnealingLR",
    scheduler_params={"T_max": num_epochs},
    criterion_params={"label_smoothing": 0.1}
)

  Epoch [1/50], Batch [350/352], Train Acc: 35.8103 Loss: 1.6766
  Validation Accuracy after Epoch 1: 48.5000
  Cidar10.1 Accuracy: 41.25
  Epoch [2/50], Batch [350/352], Train Acc: 52.8326 Loss: 1.5933
  Validation Accuracy after Epoch 2: 44.2400
  Cidar10.1 Accuracy: 36.45
  Epoch [3/50], Batch [350/352], Train Acc: 62.1406 Loss: 1.1996
  Validation Accuracy after Epoch 3: 68.5400
  Cidar10.1 Accuracy: 56.9
  Epoch [4/50], Batch [350/352], Train Acc: 68.7746 Loss: 1.2148
  Validation Accuracy after Epoch 4: 65.1600
  Cidar10.1 Accuracy: 54.65
  Epoch [5/50], Batch [350/352], Train Acc: 72.5781 Loss: 1.1803
  Validation Accuracy after Epoch 5: 75.4200
  Cidar10.1 Accuracy: 64.25
  Epoch [6/50], Batch [350/352], Train Acc: 75.6429 Loss: 1.0801
  Validation Accuracy after Epoch 6: 71.8400
  Cidar10.1 Accuracy: 61.0
  Epoch [7/50], Batch [350/352], Train Acc: 77.6473 Loss: 0.9669
  Validation Accuracy after Epoch 7: 79.2800
  Cidar10.1 Accuracy: 68.5
  Epoch [8/50], Batch [350/352], Trai

88.76

In [None]:
model = LargeResNet0()
num_epochs = 100
lr_min = 1e-6
lr_max = 1e-2
len_train_dataset = 50_000 * 0.9
step_size = (len_train_dataset/64) // 2

single_run(
    model,
    train_transform,
    num_epochs=num_epochs,
    batch_size=128,
    optimizer_type="SGD",
    optimizer_params={"lr": lr_min, "momentum": 0.9, "nesterov": True},
    scheduler_type="CyclicLR",
    scheduler_params={
        "base_lr": lr_min, "max_lr": lr_max, "step_size_up": step_size, 
        "step_size_down": step_size, "gamma": 0.9999, "mode": "exp_range", "cycle_momentum": False
    }
)

### Load Checkpoint

In [5]:
model = SmallResNet0()
model.to(device)

# best_checkpoint_fp = "checkpoints_study_2025-03-10_19-00-59/model_trial_0_val_acc_0.8604.pth"
best_checkpoint_fp = "studies/study_2025-03-11_01-51-41/checkpoint/trial_0_val_acc_SmallResNet_86.7000_2025-03-11_02-15-35.pth"

if not best_checkpoint_fp:
    checkpoint_dir = f"checkpoints_{study_name}"
    with open(os.path.join(checkpoint_dir, "study_details.json"), "r") as f:
        study_details = json.load(f)
    best_checkpoint_fp = study_details[str(study.best_trial.number)]["checkpoint_path"]

# Load the latest checkpoint
checkpoint = torch.load(best_checkpoint_fp)
model.load_state_dict(checkpoint)

<All keys matched successfully>

#### Test on test data

In [35]:
from trainer import evaluate_model
from data_loader import get_test_dataloader

test_loader = get_test_dataloader(use_kaggle=True)
acc, _ = evaluate_model(model, test_loader, device=device)
print("Acc:", acc)

Acc: 83.23


#### Test on cifar10.1 subset

In [36]:
from cifar10_1_dataloader import get_dataloader_10_1
dataloader_10_1 = get_dataloader_10_1()

acc, _ = evaluate_model(model, dataloader_10_1, device)
print("Acc:", acc)

Acc: 72.5


### Run model on Kaggle test data

In [31]:
from data_loader import get_kaggle_test_dataloader

In [33]:
# Generate submission file with test data
kaggle_test_loader = get_kaggle_test_dataloader()

model.eval()
predictions = []

with torch.no_grad():
    for images, in kaggle_test_loader:
        images = images.to(device)
        outputs = model(images) 
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())

submission = pd.DataFrame({'ID': np.arange(len(predictions)), 'Labels': predictions})
submission.to_csv('submission.csv', index=False)
print("submission file saved.")

submission file saved.


In [35]:
# import kaggle
# kaggle.api.competition_submit(
#     file_name="submission.csv",
#     message="0.9365",
#     competition="deep-learning-spring-2025-project-1"
# )

100%|██████████| 67.3k/67.3k [00:00<00:00, 330kB/s]


Successfully submitted to Deep Learning Spring 2025: CIFAR 10 classification