In [1]:
# import time
# time.sleep(5000)

### Import packages

In [2]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, Subset
import optuna
from torch.optim import lr_scheduler # StepLR, CosineAnnealingLR, ReduceLROnPlateau
import numpy as np
import pandas as pd
from datetime import datetime
import os
import json

from data_loader import get_cifar10_dataloaders
from trainer import train_model
from model import ResNet18

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Configure the device

In [3]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

Using device: cuda


In [4]:
# See the total number of trainable parameters
def num_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [5]:
model = ResNet18()
num_params(model)

4903242

In [6]:
def update_study_details(checkpoint_dir, trial_num, trial_details):
    file_path = os.path.join(checkpoint_dir, "study_details.json")
    
    if not os.path.exists(file_path):
        with open(file_path, "w") as f:
            json.dump({}, f, indent=4)
    
    with open(file_path, "r") as f:
        study_details = json.load(f)

    study_details[str(trial_num)] = trial_details
    
    with open(file_path, "w") as f:
        json.dump(study_details, f, indent=4)

In [27]:
# Define the objective function for Optuna
def objective(trial):
    study_name = trial.study.study_name
    checkpoint_dir = f"checkpoints_{study_name}"
    os.makedirs(checkpoint_dir, exist_ok=True) # Create a directory for checkpoints if it doesn't exist

    # Suggest hyperparameters
    num_epochs = 150 # trial.suggest_int("num_epochs", 20, 35)
    batch_size = trial.suggest_categorical("batch_size", [64, 128]) # Rmed: 256
    optimizer_type = trial.suggest_categorical("optimizer_type", ["Adam", "SGD"]) # Rmed: RMSprop
    scheduler_type = trial.suggest_categorical("scheduler_type", ["CosineAnnealingLR", "ReduceLROnPlateau", "OneCycleLR"]) # Rmed: StepLR

    optimizer_map = {
        "Adam": optim.AdamW,
        "SGD": optim.SGD,
        "RMSprop": optim.RMSprop
    }

    scheduler_map = {
        "StepLR": lr_scheduler.StepLR,
        "CosineAnnealingLR": lr_scheduler.CosineAnnealingLR,
        "ReduceLROnPlateau": lr_scheduler.ReduceLROnPlateau,
        "OneCycleLR": lr_scheduler.OneCycleLR
    }
    
    optimizer_params = {
        "weight_decay": trial.suggest_categorical("weight_decay", [1e-4, 5e-4])
    }
    
    if optimizer_type == "SGD":
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 0.01, 0.05, log=True)
        optimizer_params["momentum"] = 0.9 # trial.suggest_float("momentum", 0.8, 0.9)
        optimizer_params["nesterov"] = True #bool(trial.suggest_categorical("nesterov", [0, 1]))
        optimizer_params["weight_decay"] = 5e-4
    else:
        optimizer_params["lr"] = trial.suggest_float("learning_rate", 0.0001, 0.001, log=True)
        optimizer_params["weight_decay"] = 1e-4
        
    # Suggest data transformations
    transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(trial.suggest_float("h_flip", 0.0, 1.0)),
        transforms.RandomRotation(trial.suggest_int("rotation", 0, 30)),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))  # Normalize with mean and std of CIFAR-10
    ])
    
    train_loader, valid_loader = get_cifar10_dataloaders(
        transform,
        subset_percent=1, 
        valid_size=0.1,
        batch_size=batch_size,
        num_workers=4
    )

    scheduler_params = {}
    if scheduler_type == "StepLR":
        scheduler_params["step_size"] = trial.suggest_int("step_size", 5, 20)
        scheduler_params["gamma"] = trial.suggest_float("gamma", 0.1, 0.9)
    elif scheduler_type == "CosineAnnealingLR":
        scheduler_params["T_max"] = num_epochs #trial.suggest_int("T_max", 10, 50)
        scheduler_params["eta_min"] = trial.suggest_float("eta_min", 0.0, 1e-6)
    elif scheduler_type == "ReduceLROnPlateau":
        scheduler_params["factor"] = trial.suggest_float("factor", 0.1, 0.9)
        scheduler_params["patience"] = trial.suggest_int("patience", 2, 10)
        scheduler_params["mode"] = "min"
    elif scheduler_type == "OneCycleLR":
        scheduler_params["max_lr"] = 0.1
        scheduler_params["steps_per_epoch"] = len(train_loader)
        scheduler_params["epochs"] = num_epochs

    # Define model
    model = ResNet18()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    trial_details = trial.params.copy()
    trial_details["model_name"] =  "resnet18"
    trial_details["trainable_parameters"] = num_params(model)
    
    # Print trial details
    print("-" * 50)
    print(f"{trial.number=}")
    for param, val in trial_details.items():
        print(f"{param}: {val}")
    print("- " * 25)
    update_study_details(checkpoint_dir, trial.number, trial_details)

    optimizer = optimizer_map[optimizer_type](model.parameters(), **optimizer_params)
    scheduler = scheduler_map[scheduler_type](optimizer, **scheduler_params)
    criterion = nn.CrossEntropyLoss()

    # Training
    best_val_accuracy = train_model(
        trial, model, train_loader, criterion, optimizer, 
        valid_loader=valid_loader, num_epochs=num_epochs, device=device,
        scheduler=scheduler
    )

    # Checkpoint the model with the best validation accuracy
    model_filename = f"model_trial_{trial.number}_val_acc_{best_val_accuracy:.4f}.pth"
    model_path = os.path.join(checkpoint_dir, model_filename)
    
    # Save the model state_dict
    torch.save(model.state_dict(), model_path)
    print(f"Model checkpoint saved to {model_path}")
    trial_details["best_val_accuracy"] = best_val_accuracy
    trial_details["checkpoint_path"] = model_path
    update_study_details(checkpoint_dir, trial.number, trial_details)
    
    return best_val_accuracy

In [28]:
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
study_name = f"study_{timestamp}"    

study = optuna.create_study(direction="maximize", study_name=study_name)
study.optimize(objective, n_trials=1)

print("Best trial:", study.best_trial.number)
print("Best hyperparameters:", study.best_params)
print("Best validation accuracy:", study.best_value)

[I 2025-03-10 02:23:41,767] A new study created in memory with name: study_2025-03-10_02-23-41


--------------------------------------------------
trial.number=0
batch_size: 64
optimizer_type: Adam
scheduler_type: OneCycleLR
weight_decay: 0.0001
learning_rate: 0.0006361471280328357
h_flip: 0.37619653334953174
rotation: 21
model_name: resnet18
trainable_parameters: 4903242
- - - - - - - - - - - - - - - - - - - - - - - - - 
  Epoch [1/20], Batch [700/704], Train Acc: 0.3321 Loss: 1.6405
  Validation Accuracy after Epoch 1: 0.4142
  Epoch [2/20], Batch [700/704], Train Acc: 0.4540 Loss: 1.5143
  Validation Accuracy after Epoch 2: 0.4498
  Epoch [3/20], Batch [700/704], Train Acc: 0.4957 Loss: 1.5282
  Validation Accuracy after Epoch 3: 0.5304
  Epoch [4/20], Batch [700/704], Train Acc: 0.5027 Loss: 1.6859
  Validation Accuracy after Epoch 4: 0.4598
  Epoch [5/20], Batch [700/704], Train Acc: 0.5141 Loss: 1.0712
  Validation Accuracy after Epoch 5: 0.5288
  Epoch [6/20], Batch [700/704], Train Acc: 0.5305 Loss: 1.3236
  Validation Accuracy after Epoch 6: 0.5288
  Epoch [7/20], Batch 

[I 2025-03-10 02:28:48,733] Trial 0 finished with value: 0.8034 and parameters: {'batch_size': 64, 'optimizer_type': 'Adam', 'scheduler_type': 'OneCycleLR', 'weight_decay': 0.0001, 'learning_rate': 0.0006361471280328357, 'h_flip': 0.37619653334953174, 'rotation': 21}. Best is trial 0 with value: 0.8034.


  Validation Accuracy after Epoch 20: 0.8034
Trial 0 complete. Best Validation Accuracy: 0.8034

Model checkpoint saved to checkpoints_study_2025-03-10_02-23-41/model_trial_0_val_acc_0.8034.pth
Best trial: 0
Best hyperparameters: {'batch_size': 64, 'optimizer_type': 'Adam', 'scheduler_type': 'OneCycleLR', 'weight_decay': 0.0001, 'learning_rate': 0.0006361471280328357, 'h_flip': 0.37619653334953174, 'rotation': 21}
Best validation accuracy: 0.8034


### Load Checkpoint

In [29]:
model = ResNet18()
model.to(device)

checkpoint_dir = f"checkpoints_{study_name}"
with open(os.path.join(checkpoint_dir, "study_details.json"), "r") as f:
    study_details = json.load(f)
best_checkpoint_fp = study_details[str(study.best_trial.number)]["checkpoint_path"]
# best_checkpoint_fp = ""

# Load the latest checkpoint
checkpoint = torch.load(best_checkpoint_fp)
model.load_state_dict(checkpoint)

<All keys matched successfully>

#### Test on test data

In [30]:
from trainer import evaluate_model
from data_loader import get_test_dataloader

test_loader = get_test_dataloader()
acc = evaluate_model(model, test_loader, device)
print("Acc:", acc)

Acc: 0.8394


#### Test on cifar10.1 subset

In [31]:
from cifar10_1_dataloader import get_dataloader_10_1
dataloader_10_1 = get_dataloader_10_1()

acc = evaluate_model(model, dataloader_10_1, device)
print("Acc:", acc)

Acc: 0.74


### Run model on Kaggle test data

In [28]:
from data_loader import get_kaggle_test_dataloader

In [29]:
# Generate submission file with test data
kaggle_test_loader = get_kaggle_test_dataloader()

model.eval()
predictions = []

with torch.no_grad():
    for images, in kaggle_test_loader:
        images = images.to(device)
        outputs = model(images) 
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().numpy())

# submission = pd.DataFrame({'ID': np.arange(len(predictions)), 'Labels': predictions})
# submission.to_csv('submission.csv', index=False)
# print("submission file saved.")

In [12]:
# import kaggle
# kaggle.api.competition_submit(
#     file_name="submission.csv",
#     message="0.9237",
#     competition=competition_name
# )