# Load libraries

In [None]:
! pip install optuna

Collecting optuna
  Downloading optuna-3.4.0-py3-none-any.whl (409 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m409.6/409.6 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.12.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.8/226.8 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.0-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.0 alembic-1.12.1 colorlog-6.7.0 optuna-3.4.0


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
import optuna
import numpy as np

# Set the Device

You should determine if a GPU is available and set your device accordingly.

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device} device")

Using cuda device


# Load the CIFAR-10 dataset

As part of the image preprocessing pipeline for a neural network, we need to prepare the images for input into the model. We use these transformations to ensure that the input images are in the correct format, size, and value range for the neural network to process effectively.

In this specific case:

1. `transforms.Resize(224)`: Resize the input images so that their shorter side is 224 pixels; because CIFAR-10 images are square (32x32), the result is 224x224. Many pre-trained models, like the ResNet architecture used in this example, were originally trained on the ImageNet dataset, where the standard input size is 224x224.

2. `transforms.ToTensor()`: Convert the input images from PIL format or NumPy arrays to PyTorch tensors.

3. `transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))`: Normalize the pixel values of the images. The mean and standard deviation for each channel (Red, Green, Blue) are both set to 0.5, so each value `x` in [0, 1] produced by `ToTensor()` is mapped to `(x - 0.5) / 0.5`, i.e. into the range [-1, 1]. Centering and rescaling the inputs this way typically makes training more efficient and leads to faster convergence.

In [None]:
# Preprocessing applied to every image of both splits:
# upscale to the ResNet input size, convert to a tensor, then normalise each channel.
transform = transforms.Compose(
    [
        transforms.Resize(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
    ]
)

# Datasets first (the training split triggers the download, the test split reuses it) ...
train_set = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_set = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# ... then the loaders; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_set, batch_size=64, shuffle=True, num_workers=2)
test_loader = DataLoader(dataset=test_set, batch_size=64, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:13<00:00, 12901852.66it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


# Defining the Neural Network Architecture with Transfer Learning
We'll use a pre-trained ResNet-18 model and replace its final fully connected layer with a dropout layer followed by a 10-class linear layer for CIFAR-10.

In [None]:
def create_model(dropout_rate):
    """Build an ImageNet-pretrained ResNet-18 with a 10-class CIFAR-10 head.

    Args:
        dropout_rate: Dropout probability applied to the features fed into the
            new linear classifier.

    Returns:
        The model, moved to the notebook-level ``device``.
    """
    # `pretrained=True` is deprecated since torchvision 0.13; the weights enum
    # selects the same ImageNet-1k checkpoint explicitly and without a warning.
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    num_ftrs = model.fc.in_features
    # Replace the original classifier with dropout followed by a 10-way linear layer.
    model.fc = nn.Sequential(
        nn.Dropout(dropout_rate),
        nn.Linear(num_ftrs, 10)  # CIFAR-10 has 10 classes
    )
    return model.to(device)

# Define useful functions

In [None]:
def train_model(model, train_loader, optimizer, criterion):
    """Run one optimisation pass over ``train_loader``.

    Args:
        model: Network to optimise; switched to training mode here.
        train_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        optimizer: Optimizer bound to the model's parameters.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous step before accumulating new ones.
        optimizer.zero_grad()
        inputs = inputs.to(device)
        labels = labels.to(device)
        batch_loss = criterion(model(inputs), labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    num_batches = len(train_loader)
    return running_loss / num_batches

In [None]:
def validate_model(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` without updating it.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        test_loader: Iterable of ``(inputs, labels)`` batches that supports ``len()``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for inputs, labels in test_loader:
            predictions = model(inputs.to(device))
            running_loss += criterion(predictions, labels.to(device)).item()
    num_batches = len(test_loader)
    return running_loss / num_batches

# Setting up the Hyperparameter Search Space with Optuna

How to pick the values for hyperparameters?

1. Learning rates
  - Common values range between 0.1 and 0.0001.
  - Starting with values like 0.001 or 0.01 is common practice. These values are often a good starting point as they are not too large to cause divergence nor too small to slow down convergence significantly.
  - You might choose a range of values that decrease by an order of magnitude (e.g., 0.1, 0.01, 0.001) to explore how sensitive your model is to the learning rate.

2. Dropout rates
  - Typical values range from 0.1 to 0.5.
  - Starting with a moderate value like 0.2 or 0.3 can help gauge the effect of dropout on your specific model and dataset.

3. Weight decay (L2 regularization)
  - Common values are small, such as 0.0001, 0.001, or even 0.01, since the regularization term is added to the loss and can significantly influence the gradients if too large.

In [None]:
def objective(trial):
    """Optuna objective: train a fresh model for three epochs and score it.

    Args:
        trial: Optuna trial used to sample ``lr``, ``dropout_rate`` and
            ``weight_decay`` and to name the saved checkpoint.

    Returns:
        The validation loss measured after the final epoch.
    """
    # Draw this trial's configuration.
    lr = trial.suggest_float("lr", 0.001, 0.01)
    p_drop = trial.suggest_categorical("dropout_rate", [0.2])
    l2_penalty = trial.suggest_categorical("weight_decay", [0.0001, 0.001])

    print(f"\nStarting Trial {trial.number}: lr={lr}, dropout={p_drop}, weight_decay={l2_penalty}")

    # Fresh network, optimizer and loss for every trial.
    net = create_model(p_drop)
    adam = optim.Adam(net.parameters(), lr=lr, weight_decay=l2_penalty)
    loss_fn = nn.CrossEntropyLoss()

    # Alternate one training pass and one validation pass per epoch.
    for epoch in range(3):
        train_loss = train_model(net, train_loader, adam, loss_fn)
        val_loss = validate_model(net, test_loader, loss_fn)
        print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Persist the weights under a name derived from the trial number.
    model_path = f"model_trial_{trial.number}.pth"
    torch.save(net.state_dict(), model_path)
    print(f"Model saved as {model_path}")

    return val_loss

In [None]:
def train_and_evaluate_with_early_stopping(model, train_loader, test_loader, optimizer, criterion, epochs=5,
                                           patience=5, checkpoint_path="best_model.pth"):
    """Train with per-epoch validation, checkpoint the best weights, stop early on a plateau.

    After training, the weights from the epoch with the lowest validation loss
    are loaded back into ``model``.

    Args:
        model: Network to train; modified in place.
        train_loader: Batches passed to ``train_model``.
        test_loader: Batches passed to ``validate_model``.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss callable shared by training and validation.
        epochs: Maximum number of epochs to run.
        patience: Number of consecutive non-improving epochs after which
            training stops. It only has an effect when smaller than ``epochs``.
        checkpoint_path: File that receives the best ``state_dict``.

    Returns:
        None.
    """
    best_val_loss = float('inf')
    patience_counter = 0
    # Tracks whether *this* call wrote a checkpoint, so a stale file left by an
    # earlier run is never loaded by mistake.
    checkpoint_saved = False

    for epoch in range(epochs):
        train_loss = train_model(model, train_loader, optimizer, criterion)  # Training
        val_loss = validate_model(model, test_loader, criterion)  # Validation

        # Print loss for every epoch
        print(f"Epoch {epoch}, Train Loss: {train_loss}, Validation Loss: {val_loss}")

        # A NaN loss never compares as smaller, so it counts as "no improvement".
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)  # Save best model
            checkpoint_saved = True
        else:
            patience_counter += 1

        # Check for early stopping
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

    if checkpoint_saved:
        # Restore the best weights seen during this call.
        model.load_state_dict(torch.load(checkpoint_path))
    else:
        print("No epoch improved the validation loss; keeping the current weights.")

In [None]:
# Create an in-memory study that minimises the value returned by `objective`,
# then run a single trial.
study = optuna.create_study(direction="minimize")
study.optimize(func=objective, n_trials=1)

[I 2023-11-22 14:29:10,184] A new study created in memory with name: no-name-23c879e8-6f53-4f19-9b08-9fbaa0c533be



Starting Trial 0: lr=0.003994808316121756, dropout=0.2, weight_decay=0.001


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 172MB/s]


Epoch 0: Train Loss: 1.4975, Val Loss: 1.9459
Epoch 1: Train Loss: 1.1574, Val Loss: 1.4262


[I 2023-11-22 14:38:27,212] Trial 0 finished with value: 1.0398335984558056 and parameters: {'lr': 0.003994808316121756, 'dropout_rate': 0.2, 'weight_decay': 0.001}. Best is trial 0 with value: 1.0398335984558056.


Epoch 2: Train Loss: 0.9979, Val Loss: 1.0398
Model saved as model_trial_0.pth


# Identify Best Configuration and Load the Best Model
After hyperparameter tuning with Optuna, load the best performing model.

In [None]:
# Look up the winning trial and report its index.
best_trial = study.best_trial
print(f"Best trial: {best_trial.number}")

# Loss used for the continued training below.
criterion = nn.CrossEntropyLoss()

# Rebuild the architecture with the winning dropout rate, then restore the
# weights that trial saved to disk.
model = create_model(best_trial.params['dropout_rate'])
model.load_state_dict(torch.load(f"model_trial_{best_trial.number}.pth"))

# Optionally, keep training with the best hyperparameters for more epochs
optimizer = optim.Adam(
    model.parameters(),
    lr=best_trial.params['lr'],
    weight_decay=best_trial.params['weight_decay'],
)
train_and_evaluate_with_early_stopping(
    model=model,
    train_loader=train_loader,
    test_loader=test_loader,
    optimizer=optimizer,
    criterion=criterion,
)

Best trial: 0
Epoch 0, Train Loss: 0.893689848006229, Validation Loss: 1.0500298746072563
Epoch 1, Train Loss: 0.8427119655224978, Validation Loss: 1.3185259916220502
Epoch 2, Train Loss: 0.8113234102954645, Validation Loss: 1.161614367157031
Epoch 3, Train Loss: 0.7962137435555763, Validation Loss: 1.016058033818652
Epoch 4, Train Loss: 0.7717666113391861, Validation Loss: 0.8164087351720044


# Compute Classification Performance Metrics

In [None]:
def evaluate_performance(model, test_loader):
    """Compute, print and return the top-1 accuracy of ``model`` on ``test_loader``.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        test_loader: Iterable of ``(inputs, labels)`` batches.

    Returns:
        Accuracy in percent (0-100).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, targets in test_loader:
            data, targets = data.to(device), targets.to(device)
            outputs = model(data)
            # Index of the largest logit per row is the predicted class.
            predicted = outputs.argmax(dim=1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")

    accuracy = 100 * correct / total
    print(f'Accuracy of the model on test images: {accuracy}%')
    return accuracy

# Report accuracy on the CIFAR-10 test split; the trailing semicolon keeps the
# cell from echoing a return value on top of the printed line.
evaluate_performance(model=model, test_loader=test_loader);

Accuracy of the model on test images: 71.44%


# Next you could...

- Save the model for deployment and load it to make predictions on new data; this could be part of a web app, for example.
- Use GradCAM to understand what the model is using to make predictions
- Do Feature Visualization to understand what each layer is "seeing"