# [Optuna](https://optuna.readthedocs.io/en/latest/tutorial/index.html): An Automatic Hyperparameter Optimization Framework
By [Zahra Taheri](https://github.com/zata213), December 11, 2020

## Some good resources and references:
- [Optuna tutorial](https://optuna.readthedocs.io/en/latest/tutorial/index.html)
- [Hyper-parameter optimization with Optuna](https://towardsdatascience.com/https-medium-com-perlitz-hyper-parameter-optimization-with-optuna-1c32785e7df)
- [How We Implement Hyperband in Optuna](https://tech.preferred.jp/en/blog/how-we-implement-hyperband-in-optuna/)
- [Using Optuna to Optimize PyTorch Hyperparameters](https://medium.com/pytorch/using-optuna-to-optimize-pytorch-hyperparameters-990607385e36)

## Optuna installation:
To install this package with conda, run one of the following:
- `conda install -c conda-forge optuna`
- `conda install -c conda-forge/label/cf202003 optuna`

To install this package with pip, run:
- `pip install optuna`

In [None]:
# Uncomment the next line to install Optuna from inside the notebook.
#!pip install optuna

In [None]:
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

import time

import torch
import torchvision
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchsummary import summary

import warnings

# Seed the PyTorch and NumPy global random number generators.
torch.manual_seed(123)
np.random.seed(42)

# Blanket filter: every warning category is silenced from here on.
warnings.filterwarnings(action='ignore')

In [None]:
# Preprocessing shared by both splits: resize to 256, centre-crop to 224x224,
# convert to a tensor, then normalise each RGB channel.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.4915, 0.4823, 0.4468],
        std=[0.2470, 0.2435, 0.2616],
    ),
])

# CIFAR-10 train and test splits, stored under ./data (download requested).
trainset = datasets.CIFAR10(root='./data', train=True, transform=transform, download=True)
testset = datasets.CIFAR10(root='./data', train=False, transform=transform, download=True)

Files already downloaded and verified
Files already downloaded and verified


In [None]:
# Names of the ten CIFAR-10 categories, positioned by integer label.
class_names = [
    'airplane',
    'automobile',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

In [None]:
# Binary sub-problem: keep only two CIFAR-10 classes and relabel them 0/1.
label_map = {1: 0, 7: 1}  # original label -> binary label (1 = automobile, 7 = horse)
class_names = ['automobile', 'horse']  # NOTE: replaces the 10-class list defined above

# Filter on the raw integer targets first, so the resize/crop/normalise
# transform only runs for the images that are kept. Iterating the dataset
# directly would transform every image just to read its label.
# `label_map` is the single source of truth for which labels survive.
cifar2 = [(trainset[idx][0], label_map[label])
          for idx, label in enumerate(trainset.targets)
          if label in label_map]
cifar2_val = [(testset[idx][0], label_map[label])
              for idx, label in enumerate(testset.targets)
              if label in label_map]

In [None]:
# Use the first CUDA device when PyTorch reports one, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Loss function used by the training and evaluation code in this notebook.
criterion = nn.CrossEntropyLoss()

In [None]:
def compute_accuracy(net, data_loader):
    """Return the classification accuracy of ``net`` over ``data_loader`` in percent.

    The function does not change the train/eval mode of ``net``; callers are
    expected to call ``net.eval()`` beforehand when that matters.

    Args:
        net: Module mapping a batch of features to per-class scores.
        data_loader: Iterable yielding ``(features, targets)`` tensor batches.

    Returns:
        A scalar float tensor with the percentage (0-100) of samples whose
        arg-max prediction equals the target.

    Raises:
        ValueError: If ``data_loader`` yields no samples.
    """
    correct_pred, num_examples = 0, 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            # Call the module itself rather than ``.forward`` so hooks still run.
            outputs = net(features)
            predicted_labels = torch.argmax(outputs, 1)
            num_examples += targets.size(0)
            correct_pred += (predicted_labels == targets).sum()
    if num_examples == 0:
        # Without this guard the code below fails with an obscure
        # AttributeError, because ``correct_pred`` is still a plain int.
        raise ValueError("data_loader yielded no samples; accuracy is undefined")
    return correct_pred.float() / num_examples * 100
            
            

def compute_loss(net, data_loader):
    """Return the mean per-batch loss of ``net`` over ``data_loader``.

    The result is the average of the per-batch ``criterion`` values, so every
    batch carries equal weight even if the last one is smaller.

    Args:
        net: Module mapping a batch of features to per-class scores.
        data_loader: Iterable yielding ``(features, targets)`` tensor batches.

    Returns:
        The mean batch loss as a Python float.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            outputs = net(features)
            total_loss += criterion(outputs, targets).item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError("data_loader yielded no batches; loss is undefined")
    # Divide by the number of batches, not by the index of the last batch
    # (which is one too small and is zero for a single-batch loader).
    return total_loss / num_batches

In [None]:
def training(num_epochs, batch_size, optimizer, net):
    """Train ``net`` on ``cifar2`` and return its accuracy on ``cifar2_val``.

    Args:
        num_epochs: Number of full passes over the training set.
        batch_size: Mini-batch size used for both the train and test loaders.
        optimizer: Optimizer already bound to the parameters of ``net``.
        net: Model to train; it is updated in place.

    Returns:
        The value returned by ``compute_accuracy`` for the trained model on
        the validation loader (accuracy in percent).
    """
    model = net
    train_loader = torch.utils.data.DataLoader(cifar2, batch_size=batch_size,
                                               shuffle=True, num_workers=4)
    test_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=batch_size,
                                              shuffle=False, num_workers=4)
    # Loss histories are collected for inspection only; they are not returned.
    minibatch_cost = []
    epoch_cost = []
    for epoch in range(num_epochs):
        model.train()
        for features, targets in train_loader:
            # Tensors are moved directly; the old ``Variable`` wrapper is a no-op.
            features = features.to(device)
            targets = targets.to(device)

            ### FORWARD AND BACK PROP
            outputs = model(features)
            cost = criterion(outputs, targets)
            optimizer.zero_grad()
            cost.backward()
            # Store a plain float. Appending the tensor itself would keep each
            # step's autograd graph alive and grow memory with every batch.
            minibatch_cost.append(cost.item())

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

        # compute_loss disables gradient tracking itself.
        model.eval()
        epoch_cost.append(compute_loss(model, train_loader))

    # compute_accuracy disables gradient tracking itself.
    model.eval()
    return compute_accuracy(model, test_loader)

## Train the model with no hyperparameter optimization

In [None]:
class convnet(nn.Module):
    """Fixed-size baseline CNN: two conv/tanh/max-pool stages and a two-layer head.

    ``fc1`` expects 25088 flattened features and ``fc2`` emits two scores.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 3 -> 16 channels.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=3, padding=1)
        self.act1 = nn.Tanh()
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        # Stage 2: 16 -> 8 channels.
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=8, kernel_size=3, padding=1)
        self.act2 = nn.Tanh()
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        # Classifier head.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features=25088, out_features=32)
        self.act3 = nn.Tanh()
        self.fc2 = nn.Linear(in_features=32, out_features=2)

    def forward(self, x):
        """Apply the layers in definition order and return the two output scores."""
        hidden = self.conv1(x)
        hidden = self.act1(hidden)
        hidden = self.pool1(hidden)

        hidden = self.conv2(hidden)
        hidden = self.act2(hidden)
        hidden = self.pool2(hidden)

        hidden = self.flatten(hidden)
        hidden = self.fc1(hidden)
        hidden = self.act3(hidden)
        return self.fc2(hidden)

In [None]:
# Build the baseline network on the selected device and print its layer summary.
model = convnet().to(device)
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 224, 224]             448
              Tanh-2         [-1, 16, 224, 224]               0
         MaxPool2d-3         [-1, 16, 112, 112]               0
            Conv2d-4          [-1, 8, 112, 112]           1,160
              Tanh-5          [-1, 8, 112, 112]               0
         MaxPool2d-6            [-1, 8, 56, 56]               0
           Flatten-7                [-1, 25088]               0
            Linear-8                   [-1, 32]         802,848
              Tanh-9                   [-1, 32]               0
           Linear-10                    [-1, 2]              66
Total params: 804,522
Trainable params: 804,522
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 15.70
Params size (MB): 3.07
Estimated 

In [None]:
# Plain SGD with a hand-picked, fixed learning rate for the baseline run.
learning_rate = 0.01
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# Baseline run with hand-picked settings; the cell displays the accuracy returned by `training`.
batch_size, n_epochs = 8, 5

training(num_epochs=n_epochs, batch_size=batch_size, optimizer=optimizer, net=model)

tensor(90.8000, device='cuda:0')

## Train the model with Optuna (an automatic hyperparameter optimization framework)

In [None]:
def objective(trial):
    """Optuna objective: build, train and score one sampled configuration.

    Sampled hyperparameters: the network widths (inside ``ConvNet``), the
    optimizer class, the learning rate (log scale), the number of epochs and
    the batch size.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy (percent) reported by ``training``, as a Python float.
    """
    # generate the model
    model = ConvNet(trial).to(device)

    # generate the optimizers
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    # The candidate values are discrete choices. Passing them positionally to
    # ``suggest_int(name, low, high, step)`` made the third number a step
    # larger than the range, so only the lowest value was ever sampled.
    n_epochs = trial.suggest_categorical("num_epochs", [3, 5, 7])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    accuracy = training(n_epochs, batch_size, optimizer, model)

    # Hand Optuna a plain float rather than a (possibly CUDA) tensor.
    return float(accuracy)

In [None]:
class ConvNet(nn.Module):
    """CNN whose two convolution widths are sampled from an Optuna trial.

    Layout: two conv/tanh/max-pool stages followed by a two-layer classifier
    that emits two scores. The first linear layer is sized for 224x224 inputs.

    Args:
        trial: Optuna trial; ``num_of_kernel_layer1`` is drawn from [8, 16]
            and ``num_of_kernel_layer2`` from [4, 8].
    """

    def __init__(self, trial):
        super(ConvNet, self).__init__()
        # Sample each width once and reuse it, so the range of a parameter is
        # written in exactly one place.
        n_kernels1 = trial.suggest_int('num_of_kernel_layer1', 8, 16)
        n_kernels2 = trial.suggest_int('num_of_kernel_layer2', 4, 8)
        # Two 2x2 max-pools reduce a 224x224 input to 56x56 = 3136 positions
        # per channel.
        positions_per_channel = 56 * 56

        self.conv1 = nn.Conv2d(3, n_kernels1, kernel_size=3, padding=1)
        self.act1 = nn.Tanh()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(n_kernels1, n_kernels2, kernel_size=3, padding=1)
        self.act2 = nn.Tanh()
        self.pool2 = nn.MaxPool2d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(n_kernels2 * positions_per_channel, 32)
        self.act3 = nn.Tanh()
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = self.pool1(self.act1(self.conv1(x)))
        out = self.pool2(self.act2(self.conv2(out)))
        out = self.flatten(out)
        out = self.act3(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
import optuna

# Maximise the accuracy returned by `objective` over a handful of trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

# TrialState is read from `optuna.trial`; the `optuna.structs` path is deprecated.
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-12-11 15:01:48,789][0m A new study created in memory with name: no-name-ed7a6eb8-0710-4d8d-ac80-48b86db615b3[0m
[32m[I 2020-12-11 15:02:48,941][0m Trial 0 finished with value: 66.1500015258789 and parameters: {'num_of_kernel_layer1': 12, 'num_of_kernel_layer2': 4, 'optimizer': 'Adam', 'lr': 0.025489786685226803, 'num_epochs': 3, 'batch_size': 16}. Best is trial 0 with value: 66.1500015258789.[0m
[32m[I 2020-12-11 15:03:48,249][0m Trial 1 finished with value: 89.35000610351562 and parameters: {'num_of_kernel_layer1': 11, 'num_of_kernel_layer2': 6, 'optimizer': 'SGD', 'lr': 0.004689450782069958, 'num_epochs': 3, 'batch_size': 16}. Best is trial 1 with value: 89.35000610351562.[0m
[32m[I 2020-12-11 15:04:44,196][0m Trial 2 finished with value: 57.45000076293945 and parameters: {'num_of_kernel_layer1': 8, 'num_of_kernel_layer2': 5, 'optimizer': 'RMSprop', 'lr': 0.09910475013534896, 'num_epochs': 3, 'batch_size': 16}. Best is trial 1 with value: 89.35000610351562.[0m

Study statistics: 
  Number of finished trials:  5
  Number of pruned trials:  0
  Number of complete trials:  5
Best trial:
  Value:  89.35000610351562
  Params: 
    num_of_kernel_layer1: 11
    num_of_kernel_layer2: 6
    optimizer: SGD
    lr: 0.004689450782069958
    num_epochs: 3
    batch_size: 16


## Train the model with Optuna using pruning

In [None]:
def objective(trial):
    """Optuna objective with pruning: train one configuration, reporting each epoch.

    After every epoch the validation accuracy is reported to the trial so the
    study's pruner can stop unpromising configurations early.

    Args:
        trial: Optuna trial used to sample hyperparameters and report progress.

    Returns:
        Validation accuracy (percent) after the final epoch, as a Python float.

    Raises:
        optuna.exceptions.TrialPruned: When the pruner asks to stop the trial.
    """
    # generate the model
    model = ConvNet(trial).to(device)

    # generate the optimizers
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-4, 1e-1, log=True)
    # The candidate values are discrete choices. Passing them positionally to
    # ``suggest_int(name, low, high, step)`` made the third number a step
    # larger than the range, so only the lowest value was ever sampled.
    n_epochs = trial.suggest_categorical("num_epochs", [30, 50, 70])
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    train_loader = torch.utils.data.DataLoader(cifar2, batch_size=batch_size,
                                               shuffle=True, num_workers=4)
    test_loader = torch.utils.data.DataLoader(cifar2_val, batch_size=batch_size,
                                              shuffle=False, num_workers=4)

    epoch_acc = None
    for epoch in range(n_epochs):
        model.train()
        for features, targets in train_loader:
            # Tensors are moved directly; the old ``Variable`` wrapper is a no-op.
            features = features.to(device)
            targets = targets.to(device)
            outputs = model(features)
            cost = criterion(outputs, targets)
            optimizer.zero_grad()
            cost.backward()
            optimizer.step()

        # compute_accuracy disables gradient tracking itself.
        model.eval()
        epoch_acc = float(compute_accuracy(model, test_loader))
        trial.report(epoch_acc, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    # The model is unchanged since the last per-epoch evaluation, so that value
    # is the final accuracy; no extra pass over the test set is needed.
    return epoch_acc

In [None]:
class ConvNet(nn.Module):
    """CNN whose two convolution widths are sampled from an Optuna trial.

    NOTE: this re-defines (and shadows) the ``ConvNet`` class declared earlier
    in the notebook with the same architecture.

    Layout: two conv/tanh/max-pool stages followed by a two-layer classifier
    that emits two scores. The first linear layer is sized for 224x224 inputs.

    Args:
        trial: Optuna trial; ``num_of_kernel_layer1`` is drawn from [8, 16]
            and ``num_of_kernel_layer2`` from [4, 8].
    """

    def __init__(self, trial):
        super(ConvNet, self).__init__()
        # Sample each width once and reuse it, so the range of a parameter is
        # written in exactly one place.
        width1 = trial.suggest_int('num_of_kernel_layer1', 8, 16)
        width2 = trial.suggest_int('num_of_kernel_layer2', 4, 8)
        # Two 2x2 max-pools reduce a 224x224 input to 56x56 = 3136 positions
        # per channel.
        positions_per_channel = 56 * 56

        self.conv1 = nn.Conv2d(3, width1, kernel_size=3, padding=1)
        self.act1 = nn.Tanh()
        self.pool1 = nn.MaxPool2d(2)
        self.conv2 = nn.Conv2d(width1, width2, kernel_size=3, padding=1)
        self.act2 = nn.Tanh()
        self.pool2 = nn.MaxPool2d(2)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(width2 * positions_per_channel, 32)
        self.act3 = nn.Tanh()
        self.fc2 = nn.Linear(32, 2)

    def forward(self, x):
        """Return the two class scores for a batch of images ``x``."""
        out = self.pool1(self.act1(self.conv1(x)))
        out = self.pool2(self.act2(self.conv2(out)))
        out = self.flatten(out)
        out = self.act3(self.fc1(out))
        out = self.fc2(out)
        return out

In [None]:
# Same search as before, with a median pruner deciding (from the per-epoch
# values reported by `objective`) whether to stop a trial early.
study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=50)

# TrialState is read from `optuna.trial`; the `optuna.structs` path is deprecated.
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2020-12-11 15:06:50,707][0m A new study created in memory with name: no-name-9da2bc2d-ffdc-4305-a472-55fded3afce4[0m
[32m[I 2020-12-11 15:14:28,441][0m Trial 0 finished with value: 91.60000610351562 and parameters: {'num_of_kernel_layer1': 12, 'num_of_kernel_layer2': 7, 'optimizer': 'Adam', 'lr': 0.0001637388912054928, 'num_epochs': 30, 'batch_size': 16}. Best is trial 0 with value: 91.60000610351562.[0m
[32m[I 2020-12-11 15:22:09,201][0m Trial 1 finished with value: 88.45000457763672 and parameters: {'num_of_kernel_layer1': 12, 'num_of_kernel_layer2': 8, 'optimizer': 'Adam', 'lr': 0.000764044999937247, 'num_epochs': 30, 'batch_size': 16}. Best is trial 0 with value: 91.60000610351562.[0m
[32m[I 2020-12-11 15:29:39,520][0m Trial 2 finished with value: 88.80001068115234 and parameters: {'num_of_kernel_layer1': 13, 'num_of_kernel_layer2': 4, 'optimizer': 'RMSprop', 'lr': 0.0008140351768055297, 'num_epochs': 30, 'batch_size': 16}. Best is trial 0 with value: 91.600006103

Study statistics: 
  Number of finished trials:  50
  Number of pruned trials:  34
  Number of complete trials:  16
Best trial:
  Value:  94.35000610351562
  Params: 
    num_of_kernel_layer1: 15
    num_of_kernel_layer2: 8
    optimizer: Adam
    lr: 0.00021960477910777773
    num_epochs: 30
    batch_size: 16
