## 初めに
今回は課題指示書の「ハイパラメータチューニングに慣れる」という目的ベースで課題とは異なる順序で課題をまとめた.


具体的には各要素は守りつつハイパラメータ調整ライブラリoptunaに重点を置いたものとなっている．


このnotebookはCIFAR10を対象にしたCNN構築を一貫して行っている．


In [11]:
!pip install optuna
!pip install ipywidgets
!pip install pytorch_lightning


[0m

In [13]:
# NumPy、Matplotlib、PyTorchをインポートする
import numpy as np
import matplotlib.pyplot as plt
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



BATCHSIZE = 4


transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=BATCHSIZE, shuffle=True, num_workers=2)

test_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_set, batch_size=BATCHSIZE, shuffle=False, num_workers=2)


classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import optuna
optuna.logging.disable_default_handler()


#モデルの定義

#入力画像の高さと幅，畳み込み層のカーネルサイズ
in_height = 32
in_width = 32
kernel = 5
class Net(nn.Module):
  def __init__(self, trial, num_layer, mid_units, num_filters):
    super(Net, self).__init__()
    self.activation = get_activation(trial)
    #第1層
    self.convs = nn.ModuleList([nn.Conv2d(in_channels=3, out_channels=num_filters[0], kernel_size=5)])
    self.out_height = in_height - kernel +1
    self.out_width = in_width - kernel +1
    #第2層以降
    for i in range(1, num_layer):
      self.convs.append(nn.Conv2d(in_channels=num_filters[i-1], out_channels=num_filters[i], kernel_size=5))
      self.out_height = self.out_height - kernel + 1
      self.out_width = self.out_width - kernel +1
    #pooling層
    self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
    self.out_height = int(self.out_height / 2)
    self.out_width = int(self.out_width / 2)
    #線形層
    self.out_feature = self.out_height * self.out_width * num_filters[num_layer - 1]
    self.fc1 = nn.Linear(in_features=self.out_feature, out_features=mid_units)
    self.fc2 = nn.Linear(in_features=mid_units, out_features=10)

  def forward(self, x):
    for i, l in enumerate(self.convs):
      x = l(x)
      x = self.activation(x)
    x = self.pool(x)
    x = x.view(-1, self.out_feature)
    x = self.fc1(x)
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)

In [15]:
def train(model, device, train_loader, optimizer):
  model.train()
  for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()

def test(model, device, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 1 - correct / len(test_loader.dataset)

In [16]:
import torch.optim as optim

def get_optimizer(trial, model):
  optimizer_names = ['Adam', 'MomentumSGD', 'rmsprop']
  optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)

  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)

  if optimizer_name == optimizer_names[0]:
    adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)
    optimizer = optim.Adam(model.parameters(), lr=adam_lr, weight_decay=weight_decay)
  elif optimizer_name == optimizer_names[1]:
    momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)
    optimizer = optim.SGD(model.parameters(), lr=momentum_sgd_lr, momentum=0.9, weight_decay=weight_decay)
  else:
    optimizer = optim.RMSprop(model.parameters())

  return optimizer

In [17]:
def get_activation(trial):
    activation_names = ['ReLU', 'ELU']
    activation_name = trial.suggest_categorical('activation', activation_names)

    if activation_name == activation_names[0]:
        activation = F.relu
    else:
        activation = F.elu

    return activation

In [18]:
EPOCH = 10
def objective(trial):
  device = "cuda" if torch.cuda.is_available() else "cpu"

  #畳み込み層の数
  num_layer = trial.suggest_int('num_layer', 3, 7)

  #FC層のユニット数
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 10, 300, 10))

  #各畳込み層のフィルタ数
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]

  model = Net(trial, num_layer, mid_units, num_filters).to(device)
  optimizer = get_optimizer(trial, model)

  for step in range(EPOCH):
    train(model, device, train_loader, optimizer)
    error_rate = test(model, device, test_loader)
    print(f'{step}fin | error rate {error_rate}')
  print(f'{trial.number} trial fin')
  return error_rate

In [19]:

from optuna.integration.tensorboard import TensorBoardCallback

TRIAL_SIZE = 50
tensorboard_callback = TensorBoardCallback("logs/CIFAR10/", metric_name="accuracy")
study = optuna.create_study()
study.optimize(objective, n_trials=TRIAL_SIZE, callbacks=[tensorboard_callback])

  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 500, 100))
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)


0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
0 trial fin
0fin | error rate 0.5567
1fin | error rate 0.47629999999999995
2fin | error rate 0.47719999999999996
3fin | error rate 0.44989999999999997
4fin | error rate 0.43820000000000003
5fin | error rate 0.40559999999999996
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
1 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
2 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
3

  adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)


0fin | error rate 0.49129999999999996
1fin | error rate 0.4504
2fin | error rate 0.4142
3fin | error rate 0.41800000000000004
4fin | error rate 0.4284
5fin | error rate 0.42479999999999996
6fin | error rate 0.4202
7fin | error rate 0.4276
8fin | error rate 0.39470000000000005
9fin | error rate 0.39959999999999996
4 trial fin
0fin | error rate 0.5828
1fin | error rate 0.5084
2fin | error rate 0.4598
3fin | error rate 0.43200000000000005
4fin | error rate 0.4074
5fin | error rate 0.3881
6fin | error rate 0.37439999999999996
7fin | error rate 0.3721
8fin | error rate 0.3688
9fin | error rate 0.3649
5 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
6 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9

## まとめ
以上の実行中のハイパラメータは指定範囲の中から選ばれたものを採用しているだけのため, 精度があまり芳しくない.

## 結果
ここから以上で設定したハイパラは以下のものが行った思考の中で最適であると判断された．

In [20]:
study.best_params

{'num_layer': 5,
 'mid_units': 100.0,
 'num_filter_0': 80.0,
 'num_filter_1': 112.0,
 'num_filter_2': 48.0,
 'num_filter_3': 48.0,
 'num_filter_4': 80.0,
 'activation': 'ReLU',
 'optimizer': 'Adam',
 'weight_decay': 1.0280645412034008e-10,
 'adam_lr': 7.417964919007603e-05}

In [21]:
study.best_value

0.25770000000000004

## 最適パラメータ結果との比較
精度としてハイパラメータ最適化を行ったネットワークは性能が高く，チューニングを行っていないネットワークに関しては非常に悪い結果となっていることがわかる．


このパラメータ指定を行う際にはこのディレクトリ内に存在する.py を実行することで特定パラメータにおける性能評価を実現できる．

このOptunaを用いたパラメータ探索に関しては ./Optuna_CIFAR10.py で実装を行ったそのため以下のコマンドを実行すると学習およびパラメータ探索が開始される．
上の行為は時間がかかるのでこちらも筆者が実行した結果をtesorboardで確認する．

以下の写真からわかることは各パラーメータ値とそのパラメータ設定で推論を行った時の精度である．

errorrateと他パラメータの関係として以下のことが挙げられる


- 
- 
- 

以上のことから_________________________ということが言える．

Optunaには最適パラメータを保持する機能があり，その保持パラーメータとそのハイパラで組まれたネットワークのerror rateが出力できる．


この結果より発見されたパラメータは


ーーー


であり，そのネットワークでのerror rateはーーー


調整前よりも性能が上がっていることが確認できる．

## パラメータ指定を行う場合

パラメータ保存を行わない場合

In [2]:
# NumPy、Matplotlib、PyTorchをインポートする
import datetime
import numpy as np
import matplotlib.pyplot as plt
import ipdb
import random
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from optuna.integration.tensorboard import TensorBoardCallback


import optuna
optuna.logging.disable_default_handler()


#set seeds
def torch_fix_seed(seed=42):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True


torch_fix_seed()


#入力画像の高さと幅，畳み込み層のカーネルサイズ
in_height = 32
in_width = 32
kernel = 5
BATCHSIZE = 4

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=BATCHSIZE, shuffle=True, num_workers=2)

test_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_set, batch_size=BATCHSIZE, shuffle=False, num_workers=2)


classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')



class Net(nn.Module):
  def __init__(self, trial, num_layer, mid_units, num_filters):
    super(Net, self).__init__()
    self.activation = get_activation(trial)
    #第1層
    self.convs = nn.ModuleList([nn.Conv2d(in_channels=3, out_channels=num_filters[0], kernel_size=5)])
    self.out_height = in_height - kernel +1
    self.out_width = in_width - kernel +1
    #第2層以降
    for i in range(1, num_layer):
      self.convs.append(nn.Conv2d(in_channels=num_filters[i-1], out_channels=num_filters[i], kernel_size=5))
      self.out_height = self.out_height - kernel + 1
      self.out_width = self.out_width - kernel +1
    #pooling層
    self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
    self.out_height = int(self.out_height / 2)
    self.out_width = int(self.out_width / 2)
    #線形層
    self.out_feature = self.out_height * self.out_width * num_filters[num_layer - 1]
    self.fc1 = nn.Linear(in_features=self.out_feature, out_features=mid_units)
    self.fc2 = nn.Linear(in_features=mid_units, out_features=10)

  def forward(self, x):
    for i, l in enumerate(self.convs):
      x = l(x)
      x = self.activation(x)
    x = self.pool(x)
    x = x.view(-1, self.out_feature)
    x = self.fc1(x)
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)

def train(model, device, train_loader, optimizer):
  model.train()
  loss_corect = 0
  for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()  
        loss_corect+=loss
  return loss_corect / len(train_loader)

def test(model, device, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data in test_loader:
            inputs, labels = data[0].to(device), data[1].to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 1 - correct / len(test_loader.dataset)

def get_optimizer(trial, model):
  optimizer_names = ['Adam', 'MomentumSGD', 'rmsprop']
  optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)

  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)

  if optimizer_name == optimizer_names[0]:
    adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)
    optimizer = optim.Adam(model.parameters(), lr=adam_lr, weight_decay=weight_decay)
  elif optimizer_name == optimizer_names[1]:
    momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)
    optimizer = optim.SGD(model.parameters(), lr=momentum_sgd_lr, momentum=0.9, weight_decay=weight_decay)
  else:
    optimizer = optim.RMSprop(model.parameters())

  return optimizer

def get_activation(trial):
    activation_names = ['ReLU', 'ELU']
    activation_name = trial.suggest_categorical('activation', activation_names)

    if activation_name == activation_names[0]:
        activation = F.relu
    else:
        activation = F.elu

    return activation

def objective(trial):
  # writer = SummaryWriter(log_dir=f"logs/CIFAR10/{datetime.datetime.now()}/learning/trial_{trial.number}/")
  EPOCH = 10
  device = "cuda" if torch.cuda.is_available() else "cpu"

  #畳み込み層の数
  num_layer = trial.suggest_int('num_layer', 3, 7)

  #FC層のユニット数
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 300, 10))

  #各畳込み層のフィルタ数
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]

  model = Net(trial, num_layer, mid_units, num_filters).to(device)
  optimizer = get_optimizer(trial, model)

  for step in range(EPOCH):
    loss=train(model, device, train_loader, optimizer)
    error_rate = test(model, device, test_loader)
    # writer.add_scalar("loss", loss, step)  
    # writer.add_scalar("accuracy", error_rate, step)  
    print(f'{step}fin | error rate {error_rate}')

  print(f'{trial.number} trial fin')
  return error_rate

TRIAL_SIZE = 10
tensorboard_callback = TensorBoardCallback(f"logs/CIFAR10/{datetime.datetime.now()}/Optuna/", metric_name="error_rate")
study = optuna.create_study()
study.optimize(objective, n_trials=TRIAL_SIZE, callbacks=[tensorboard_callback])

print(study.best_params)
print(study.best_value)

best_params_result = study.best_params

#output
#{'num_layer': 4, 'mid_units': 140.0, 'num_filter_0': 128.0, 'num_filter_1': 112.0, 'num_filter_2': 112.0, 'num_filter_3': 112.0, 'activation': 'ReLU', 'optimizer': 'MomentumSGD', 'weight_decay': 5.2182135446336915e-08, 'momentum_sgd_lr': 0.0004955865902351846}
#0.2519


Files already downloaded and verified
Files already downloaded and verified


  tensorboard_callback = TensorBoardCallback(f"logs/CIFAR10/{datetime.datetime.now()}/Optuna/", metric_name="error_rate")
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 300, 10))
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)


0fin | error rate 0.5188999999999999
1fin | error rate 0.4104
2fin | error rate 0.37570000000000003
3fin | error rate 0.3316
4fin | error rate 0.33089999999999997
5fin | error rate 0.3075
6fin | error rate 0.33240000000000003
7fin | error rate 0.31989999999999996
8fin | error rate 0.3173
9fin | error rate 0.3183
0 trial fin


2023-07-11 19:57:48.118020: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-11 19:57:48.118547: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-11 19:57:48.118614: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin | error rate 0.9
6fin | error rate 0.9
7fin | error rate 0.9
8fin | error rate 0.9
9fin | error rate 0.9
1 trial fin


  momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)


0fin | error rate 0.8513999999999999
1fin | error rate 0.8061
2fin | error rate 0.7518
3fin | error rate 0.7156
4fin | error rate 0.6796
5fin | error rate 0.6667000000000001
6fin | error rate 0.6517999999999999
7fin | error rate 0.6427
8fin | error rate 0.627
9fin | error rate 0.6041000000000001
2 trial fin
0fin | error rate 0.8359
1fin | error rate 0.7533
2fin | error rate 0.7089
3fin | error rate 0.679
4fin | error rate 0.655
5fin | error rate 0.6426000000000001
6fin | error rate 0.6304000000000001
7fin | error rate 0.6201
8fin | error rate 0.6109
9fin | error rate 0.6066
3 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.8543000000000001
3fin | error rate 0.837
4fin | error rate 0.7229
5fin | error rate 0.6458999999999999
6fin | error rate 0.5924
7fin | error rate 0.5397000000000001
8fin | error rate 0.5122
9fin | error rate 0.4737
4 trial fin
0fin | error rate 0.9
1fin | error rate 0.9
2fin | error rate 0.9
3fin | error rate 0.9
4fin | error rate 0.9
5fin |