# MNIST画像認識のtorch model ハイパーパラメータチューニング
それでは，実際にMNISTの画像認識を通してハイパーパラメータのチューニングを行いたいと思います．

### 実行環境???????
- 

### チューニングを行うハイパーパラメータ
- 畳み込み層の数（3 ~ 7）
- 各畳み込み層のフィルタ数（16, 32, 48, ..., 128）
- 全結合層のユニット数（100, 200, 300, 400, 500）
- 活性化関数（ReLU, ELU）
- 最適化手法（Adam, MomentumSGD, rmsprop）
- 学習率（adam_lr(1e-10 ~ 1e-3), momentum_sgd_lr(1e-5 ~ 1e-1)）
- weight_decay（1e-10 ~ 1e-3）

## パラメータチューニングを楽にする方法
上のハイパラをチューニングすることで良い性能が得られる可能性がある.
しかしハイパラメータのチューニングはネットワークの性質と元データやタスクに関する情報から推測されるものであり、自動化を行わなければ複数結果を比較して最善と思われるものを自身で決定しなければならない. 
その個人的な意志力が介在するのはデータ分析的には好ましくないために自動化する必要がある．
そこで今回はOptunaを採用し、パイパラチューニングを効率的に行う．



## Load Dataset

In [3]:
import datetime
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms
import numpy as np
import random
import ipdb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from optuna.integration.tensorboard import TensorBoardCallback
import optuna
optuna.logging.disable_default_handler()


#set seeds
def torch_fix_seed(seed=42):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True

torch_fix_seed()


BATCHSIZE = 128

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

train_set = MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=BATCHSIZE, shuffle=True, num_workers=2)

test_set = MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_set, batch_size=BATCHSIZE, shuffle=False, num_workers=2)

classes = tuple(np.linspace(0, 9, 10, dtype=np.uint8))


## Define Model

In [4]:
#モデルの定義

#入力画像の高さと幅，畳み込み層のカーネルサイズ
in_height = 28
in_width = 28
kernel = 3

class Net(nn.Module):
  def __init__(self, trial, num_layer, mid_units, num_filters):
    super(Net, self).__init__()
    self.activation = get_activation(trial)
    #第1層
    self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=num_filters[0], kernel_size=3)])
    self.out_height = in_height - kernel +1
    self.out_width = in_width - kernel +1
    #第2層以降
    for i in range(1, num_layer):
      self.convs.append(nn.Conv2d(in_channels=num_filters[i-1], out_channels=num_filters[i], kernel_size=3))
      self.out_height = self.out_height - kernel + 1
      self.out_width = self.out_width - kernel +1
    #pooling層
    self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
    self.out_height = int(self.out_height / 2)
    self.out_width = int(self.out_width / 2)
    #線形層
    self.out_feature = self.out_height * self.out_width * num_filters[num_layer - 1]
    self.fc1 = nn.Linear(in_features=self.out_feature, out_features=mid_units) 
    self.fc2 = nn.Linear(in_features=mid_units, out_features=10)
    
  def forward(self, x):
    for i, l in enumerate(self.convs):
      x = l(x)
      x = self.activation(x)
    x = self.pool(x)
    x = x.view(-1, self.out_feature)
    x = self.fc1(x)
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)

## Train & Test

In [5]:
def train(model, device, train_loader, optimizer):
  model.train()
  for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        
def test(model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
    return 1 - correct / len(test_loader.dataset)


## Tune Optimaisation

In [6]:
def get_optimizer(trial, model):
  optimizer_names = ['Adam', 'MomentumSGD', 'rmsprop']
  optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)
  
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  
  if optimizer_name == optimizer_names[0]: 
    adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)
    optimizer = optim.Adam(model.parameters(), lr=adam_lr, weight_decay=weight_decay)
  elif optimizer_name == optimizer_names[1]:
    momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)
    optimizer = optim.SGD(model.parameters(), lr=momentum_sgd_lr, momentum=0.9, weight_decay=weight_decay)
  else:
    optimizer = optim.RMSprop(model.parameters())
  
  return optimizer

## Tune activate function

In [7]:
def get_activation(trial):
    activation_names = ['ReLU', 'ELU']
    activation_name = trial.suggest_categorical('activation', activation_names)
    
    if activation_name == activation_names[0]:
        activation = F.relu
    else:
        activation = F.elu
    
    return activation


## Set the object function

In [16]:
EPOCH = 10
def objective(trial):
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f'device == {device}')
  
  #畳み込み層の数
  num_layer = trial.suggest_int('num_layer', 3, 7)
  
  #FC層のユニット数
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 500, 100))
  
  #各畳込み層のフィルタ数
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]
  
  model = Net(trial, num_layer, mid_units, num_filters).to(device)
  optimizer = get_optimizer(trial, model)
  
  for step in range(EPOCH):
    train(model, device, train_loader, optimizer)
    error_rate = test(model, device, test_loader)
    print(f'{step}fin | error rate {error_rate}')

  print(f'{trial.number + 1} trial fin')
  return error_rate



## Execution

In [18]:
from optuna.integration.tensorboard import TensorBoardCallback

TRIAL_SIZE = 10
tensorboard_callback = TensorBoardCallback(f"logs/MNIST/{datetime.datetime.now()}/Optuna/", metric_name="error_rate")
study = optuna.create_study()
study.optimize(objective, n_trials=TRIAL_SIZE, callbacks=[tensorboard_callback])
# ipdb.set_trace()

print(study.best_params)
print(study.best_value)


device == cuda


  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 500, 100))
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)


0fin | error rate 0.08819999999999995
1fin | error rate 0.05600000000000005
2fin | error rate 0.037699999999999956
3fin | error rate 0.02729999999999999
4fin | error rate 0.02410000000000001
5fin | error rate 0.020199999999999996
6fin | error rate 0.01849999999999996
7fin | error rate 0.01770000000000005
8fin | error rate 0.016100000000000003
9fin | error rate 0.017299999999999982
1 trial fin
device == cuda
0fin | error rate 0.036599999999999966
1fin | error rate 0.029299999999999993
2fin | error rate 0.024399999999999977
3fin | error rate 0.023700000000000054
4fin | error rate 0.019199999999999995
5fin | error rate 0.024700000000000055
6fin | error rate 0.02310000000000001
7fin | error rate 0.023599999999999954
8fin | error rate 0.028699999999999948
9fin | error rate 0.026499999999999968
2 trial fin
device == cuda


  momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)


0fin | error rate 0.08589999999999998
1fin | error rate 0.06730000000000003
2fin | error rate 0.054200000000000026
3fin | error rate 0.040000000000000036
4fin | error rate 0.026699999999999946
5fin | error rate 0.02070000000000005
6fin | error rate 0.017900000000000027
7fin | error rate 0.017199999999999993
8fin | error rate 0.01859999999999995
9fin | error rate 0.016100000000000003
3 trial fin
device == cuda
0fin | error rate 0.024399999999999977
1fin | error rate 0.014399999999999968
2fin | error rate 0.012700000000000045
3fin | error rate 0.01319999999999999
4fin | error rate 0.011099999999999999
5fin | error rate 0.013499999999999956
6fin | error rate 0.012299999999999978
7fin | error rate 0.01200000000000001
8fin | error rate 0.012499999999999956
9fin | error rate 0.01319999999999999
4 trial fin
device == cuda
0fin | error rate 0.7583
1fin | error rate 0.7958000000000001
2fin | error rate 0.7368
3fin | error rate 0.5684
4fin | error rate 0.18169999999999997
5fin | error rate 0.131

このOptunaを用いたパラメータ探索に関しては ./Optuna_MNIST.py で実装を行ったそのため以下のコマンドを実行すると学習およびパラメータ探索が開始される．
結果はtensorbord上で確認できるため
以下のコマンドを実行する必要がある
```
tensorboard --logdir /root/src/logs/MNIST/<実行した時刻>/Optuna --host 0.0.0.0 --port 6006
```
そしてtensorboardの起動を確認後、左上のタブにおいてHPARAMSという欄がある.この欄を選択すると画面が切り替わる. その後, 画面中央のPARALLEL COORDINATES VIEWを選択することで下記の画像を確認できる. 
このグラフは各ハイパラメータと結果の関係をグラフ化したもので，このグラフから各ハイパラメータがどのように結果に対して寄与しているのかを確認できる

##写真を挿入

以上の写真からわかることは各パラーメータ値とそのパラメータ設定で推論を行った時の精度である．
errorrateと他パラメータの関係として以下のことが挙げられる

- 
- 
- 

以上のことから_________________________ということが言える．


Optunaには最適パラメータを保持する機能があり，その保持パラーメータとそのハイパラで組まれたネットワークのerror rateが出力できる．
この結果より発見されたパラメータは
ーーー
であり，そのネットワークでのerror rateはーーー


調整前よりも性能が上がっていることが確認できる．

|

## コード全文

In [1]:
import datetime
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms
import numpy as np
import random
import ipdb
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from optuna.integration.tensorboard import TensorBoardCallback
import optuna
optuna.logging.disable_default_handler()


#set seeds
def torch_fix_seed(seed=42):
    # Python random
    random.seed(seed)
    # Numpy
    np.random.seed(seed)
    # Pytorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.use_deterministic_algorithms = True

torch_fix_seed()


BATCHSIZE = 128

transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

train_set = MNIST(root='./data', train=True, download=True, transform=transform)
train_loader = DataLoader(train_set, batch_size=BATCHSIZE, shuffle=True, num_workers=2)

test_set = MNIST(root='./data', train=False, download=True, transform=transform)
test_loader = DataLoader(test_set, batch_size=BATCHSIZE, shuffle=False, num_workers=2)

classes = tuple(np.linspace(0, 9, 10, dtype=np.uint8))




#モデルの定義

#入力画像の高さと幅，畳み込み層のカーネルサイズ
in_height = 28
in_width = 28
kernel = 3

class Net(nn.Module):
  def __init__(self, trial, num_layer, mid_units, num_filters):
    super(Net, self).__init__()
    self.activation = get_activation(trial)
    #第1層
    self.convs = nn.ModuleList([nn.Conv2d(in_channels=1, out_channels=num_filters[0], kernel_size=3)])
    self.out_height = in_height - kernel +1
    self.out_width = in_width - kernel +1
    #第2層以降
    for i in range(1, num_layer):
      self.convs.append(nn.Conv2d(in_channels=num_filters[i-1], out_channels=num_filters[i], kernel_size=3))
      self.out_height = self.out_height - kernel + 1
      self.out_width = self.out_width - kernel +1
    #pooling層
    self.pool = nn.AvgPool2d(kernel_size=2, stride=2)
    self.out_height = int(self.out_height / 2)
    self.out_width = int(self.out_width / 2)
    #線形層
    self.out_feature = self.out_height * self.out_width * num_filters[num_layer - 1]
    self.fc1 = nn.Linear(in_features=self.out_feature, out_features=mid_units) 
    self.fc2 = nn.Linear(in_features=mid_units, out_features=10)
    
  def forward(self, x):
    for i, l in enumerate(self.convs):
      x = l(x)
      x = self.activation(x)
    x = self.pool(x)
    x = x.view(-1, self.out_feature)
    x = self.fc1(x)
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)


def train(model, device, train_loader, optimizer):
  model.train()
  for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        
def test(model, device, test_loader):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum().item()
    return 1 - correct / len(test_loader.dataset)


def get_optimizer(trial, model):
  optimizer_names = ['Adam', 'MomentumSGD', 'rmsprop']
  optimizer_name = trial.suggest_categorical('optimizer', optimizer_names)
  
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  
  if optimizer_name == optimizer_names[0]: 
    adam_lr = trial.suggest_loguniform('adam_lr', 1e-5, 1e-1)
    optimizer = optim.Adam(model.parameters(), lr=adam_lr, weight_decay=weight_decay)
  elif optimizer_name == optimizer_names[1]:
    momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)
    optimizer = optim.SGD(model.parameters(), lr=momentum_sgd_lr, momentum=0.9, weight_decay=weight_decay)
  else:
    optimizer = optim.RMSprop(model.parameters())
  
  return optimizer

def get_activation(trial):
    activation_names = ['ReLU', 'ELU']
    activation_name = trial.suggest_categorical('activation', activation_names)
    
    if activation_name == activation_names[0]:
        activation = F.relu
    else:
        activation = F.elu
    
    return activation

EPOCH = 10
def objective(trial):
  device = "cuda" if torch.cuda.is_available() else "cpu"
  print(f'device == {device}')
  
  #畳み込み層の数
  num_layer = trial.suggest_int('num_layer', 3, 7)
  
  #FC層のユニット数
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 500, 100))
  
  #各畳込み層のフィルタ数
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]
  
  model = Net(trial, num_layer, mid_units, num_filters).to(device)
  optimizer = get_optimizer(trial, model)
  
  for step in range(EPOCH):
    train(model, device, train_loader, optimizer)
    error_rate = test(model, device, test_loader)
    print(f'{step}fin | error rate {error_rate}')

  print(f'{trial.number + 1} trial fin')
  return error_rate


TRIAL_SIZE = 10
tensorboard_callback = TensorBoardCallback(f"logs/MNIST/{datetime.datetime.now()}/Optuna/", metric_name="error_rate")
study = optuna.create_study()
study.optimize(objective, n_trials=TRIAL_SIZE, callbacks=[tensorboard_callback])
# ipdb.set_trace()

print(study.best_params)
print(study.best_value)


2023-07-11 19:07:57.941408: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-07-11 19:07:57.967528: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  tensorboard_callback = TensorBoardCallback(f"logs/MNIST/{datetime.datetime.now()}/Optuna/", metric_name="error_rate")
  mid_units = int(trial.suggest_discrete_uniform("mid_units", 100, 500, 100))
  num_filters = [int(trial.suggest_discrete_uniform("num_filter_"+str(i), 16, 128, 16)) for i in range(num_layer)]


device == cuda


  weight_decay = trial.suggest_loguniform('weight_decay', 1e-10, 1e-3)
  momentum_sgd_lr = trial.suggest_loguniform('momentum_sgd_lr', 1e-5, 1e-1)


0fin | error rate 0.03180000000000005
1fin | error rate 0.02090000000000003
2fin | error rate 0.027599999999999958
3fin | error rate 0.014900000000000024
4fin | error rate 0.014900000000000024
5fin | error rate 0.014100000000000001
6fin | error rate 0.013000000000000012
7fin | error rate 0.013299999999999979
8fin | error rate 0.012499999999999956
9fin | error rate 0.0131
1 trial fin
device == cuda


2023-07-11 19:09:28.473729: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-11 19:09:28.474248: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-07-11 19:09:28.474317: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:995] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

0fin | error rate 0.08660000000000001
1fin | error rate 0.07169999999999999
2fin | error rate 0.05489999999999995
3fin | error rate 0.03939999999999999
4fin | error rate 0.03269999999999995
5fin | error rate 0.02739999999999998
6fin | error rate 0.026100000000000012
7fin | error rate 0.02300000000000002
8fin | error rate 0.020399999999999974
9fin | error rate 0.01870000000000005
2 trial fin
device == cuda
0fin | error rate 0.899
1fin | error rate 0.899
2fin | error rate 0.8865
3fin | error rate 0.8865
4fin | error rate 0.8865
5fin | error rate 0.8865
6fin | error rate 0.8865
7fin | error rate 0.899
8fin | error rate 0.8865
9fin | error rate 0.8865
3 trial fin
device == cuda
0fin | error rate 0.9108
1fin | error rate 0.9108
2fin | error rate 0.9117
3fin | error rate 0.906
4fin | error rate 0.8976999999999999
5fin | error rate 0.9028
6fin | error rate 0.9026
7fin | error rate 0.9026
8fin | error rate 0.9026
9fin | error rate 0.9026
4 trial fin
device == cuda
0fin | error rate 0.8972
1fin