# Optimizer

## スクラッチで実装

In [1]:
from torch import nn
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.nn import functional as F
from sklearn import datasets
from sklearn.model_selection import train_test_split

In [2]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        num_in: Number of input features.
        num_hidden: Number of units in the hidden layer.
        num_out: Number of output features.
    """

    def __init__(self, num_in, num_hidden, num_out):
        # Initialize nn.Module first so the layers below are registered as submodules.
        super().__init__()
        self.l1 = nn.Linear(num_in, num_hidden)   # hidden fully connected layer
        self.l2 = nn.Linear(num_hidden, num_out)  # output fully connected layer

    def forward(self, x):
        """Forward pass: no activation is applied after the output layer."""
        hidden = F.relu(self.l1(x))
        return self.l2(hidden)

In [3]:
# Build an MLP with 64 inputs, 30 hidden units and 10 outputs.
model = MLP(num_in=64, num_hidden=30, num_out=10)
# parameters() returns a generator, which is what the cell output displays.
model.parameters()

<generator object Module.parameters at 0x7ffefee9e820>

In [4]:
class Optimizer():
    """Minimal gradient-descent optimizer written from scratch.

    Args:
        parameters: Iterable of tensors to update (e.g. ``model.parameters()``).
            It is materialized into a list because a generator could only be
            iterated once, while ``step`` and ``zero_grad`` run many times.
        lr: Learning rate that scales each gradient in ``step``.
    """

    def __init__(self, parameters, lr=0.03):
        self.parameters = list(parameters)
        self.lr = lr

    def step(self):
        """Update every parameter in place with ``param -= lr * param.grad``.

        Parameters that have no gradient yet are left untouched.
        """
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for param in self.parameters:
                if param.grad is None:
                    continue
                param -= self.lr * param.grad

    def zero_grad(self):
        """Reset every existing gradient to zero in place."""
        for param in self.parameters:
            # Without this check, zero_() would be called on a gradient that
            # has not been created yet.
            if param.grad is not None:
                param.grad.zero_()
                



In [5]:
class Optimizer():
    """Plain gradient-descent optimizer over a fixed list of parameters.

    Args:
        parameters: Iterable of tensors to optimize; stored as a list so it
            can be traversed on every call.
        lr: Learning rate applied to each gradient.
    """

    def __init__(self, parameters, lr=0.03):
        self.parameters = list(parameters)
        self.lr = lr

    def step(self):
        """Subtract ``lr * grad`` from each parameter in place.

        Parameters whose ``grad`` is ``None`` are skipped, mirroring the guard
        in ``zero_grad``.
        """
        # The update is done under no_grad so autograd does not track it.
        with torch.no_grad():
            for param in self.parameters:
                if param.grad is not None:
                    param -= self.lr * param.grad

    def zero_grad(self):
        """Zero every gradient that currently exists, in place."""
        for param in self.parameters:
            if param.grad is not None:
                param.grad.zero_()

# # Usage
# opt = Optimizer(model.parameters(), lr)  # Pass model.parameters() here


In [6]:
# Create a fresh MLP with 64 inputs, 30 hidden units and 10 outputs.
model = MLP(num_in=64, num_hidden=30, num_out=10)
# parameters() returns a generator, which is what the cell output displays.
model.parameters()

<generator object Module.parameters at 0x7ffefee9e900>

In [7]:
# Learning rate handed to the hand-written optimizer.
learning_rate = 0.01
opt = Optimizer(model.parameters(), lr=learning_rate)

In [11]:
# nnクラスとF.関数を組み合わせて作成する
# # MLP(親のクラス(継承する))
# class MLP(nn.Module):
    
#     def __init__(self, num_in, num_hidden, num_out):
#         # 親のclassの継承
#         super().__init__()
#         self.l1 = nn.Linear(num_in, num_hidden)
#         self.l2 = nn.Linear(num_hidden, num_out) # 出力層の全結合層

#     # forwardの処理(順伝播の処理)
#     def forward(self, x):
#         x = self.l2(F.relu(self.l1(x)))
#         return x

# class Optimizer():

#     def __init__(self, parameters, lr=0.03):
#         self.parameters = list(parameters)
#         self.lr = lr

#     def step(self):
#         with torch.no_grad():
#             for param in self.parameters:
#                 param -= self.lr * param.grad

#     def zero_grad(self):
#         for param in self.parameters:
#             if param.grad is not None:  # if文がないとgradが作られていないのに呼ばれてしまう
#                 param.grad.zero_()

## Training loop after refactoring (refactored once Optimizer, Dataset and DataLoader were introduced)
# === Data preparation ===
dataset = datasets.load_digits()
data, target, images = dataset['data'], dataset['target'], dataset['images']

X_train, X_val, y_train, y_val = train_test_split(
    images, target, test_size=0.2, random_state=42
)

# Standardize both splits with statistics computed on the training split only.
X_mean, X_std = X_train.mean(), X_train.std()
X_train = (X_train - X_mean) / X_std
X_val = (X_val - X_mean) / X_std

# Flatten each image to 64 features and convert to float32 tensors.
X_train = torch.tensor(X_train.reshape(-1, 64), dtype=torch.float32)
X_val = torch.tensor(X_val.reshape(-1, 64), dtype=torch.float32)

# Keep the labels as class indices (no one-hot encoding).
y_train, y_val = torch.tensor(y_train), torch.tensor(y_val)

# Hyperparameters
learning_rate = 0.01
batch_size = 30
num_in, num_hidden, num_out = 64, 30, 10
# Round up so the last, possibly smaller, batch is included.
num_batches = np.ceil(len(y_train) / batch_size).astype(int)

# Model initialization
# `model` and `opt` are not created in this cell; the loop uses whatever objects
# already exist under those names. Uncomment to restart training from scratch.
# model = MLP(num_in, num_hidden, num_out)
# opt = Optimizer(parameters = model.parameters(), lr=learning_rate)

# Logs (one plain Python float per epoch)
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(100):
    # Reshuffle the training samples every epoch
    shuffled_indices = np.random.permutation(len(y_train))

    running_loss = 0.0

    for i in range(num_batches):

        # Build the mini-batch
        start = i * batch_size
        end = start + batch_size

        batch_indices = shuffled_indices[start:end]
        y = y_train[batch_indices]  # class indices, one per sample (not one-hot)

        X = X_train[batch_indices, :]  # batch_size x 64

        # Clear gradients before the backward pass so that stale gradients
        # (e.g. left over from an interrupted run) never leak into this step.
        opt.zero_grad()

        # Forward and backward passes
        preds = model(X)
        loss = F.cross_entropy(preds, y)
        loss.backward()
        running_loss += loss.item()

        # Parameter update, delegated to the optimizer. This replaces the manual
        # `param -= learning_rate * param.grad` loop under torch.no_grad().
        opt.step()

    # Validation
    with torch.no_grad():
        preds_val = model(X_val)
        val_loss = F.cross_entropy(preds_val, y_val)

        val_accuracy = torch.sum(torch.argmax(preds_val, dim=-1) == y_val) / y_val.shape[0]

    train_losses.append(running_loss/num_batches)
    val_losses.append(val_loss.item())
    # Store a float, like the other logs, instead of a 0-dim tensor.
    val_accuracies.append(val_accuracy.item())
    print(f'epoch : {epoch}: train error : {running_loss/num_batches}, validation error : {val_loss.item()},val accuracy {val_accuracy.item()}')



epoch : 0: train error : 0.08561067162857701, validation error : 0.11765244603157043,val accuracy 0.9638888835906982
epoch : 1: train error : 0.08494534677204986, validation error : 0.11746018379926682,val accuracy 0.9638888835906982
epoch : 2: train error : 0.08418877442212154, validation error : 0.11693070828914642,val accuracy 0.9611111283302307
epoch : 3: train error : 0.0833802418395256, validation error : 0.11633049696683884,val accuracy 0.9694444537162781
epoch : 4: train error : 0.0826346522080712, validation error : 0.11611621826887131,val accuracy 0.9638888835906982
epoch : 5: train error : 0.08240169077180326, validation error : 0.11554761230945587,val accuracy 0.9611111283302307
epoch : 6: train error : 0.08171700371894985, validation error : 0.1158042624592781,val accuracy 0.9638888835906982
epoch : 7: train error : 0.08074743023219828, validation error : 0.11580435186624527,val accuracy 0.9583333134651184
epoch : 8: train error : 0.08011438868318994, validation error : 0.

## torch.optim

In [10]:
from torch import optim

# torch's built-in SGD optimizer, built from the same parameters and learning rate.
opt = optim.SGD(model.parameters(), lr=learning_rate)