# backpropagation

## backwardを実装

In [58]:
import torch
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [59]:
def linear_backward(A, W, b, Z):
    W.grad_ = Z.grad_.T @ A
    b.grad_ = torch.sum(Z.grad_, dim=0)
    A.grad_ = Z.grad_ @ W

def relu_backward(Z, A):
    Z.grad_ = A.grad_ * (Z > 0).float()

# softmax
def softmax(x):
    e_x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
    return e_x / (torch.sum(e_x, dim=-1, keepdim=True) + 1e-10)

# cross_entropy
def cross_entropy(y_true, y_pred):
    return torch.sum(y_true * torch.log(y_pred)) / y_true.shape[0]

# softmax関数とcross_entropyを一つにまとめる
def softmax_cross_entropy(x, y_true):
    e_x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
    softmax_out = e_x / (torch.sum(e_x, dim=-1, keepdim=True) + 1e-10)
    loss = -torch.sum(y_true * torch.log(softmax_out + 1e-10)) / y_true.shape[0]
    return loss, softmax_out
    
    

In [60]:
# model
def linear(X, W, b):
    return X@W.T + b

def relu(Z):
    # .clapm_minの中より小さいのは()で置き換える
    return Z.clamp_min(0.)

def forward_and_backward(X, y):
    # forward
    Z1 = linear(X, W1, b1)
    Z1.retain_grad()
    A1 = relu(Z1)
    A1.retain_grad()
    Z2 = linear(A1, W2, b2)
    Z2.retain_grad()
    
    # 出力層
    loss, A2 = softmax_cross_entorpy(Z2, y)

    # backward
    Z2.grad_ = (A2 - y)/ X.shape[0]
    linear_backward(A1, W2, b2, Z2)
    relu_backward(Z1, A1)
    linear_backward(X, W1, b1, Z1)
    return loss, Z1, A1, Z2, A2
    


## Autogradとスクラッチで実装したbackpropagationが一致することを確かめる

In [61]:
# MNISTデータをロード
dataset = datasets.load_digits()

In [62]:
images = dataset['images']
target = dataset['target']

In [63]:
# 確認
print(images.shape)
print(target.shape)

(1797, 8, 8)
(1797,)


In [64]:
# 前処理と画像の標準化とデータ分割
# データの分割

X_train, X_val, y_train, y_val = train_test_split(images, target, test_size=0.2, random_state=42)


# one-hot
y_train = F.one_hot(torch.tensor(y_train), num_classes = 10)
y_val = F.one_hot(torch.tensor(y_val), num_classes=10)
X_train = torch.tensor(X_train, dtype=torch.float32).reshape(-1, 64)
# import pdb; pdb.set_trace()
X_val  = torch.tensor(X_val, dtype=torch.float32).reshape(-1, 64)

# 標準化
X_mean = X_train.mean()
X_std = X_train.std()
X_train = (X_train - X_mean) / X_std
X_val =  (X_val - X_mean) / X_std



In [65]:
# パラメータの初期化
nh = 30
class_num = 10
m, n = X_train.shape # 入力

# W1 = torch.randn((nh, n), requires_grad=True)
W1 = torch.randn((nh, n ) ) * torch.sqrt(torch.tensor((2./n)))
W1.requires_grad = True
b1 = torch.zeros((1, nh), requires_grad=True)

# W2 = torch.randn((class_num, nh), requires_grad=True)
W2 = torch.randn((class_num, nh ) ) * torch.sqrt(torch.tensor((2./nh)))
W2.requires_grad = True
b2 = torch.zeros((1, class_num), requires_grad=True)

loss, Z1, A1, Z2, A2 = forward_and_backward(X_train, y_train)
loss.backward()

# autogradと等しいことを確認する
print(torch.allclose(W1.grad_, W1.grad))
print(torch.allclose(b1.grad_, b1.grad))
print(torch.allclose(W2.grad_, W2.grad))
print(torch.allclose(b2.grad_, b2.grad))


True
True
True
True


In [66]:
W1.grad_

tensor([[-1.2174e-03, -1.3511e-03,  6.1334e-07,  ..., -2.3137e-04,
         -1.9084e-03, -1.2174e-03],
        [-1.0238e-02, -8.5031e-03,  1.1822e-02,  ..., -7.3752e-03,
         -2.1876e-02, -1.2010e-02],
        [ 5.6885e-03,  8.3885e-03,  1.1831e-02,  ..., -6.6835e-03,
          1.4025e-02,  7.0614e-03],
        ...,
        [-3.5550e-04, -4.0596e-03, -2.1640e-02,  ..., -5.0139e-02,
         -8.0396e-03, -4.6834e-04],
        [-5.9436e-02, -6.1237e-02, -3.8506e-02,  ...,  3.7576e-02,
         -3.2430e-02, -4.8552e-02],
        [-2.0860e-03, -1.1502e-03,  2.5302e-03,  ..., -2.2132e-02,
         -5.9467e-03, -1.8114e-03]], grad_fn=<MmBackward0>)

In [67]:
W1.grad

tensor([[-1.2174e-03, -1.3511e-03,  6.1343e-07,  ..., -2.3137e-04,
         -1.9084e-03, -1.2174e-03],
        [-1.0238e-02, -8.5031e-03,  1.1822e-02,  ..., -7.3752e-03,
         -2.1876e-02, -1.2010e-02],
        [ 5.6885e-03,  8.3885e-03,  1.1831e-02,  ..., -6.6835e-03,
          1.4025e-02,  7.0614e-03],
        ...,
        [-3.5550e-04, -4.0596e-03, -2.1640e-02,  ..., -5.0139e-02,
         -8.0396e-03, -4.6834e-04],
        [-5.9436e-02, -6.1237e-02, -3.8506e-02,  ...,  3.7576e-02,
         -3.2430e-02, -4.8552e-02],
        [-2.0860e-03, -1.1502e-03,  2.5302e-03,  ..., -2.2132e-02,
         -5.9467e-03, -1.8114e-03]])

In [68]:
torch.sum(W1.grad_ == W1.grad)

tensor(759)

In [69]:
Z2.grad_

tensor([[ 1.3169e-06,  4.4577e-07,  6.0444e-05,  ...,  9.8272e-06,
          2.7154e-06,  1.0628e-05],
        [-6.9043e-04,  8.6426e-06,  1.4949e-05,  ...,  7.4142e-05,
          1.2745e-04,  4.6533e-05],
        [-6.9520e-04,  3.0745e-06,  5.9431e-05,  ...,  8.0557e-05,
          1.1413e-05,  6.5011e-05],
        ...,
        [ 2.1566e-05,  5.2735e-06, -6.6634e-04,  ...,  3.8326e-05,
          9.8389e-05,  1.0916e-05],
        [ 1.1258e-06,  8.0442e-06,  2.3198e-05,  ..., -5.2240e-04,
          2.0230e-04,  1.4130e-05],
        [ 4.0691e-06, -6.8526e-04,  8.0865e-05,  ...,  1.3706e-04,
          1.3624e-04,  3.6407e-05]], grad_fn=<DivBackward0>)

In [70]:
Z2.grad

tensor([[ 1.3169e-06,  4.4577e-07,  6.0444e-05,  ...,  9.8272e-06,
          2.7154e-06,  1.0628e-05],
        [-6.9043e-04,  8.6426e-06,  1.4949e-05,  ...,  7.4142e-05,
          1.2745e-04,  4.6533e-05],
        [-6.9520e-04,  3.0745e-06,  5.9431e-05,  ...,  8.0557e-05,
          1.1413e-05,  6.5011e-05],
        ...,
        [ 2.1566e-05,  5.2735e-06, -6.6634e-04,  ...,  3.8326e-05,
          9.8389e-05,  1.0916e-05],
        [ 1.1258e-06,  8.0442e-06,  2.3198e-05,  ..., -5.2240e-04,
          2.0230e-04,  1.4130e-05],
        [ 4.0691e-06, -6.8526e-04,  8.0865e-05,  ...,  1.3706e-04,
          1.3624e-04,  3.6407e-05]])

In [71]:
torch.sum(Z2.grad_ == Z2.grad)

tensor(6904)

In [72]:
# バッチサイズ
learning_rate = 0.03
batch_size = 30
num_batches = np.ceil((len(y_train)) / batch_size).astype(int)

# パラメータの初期化
nh = 30
class_num = 10
m, n = X_train.shape # 入力

# W1 = torch.randn((nh, n), requires_grad=True)
W1 = torch.randn((nh, n ) ) * torch.sqrt(torch.tensor((2./n)))
W1.requires_grad = True
b1 = torch.zeros((1, nh), requires_grad=True)

# W2 = torch.randn((class_num, nh), requires_grad=True)
W2 = torch.randn((class_num, nh ) ) * torch.sqrt(torch.tensor((2./nh)))
W2.requires_grad = True
b2 = torch.zeros((1, class_num), requires_grad=True)
# ログ
train_losses = []
val_losses = []
val_accuracies = []

# 学習率の準備
learning_rate = 0.3
# 各イテレーションの損失を入れる
loss_log = []

epoches = 30
for epoch in range(epoches):
    shuffle_indices = np.random.permutation(len(y_train))
    running_loss = 0
    # それぞれのepochではデータを持ってきて、zを計算損失計算、softmax計算
    for i in range(num_batches):

        # ミニバッチ作成
        start = i * batch_size
        end = start + batch_size
        batch_indices = shuffle_indices[start:end]
        
        # 6. 入力データxおよび教師ラベルyを作成
        y_true_ = y_train[batch_indices, :]  # データ数 x　クラス数
        X = X_train[batch_indices, :] # データ数 x 特徴量
        # import pdb; pdb.set_trace()
        
        # 7. z計算
        # Z = X@W.T + b  # -> MLP
        Z1 = linear(X, W1, b1)
        Z1.retain_grad()
        A1 = relu(Z1)
        A1.retain_grad()
        Z2 = linear(A1, W2, b2)
        Z2.retain_grad()

        loss, A2 = softmax_cross_entorpy(Z2, y_true_)

        # 9. 損失計算
        loss_log.append(loss.item())
        # epochごとのlossを計算する
        running_loss += loss.item()

        # 10. 勾配を計算する
        Z2.grad_ = (A2 - y_true_)/ X.shape[0]
        linear_backward(A1, W2, b2, Z2)
        relu_backward(Z1, A1)
        linear_backward(X, W1, b1, Z1)

        # 11. パラメータ更新
        with torch.no_grad():
            W1 -= learning_rate * W1.grad_  # 自作した.grad_
            W2 -= learning_rate * W2.grad_  # 自作した.grad_
            
            b1 -= learning_rate * b1.grad_
            b2 -= learning_rate * b2.grad_

        # 12. 勾配初期化
            W1.grad_ = None
            W2.grad_ = None
            b1.grad_ = None
            b2.grad_ = None

    # validation
    with torch.no_grad():
        Z1_val = linear(X_val, W1, b1)
        A1_val = relu(Z1_val)
        Z2_val = linear(A1_val, W2, b2)
        val_loss,A2_val = softmax_cross_entropy(Z2_val, y_val)

        # val_loss = cross_entropy(y_val, y_pred_val)

        # accuracy
        val_accuracy = torch.sum(torch.argmax(y_val, dim=-1) == torch.argmax(y_val, dim=-1)) / y_val.shape[0]

    train_losses.append(running_loss/num_batches)
    val_losses.append(val_loss.item())
    val_accuracies.append(val_accuracy.item())

    # 13. 損失logを出力
    print(f'epoch : {epoch+1}: train loss : {running_loss/num_batches}, val loss : {val_loss.item()}, val accuracy {val_accuracy.item()}')



epoch : 1: train loss : 0.6966667892411351, val loss : 0.2333393543958664, val accuracy 1.0
epoch : 2: train loss : 0.20995537509831289, val loss : 0.18444675207138062, val accuracy 1.0
epoch : 3: train loss : 0.12166053111044069, val loss : 0.13323885202407837, val accuracy 1.0
epoch : 4: train loss : 0.10371072012155007, val loss : 0.17466755211353302, val accuracy 1.0
epoch : 5: train loss : 0.07781402996139757, val loss : 0.135255828499794, val accuracy 1.0
epoch : 6: train loss : 0.05418967220854635, val loss : 0.12357524037361145, val accuracy 1.0
epoch : 7: train loss : 0.050140989401067294, val loss : 0.11388607323169708, val accuracy 1.0
epoch : 8: train loss : 0.0428865412104642, val loss : 0.18954601883888245, val accuracy 1.0
epoch : 9: train loss : 0.03508908559645837, val loss : 0.1086653620004654, val accuracy 1.0
epoch : 10: train loss : 0.030749320110771805, val loss : 0.10592357814311981, val accuracy 1.0
epoch : 11: train loss : 0.026157511500059627, val loss : 0.116

### 回帰モデルをスクラッチで実装

In [82]:
def mse(X, y):
    return (X[:0] - y ).pow(2).mean()

def forward_and_backward(X, y):
    # forward
    Z1 = linear(X, W1, b1)
    Z1.retain_grad()
    A1 = relu(Z1)
    A1.retain_grad()
    Z2 = linear(A1, W2, b2)
    Z2.retain_grad()
    
    # 出力層
    loss = mse(Z2, y)

    # backward
    # Z2.grad_ = (A2 - y)/ X.shape[0
    Z2.grad_ = 2 * (Z2 - y.unsqueeze(dim=-1)/ X.shape[0]
    linear_backward(A1, W2, b2, Z2)
    relu_backward(Z1, A1)
    linear_backward(X, W1, b1, Z1)
    return loss, Z1, A1, Z2, A2
    


SyntaxError: invalid syntax (1162775993.py, line 19)

In [90]:
# データの準備
dataset = datasets.load_digits()
X_train, X_val, y_train, y_val = train_test_split(images, target, test_size=0.2, random_state=42)

# one-hot
y_train = F.one_hot(torch.tensor(y_train), num_classes = 10)
y_val = F.one_hot(torch.tensor(y_val), num_classes=10)
X_train = torch.tensor(X_train, dtype=torch.float32).reshape(-1, 64)
# import pdb; pdb.set_trace()
X_val  = torch.tensor(X_val, dtype=torch.float32).reshape(-1, 64)

# 標準化
X_mean = X_train.mean()
X_std = X_train.std()
X_train = (X_train - X_mean) / X_std
X_val =  (X_val - X_mean) / X_std


# 回帰なのでonehotをする必要なし。これを修正する
y_train_reg = torch.argmax(y_train, dim=-1)

def mse(X, y):
    return (X[:, 0] - y ).pow(2).mean()

def forward_and_backward(X, y):
    # forward
    Z1 = linear(X, W1, b1)
    Z1.retain_grad()
    A1 = relu(Z1)
    A1.retain_grad()
    Z2 = linear(A1, W2, b2)
    Z2.retain_grad()
    
    # 出力層
    loss = mse(Z2, y)

    # backward
    Z2.grad_ = 2 * (Z2 - y.unsqueeze(dim=-1))/X.shape[0]
    linear_backward(A1, W2, b2, Z2)
    relu_backward(Z1, A1)
    linear_backward(X, W1, b1, Z1)
    return loss, Z1, A1, Z2, A2
    

# パラメータの初期化
nh = 30
m, n = X_train.shape # 入力

W1 = torch.randn((nh, n), requires_grad=True)
b1 = torch.zeros((1, nh), requires_grad=True)

W2 = torch.randn((1, nh), requires_grad=True)
b2 = torch.zeros((1, 1), requires_grad=True)

loss, Z1, A1, Z2, A2 = forward_and_backward(X_train, y_train_reg)
loss.backward()

# autogradと等しいことを確認する
print(torch.allclose(W1.grad_, W1.grad))
print(torch.allclose(b1.grad_, b1.grad))
print(torch.allclose(W2.grad_, W2.grad))
print(torch.allclose(b2.grad_, b2.grad))



True
True
True
True


## Refactoring

In [122]:
class Linear():
    def __init__(self, in_features, out_features, n):
        self.W = torch.randn((out_features, in_features)) * torch.sqrt(torch.tensor(2.0 / in_features))
        self.W.requires_grad = True
        self.b = torch.zeros((1, out_features), requires_grad=True)

    def forward(self, X):
        self.X = X
        self.Z = X @ self.W.T + self.b
        return self.Z

    def backward(self, Z):
        self.W.grad_ = Z.grad_.T @ self.X
        self.b.grad_ = torch.sum(Z.grad_, dim=0)
        self.X.grad_ = Z.grad_ @ self.W
        return self.X.grad_

class ReLU():
    def forward(self, X):
        self.X = X
        return X.clamp_min(0.)

    def backward(self, A):
        return A.grad_ * (self.X > 0).float()

class SoftmaxCrossEntropy:
    def forward(self, X, y):
        e_x = torch.exp(X - torch.max(X, dim=-1, keepdim=True)[0])
        self.softmax_out = e_x / (torch.sum(e_x, dim=-1, keepdim=True) + 1e-10)

        log_probs = torch.log(self.softmax_out + 1e-10)
        target_log_probs = log_probs * y

        self.loss = -target_log_probs.sum(dim=-1).mean()
        return self.loss
    def backward(self, y):
        return (self.softmax_out - y) / y.shape[0]

class Model:
    def __init__(self, input_features, hidden_units, output_units, data_num):
        self.linear1 = Linear(input_features, hidden_units, data_num)
        self.relu = ReLU()
        self.linear2 = Linear(hidden_units, output_units, data_num)
        self.loss_fn = SoftmaxCrossEntropy()

    def forward(self, X, y):
        self.X = X
        self.Z1 = self.linear1.forward(X)
        self.A1 = self.relu.forward(self.Z1)
        self.Z2 = self.linear2.forward(self.A1)
        self.loss = self.loss_fn.forward(self.Z2, y)
        return self.loss, self.Z2

    def backward(self, y):
        self.Z2.grad_ = self.loss_fn.backward(y)
        self.A1.grad_ = self.linear2.backward(self.Z2)
        self.Z1.grad_ = self.relu.backward(self.A1)
        self.X.grad_ = self.linear1.backward(self.Z1)

    def zero_grad(self):
        # 勾配の初期化
        self.linear1.W.grad_ = None
        self.linear1.b.grad_ = None
        self.linear2.W.grad_ = None
        self.linear2.b.grad_ = None

    def step(self, learning_rate):
        # パラメータの更新
        self.linear1.W -= learning_rate * self.linear1.W.grad_
        self.linear1.b -= learning_rate * self.linear1.b.grad_
        self.linear2.W -= learning_rate * self.linear2.W.grad_
        self.linear2.b -= learning_rate * self.linear2.b.grad_

## Refactoring後の学習ループ(OptimizerやDataset, Dataloaderの後にRefactaring)
# ===データの準備===
dataset = datasets.load_digits()
data = dataset['data']
target = dataset['target']
images = dataset['images']
X_train, X_val, y_train, y_val = train_test_split(images, target, test_size=0.2, random_state=42)
X_mean = X_train.mean()
X_std = X_train.std()

X_train = (X_train - X_mean) / X_std
X_val  = (X_val  - X_mean ) / X_std

X_train = torch.tensor(X_train.reshape(-1, 64), dtype=torch.float32)
X_val = torch.tensor(X_val.reshape(-1, 64), dtype=torch.float32)
y_train = F.one_hot(torch.tensor(y_train), num_classes=10)
y_val = F.one_hot(torch.tensor(y_val), num_classes=10)

batch_size = 30
# モデルの初期化
model = Model(input_features=64, hidden_units=10, output_units=10, data_num = batch_size)

learning_rate = 0.01

# ログ
train_losses = []
val_losses = []
val_accuracies = []

for epoch in range(100):
    # エポックごとにデータをシャッフルする
    shuffled_indices = np.random.permutation(len(y_train))
    num_batches = np.ceil(len(y_train)/batch_size).astype(int)
    running_loss = 0.0

    for i in range(num_batches):
    
        # mini batchの作成
        start = i * batch_size
        end = start + batch_size
    
        batch_indices = shuffled_indices[start:end]
        y_true_ = y_train[batch_indices, :] # batch_size x 10
    
        X = X_train[batch_indices, :] # batc_size x 64
        # 順伝播と逆伝播の計算
        loss, _ = model.forward(X, y_true_)
        model.backward(y_true_)
        running_loss += loss.item()
    
        # パラメータ更新
        with torch.no_grad():
            model.step(learning_rate)
    
        model.zero_grad()

    # validtion
    with torch.no_grad():
        val_loss, Z2_val = model.forward(X_val, y_val)

        val_accuracy = torch.sum(torch.argmax(Z2_val, dim=-1) == torch.argmax(y_val, dim=-1)) / y_val.shape[0]

    train_losses.append(running_loss/num_batches)
    val_losses.append(val_loss.item())
    val_accuracies.append(val_accuracy)
    print(f'epoch : {epoch}: train error : {running_loss/num_batches}, validation error : {val_loss.item()},val accuracy {val_accuracy.item()}')



epoch : 0: train error : 2.416108946005503, validation error : 2.1610004901885986,val accuracy 0.2777777910232544
epoch : 1: train error : 2.0587645545601845, validation error : 1.9678034782409668,val accuracy 0.34166666865348816
epoch : 2: train error : 1.8984086886048317, validation error : 1.8209500312805176,val accuracy 0.3861111104488373
epoch : 3: train error : 1.7664701044559479, validation error : 1.704996109008789,val accuracy 0.3888888955116272
epoch : 4: train error : 1.6615629196166992, validation error : 1.6097471714019775,val accuracy 0.4055555462837219
epoch : 5: train error : 1.5719226474563281, validation error : 1.5283621549606323,val accuracy 0.44999998807907104
epoch : 6: train error : 1.4895704612135887, validation error : 1.4531205892562866,val accuracy 0.46666666865348816
epoch : 7: train error : 1.4120475475986798, validation error : 1.3794163465499878,val accuracy 0.5083333253860474
epoch : 8: train error : 1.3359637074172497, validation error : 1.3100757598876

torch.Size([360, 10])

In [None]:
    e_x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
    return e_x / (torch.sum(e_x, dim=-1, keepdim=True) + 1e-10)

# cross_entropy
def cross_entropy(y_true, y_pred):
    return torch.sum(y_true * torch.log(y_pred)) / y_true.shape[0]

# softmax関数とcross_entropyを一つにまとめる
def softmax_cross_entropy(x, y_true):
    e_x = torch.exp(x - torch.max(x, dim=-1, keepdim=True)[0])
    softmax_out = e_x / (torch.sum(e_x, dim=-1, keepdim=True) + 1e-10)
    loss = -torch.sum(y_true * torch.log(softmax_out + 1e-10)) / y_true.shape[0]
    return loss, softmax_out
    