# backpropagation

## backwardを実装

In [182]:
import torch
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt

import torch.nn.functional as F
from sklearn.model_selection import train_test_split

In [195]:
def linear_backward(A, W, b, Z):
    """Back-propagate through a linear layer using hand-written formulas.

    Reads the upstream gradient from ``Z.grad_`` and stores the resulting
    gradients as ``grad_`` attributes on ``W``, ``b`` and ``A``. Nothing is
    returned.

    Args:
        A: input that was fed to the layer.
        W: weight matrix of the layer.
        b: bias of the layer.
        Z: output of the layer; must already carry ``Z.grad_``.
    """
    upstream = Z.grad_
    W.grad_ = torch.matmul(upstream.T, A)
    # Summing over dim 0 collapses the leading dimension of the upstream gradient.
    b.grad_ = upstream.sum(dim=0)
    A.grad_ = torch.matmul(upstream, W)

def relu_backward(Z, A):
    """Back-propagate through a ReLU by hand.

    Reads the upstream gradient from ``A.grad_`` and stores on ``Z.grad_``
    a copy in which every position where ``Z`` is not strictly positive has
    been multiplied by zero.

    Args:
        Z: pre-activation tensor (the ReLU input).
        A: activation tensor; must already carry ``A.grad_``.
    """
    # 1.0 where the unit was active, 0.0 elsewhere.
    active = Z.gt(0).float()
    Z.grad_ = A.grad_ * active

# Softmax over the last dimension
def softmax(x):
    """Return the softmax of ``x`` computed along its last dimension.

    The maximum along the last dimension is subtracted before exponentiating
    so the exponentials cannot overflow, and 1e-10 is added to the
    normalising sum.
    """
    peak = x.max(dim=-1, keepdim=True).values
    exps = (x - peak).exp()
    normaliser = exps.sum(dim=-1, keepdim=True) + 1e-10
    return exps / normaliser

# Cross-entropy loss
def cross_entropy(y_true, y_pred):
    """Return the cross-entropy of ``y_pred`` against ``y_true``, averaged over rows.

    Computes ``-sum(y_true * log(y_pred + 1e-10)) / y_true.shape[0]``, the
    same formula ``softmax_cross_entorpy`` applies to its softmax output.

    Args:
        y_true: target weights per row (one-hot vectors in this notebook).
        y_pred: predicted probabilities, same shape as ``y_true``.

    Returns:
        A scalar tensor holding the loss.
    """
    # The epsilon keeps log() finite when a predicted probability is exactly
    # 0; without it a zero target times log(0) = -inf yields nan.
    eps = 1e-10
    log_pred = torch.log(y_pred + eps)
    # Leading minus: log-probabilities are <= 0, the loss must be >= 0.
    return -torch.sum(y_true * log_pred) / y_true.shape[0]

# Softmax and cross-entropy combined into a single function
def softmax_cross_entorpy(x, y_true):
    """Apply softmax to ``x`` and compute the cross-entropy against ``y_true``.

    The softmax is taken along the last dimension with the maximum
    subtracted first; 1e-10 is added both to the normalising sum and inside
    the logarithm.

    Args:
        x: logits.
        y_true: target weights with the same shape as ``x`` (one-hot vectors
            in this notebook).

    Returns:
        ``(loss, softmax_out)``: the summed loss divided by
        ``y_true.shape[0]``, and the softmax probabilities.
    """
    shifted = x - x.max(dim=-1, keepdim=True).values
    exps = shifted.exp()
    probs = exps / (exps.sum(dim=-1, keepdim=True) + 1e-10)

    log_probs = (probs + 1e-10).log()
    n_rows = y_true.shape[0]
    loss = -(y_true * log_probs).sum() / n_rows
    return loss, probs
    
    

In [196]:
# model
def linear(X, W, b):
    """Affine layer: multiply ``X`` by the transpose of ``W``, then add ``b``."""
    projected = torch.matmul(X, W.T)
    return projected + b

def relu(Z):
    """Element-wise ReLU: every entry of ``Z`` below 0 is replaced by 0.

    Returns a new tensor; ``Z`` itself is left unchanged.
    """
    # Clamping with only a lower bound leaves entries >= 0 as they are.
    return Z.clamp(min=0.)

def forward_and_backward(X, y):
    """Run the two-layer network forward, then back-propagate by hand.

    Uses the module-level parameters ``W1``, ``b1``, ``W2`` and ``b2``. The
    hand-computed gradients are stored as ``grad_`` attributes on those
    parameters, on the intermediate tensors and on ``X``; autograd's own
    ``.grad`` fields are not written here.

    Args:
        X: input batch.
        y: targets passed to ``softmax_cross_entorpy`` (one-hot vectors in
            this notebook).

    Returns:
        ``(loss, Z1, A1, Z2, A2)``: the loss, the hidden pre-activation, the
        hidden activation, the output logits and the softmax probabilities.
    """
    # Forward pass: linear -> ReLU -> linear.
    Z1 = linear(X, W1, b1)
    A1 = relu(Z1)
    Z2 = linear(A1, W2, b2)

    # Ask autograd to keep .grad on these non-leaf tensors so it can be
    # compared with the manual grad_ values after loss.backward().
    for intermediate in (Z1, A1, Z2):
        intermediate.retain_grad()

    # Output layer: softmax fused with the cross-entropy loss.
    loss, A2 = softmax_cross_entorpy(Z2, y)

    # Backward pass, starting from the gradient of the loss w.r.t. the logits.
    batch_size = X.shape[0]
    Z2.grad_ = (A2 - y) / batch_size
    linear_backward(A1, W2, b2, Z2)
    relu_backward(Z1, A1)
    linear_backward(X, W1, b1, Z1)

    return loss, Z1, A1, Z2, A2
    


## Autogradとスクラッチで実装したbackpropagationが一致することを確かめる

In [197]:
# Load the digits dataset bundled with scikit-learn (8x8 digit images, not the full MNIST set)
dataset = datasets.load_digits()

In [198]:
# Pull the image array and the label vector out of the loaded dataset
images, target = dataset['images'], dataset['target']

In [199]:
# Sanity check: print the shape of the images, then the shape of the labels
print(images.shape, target.shape, sep="\n")

(1797, 8, 8)
(1797,)


In [200]:
# Preprocessing: split the data, encode the labels, standardize the images

# Hold out 20% of the samples for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    images, target, test_size=0.2, random_state=42
)

# Labels -> one-hot vectors over the 10 classes
y_train, y_test = (
    F.one_hot(torch.tensor(labels), num_classes=10)
    for labels in (y_train, y_test)
)

# Images -> float32 tensors flattened to 64 features per sample
X_train, X_test = (
    torch.tensor(pixels, dtype=torch.float32).reshape(-1, 64)
    for pixels in (X_train, X_test)
)

# Standardize both splits with the scalar mean / std taken over every
# element of the training set
X_mean, X_std = X_train.mean(), X_train.std()
X_train, X_test = ((split - X_mean) / X_std for split in (X_train, X_test))



In [201]:
# Parameter initialization
nh = 30  # number of hidden units
class_num = 10  # number of output classes
m, n = X_train.shape  # rows and columns of the training matrix

# Weights are standard-normal draws scaled by sqrt(2 / fan_in)
# (He initialization); biases start at zero.
W1 = torch.randn((nh, n)) * torch.tensor(2. / n).sqrt()
W1.requires_grad_()
b1 = torch.zeros((1, nh), requires_grad=True)

W2 = torch.randn((class_num, nh)) * torch.tensor(2. / nh).sqrt()
W2.requires_grad_()
b2 = torch.zeros((1, class_num), requires_grad=True)

# Manual backward pass first, then autograd on the same loss
loss, Z1, A1, Z2, A2 = forward_and_backward(X_train, y_train)
loss.backward()

# Confirm the hand-written gradients agree with autograd's, one line per
# parameter in the order W1, b1, W2, b2
print(
    *(torch.allclose(p.grad_, p.grad) for p in (W1, b1, W2, b2)),
    sep="\n",
)


True
True
True
True


In [202]:
# Display the hand-computed gradient stored on W1
W1.grad_

tensor([[-0.0099, -0.0087,  0.0038,  ...,  0.0252,  0.0110, -0.0019],
        [-0.0284, -0.0241,  0.0220,  ..., -0.0099, -0.0318, -0.0302],
        [-0.0336, -0.0374, -0.0598,  ...,  0.0433, -0.0011, -0.0251],
        ...,
        [-0.0999, -0.0988, -0.0558,  ...,  0.1000, -0.0427, -0.0972],
        [-0.0093, -0.0086, -0.0055,  ...,  0.0215,  0.0052, -0.0091],
        [ 0.0069,  0.0066,  0.0066,  ..., -0.0181,  0.0043,  0.0082]],
       grad_fn=<MmBackward0>)

In [None]:
# Display the gradient autograd stored on W1
W1.grad

In [None]:
# Count the entries where the manual and autograd gradients of W1 are exactly equal
torch.sum(W1.grad_ == W1.grad)

In [None]:
# Display the hand-computed gradient stored on the output logits Z2
Z2.grad_

In [None]:
# Display the gradient autograd kept on Z2 (available because retain_grad() was called on it)
Z2.grad

In [None]:
# Count the entries where the manual and autograd gradients of Z2 are exactly equal
torch.sum(Z2.grad_ == Z2.grad)