# 오차역전파법

## 단순한 계층 구현하기

In [10]:
import numpy as np
import pandas as pd

In [3]:
class MulLayer:
    """Multiplication node of a computational graph.

    The two forward inputs are cached on the instance so that the backward
    pass can reuse them.
    """

    def __init__(self):
        # Inputs of the most recent forward call; None until forward runs.
        self.x = self.y = None

    def forward(self, x, y):
        """Remember both inputs and return their product."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Return ``(dx, dy)`` for the upstream gradient ``dout``.

        Each input's gradient is ``dout`` multiplied by the *other* cached
        input.
        """
        return dout * self.y, dout * self.x

In [16]:
# Inputs to the graph: apple (100), apple_num (2) and a tax multiplier (1.1).
apple = 100
apple_num =2
tax=1.1

# Layers: one multiplication node per product in the graph.
mul_apple_layer = MulLayer()
mul_tax_layer = MulLayer()

# Forward pass: apple * apple_num first, then the result * tax.
apple_price = mul_apple_layer.forward(apple, apple_num)
price = mul_tax_layer.forward(apple_price, tax)

print(price)

220.00000000000003


In [14]:
# Backward pass: seed with d(price)/d(price) = 1 and call the layers in the
# reverse order of the forward pass.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
dapple, dapple_num = mul_apple_layer.backward(dapple_price)

# dapple_num is rounded for display only; the variable itself is unchanged.
print(dapple, np.round(dapple_num), dtax)

2.2 110.0 200


## 활성화 함수 계층 구현하기

In [19]:
class Relu:
    """ReLU activation layer: keeps positive inputs and zeroes the rest."""

    def __init__(self):
        # Boolean array marking where the last forward input was <= 0;
        # None until forward has been called.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element ``<= 0`` set to 0.

        The positions that were zeroed are remembered in ``self.mask`` for
        the backward pass. ``x`` itself is not modified.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the forward input.

        The upstream gradient ``dout`` passes through unchanged where the
        forward input was positive and is set to 0 elsewhere.
        """
        # Mask a copy so the caller's array is left untouched, mirroring
        # how forward treats its input.
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

In [21]:
# Sample 2x2 input containing both positive and negative entries.
x = np.array([[1.0, -0.5],[-2.0, 3.0]])
print(x)

[[ 1.  -0.5]
 [-2.   3. ]]


In [22]:
# Boolean array that is True wherever x is zero or negative.
mask = (x<=0)
print(mask)

[[False  True]
 [ True False]]


In [29]:
# Apply the mask to a copy so x keeps its original values.
out = x.copy()
out[mask] = 0
out

array([[1., 0.],
       [0., 3.]])

In [23]:
# Run the same input through the Relu layer's forward pass.
rl = Relu()
forw = rl.forward(x)

In [24]:
# Display the forward result.
forw

array([[1., 0.],
       [0., 3.]])

In [28]:
# Feed the forward output back in as the upstream gradient; positions where
# x <= 0 come back as 0.
rl.backward(forw)

array([[1., 0.],
       [0., 3.]])

## sigmoid 계층

In [1]:
class sigmoid:
    """Sigmoid activation layer.

    The forward output is cached on the instance because the backward pass
    is expressed entirely in terms of it.
    """

    def __init__(self):
        # Output of the most recent forward call; None until forward runs.
        self.out = None

    def forward(self, x):
        """Compute ``1 / (1 + exp(-x))``, cache it and return it."""
        self.out = 1/(1+np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - out) * out`` using the cached output."""
        return dout*(1.0-self.out)*self.out

## Affine/Softmax 계층

In [3]:
# Random input, weight and bias arrays used only to inspect their shapes.
x = np.random.rand(2)
w = np.random.rand(2,3)
b = np.random.rand(3)

# Display the three shapes together.
x.shape, w.shape, b.shape

((2,), (2, 3), (3,))

In [10]:
# y = np.dot(X,W) + B
# Broadcasting demo: B has shape (3,) and is added to each row of the
# (2, 3) array standing in for np.dot(X, W).
X_dot_W = np.array([[0,0,0],[10,10,10]])
B = np.array([1,2,3])
X_dot_W+B

array([[ 1,  2,  3],
       [11, 12, 13]])

In [14]:
# Bias-gradient demo: summing the upstream gradient over axis 0 collapses
# the rows into one value per column.
dY = np.array([[1,2,3],[4,5,6]])
dY
dB = np.sum(dY, axis=0)
dB

array([5, 7, 9])

In [15]:
class Affine:
    """Affine (fully connected) layer computing ``np.dot(x, W) + b``.

    Args:
        W: Weight array used as the right operand of ``np.dot``.
        b: Bias array added to the product.

    After ``backward`` has run, the parameter gradients are available as
    ``self.dW`` and ``self.db``.
    """

    def __init__(self, W, b):
        # Parameters are passed in explicitly instead of being read from
        # module-level globals, so each layer owns its own weights.
        self.W = W
        self.b = b
        # Input of the most recent forward call, kept for backward.
        self.x = None
        # Parameter gradients, filled in by backward.
        self.dW = None
        self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        """Store ``dW``/``db`` and return the gradient w.r.t. the input.

        NOTE(review): the use of ``self.x.T`` and the sum over axis 0
        presume a 2-D ``x``/``dout`` with the batch on axis 0 - confirm
        against callers.
        """
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return dx

## Softmax-with-Loss 계층

In [16]:
class SoftmaxWithLoss:
    """Softmax activation combined with a cross-entropy loss.

    Relies on ``softmax`` and ``cross_entropy_error`` being in scope; neither
    is defined in this file.
    """

    def __init__(self):
        self.loss = None  # loss from the most recent forward call
        self.y = None     # softmax output from the most recent forward call
        self.t = None     # target labels from the most recent forward call

    def forward(self, x, t):
        """Apply softmax to ``x``, compute the loss against ``t``, return it.

        Both the softmax output and the targets are cached for ``backward``.
        """
        self.t = t
        self.y = softmax(x)
        # NOTE(review): assumes the helper's signature is
        # cross_entropy_error(y, t), predictions first - confirm against its
        # definition.
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the softmax input.

        The result is ``(y - t) / batch_size``, where the batch size is the
        length of the targets' first axis. Targets may be one-hot (same
        number of elements as ``y``) or an integer array of class indices.
        ``dout`` is accepted for interface parity with the other layers and
        is not used.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # One-hot targets: element-wise difference.
            dx = (self.y - self.t) / batch_size
        else:
            # Class-index targets: subtract 1 at each row's labelled class,
            # which is what subtracting the one-hot encoding would do.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx

## 오차역전파법 구현하기

In [19]:
import sys,os
sys.path.append(os.pardir)
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict

In [72]:
class TwoLayerNet:
    """Two-layer network: Affine -> Relu -> Affine, plus a SoftmaxWithLoss head.

    Relies on ``Affine``, ``Relu``, ``SoftmaxWithLoss``, ``OrderedDict`` and
    the module-level ``numerical_gradient`` function being in scope.

    Attributes:
        params: dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        layers: ordered mapping of the layers applied by ``predict``.
        lastLayer: the loss layer applied after ``predict``.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are standard-normal draws scaled by weight_init_std (the
        # standard deviation of the initial weights); biases start at zero.
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # Build the layers in the order the forward pass visits them.
        self.layers = OrderedDict([
            ('Affine1', Affine(self.params['W1'], self.params['b1'])),
            ('Relu1', Relu()),
            ('Affine2', Affine(self.params['W2'], self.params['b2'])),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        out = x
        for layer in self.layers.values():
            out = layer.forward(out)
        return out

    def loss(self, x, t):
        """Return the loss for input data ``x`` and target labels ``t``.

        The prediction for ``x`` is handed to the loss layer together with
        ``t``.
        """
        scores = self.predict(x)
        return self.lastLayer.forward(scores, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose prediction matches ``t``.

        The predicted class is the argmax of each prediction row. Labels that
        are not 1-D are collapsed to class indices with an argmax along
        axis 1, e.g. ``np.array(([0,2,0,0],[1,0,0,0],[0,0,0,3]))`` becomes
        ``array([1, 0, 3])``. Labels that are already 1-D are compared as
        they are.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        return np.sum(predicted == t) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return gradients of the loss for every parameter, found numerically.

        Delegates to the module-level ``numerical_gradient`` function once
        per parameter array.
        """
        def loss_W(W):
            # The argument is ignored: the loss is recomputed from self.params.
            return self.loss(x, t)

        return {
            key: numerical_gradient(loss_W, self.params[key])
            for key in ('W1', 'b1', 'W2', 'b2')
        }

    def gradient(self, x, t):
        """Return gradients of the loss for every parameter via backpropagation."""
        # Forward pass, so that each layer holds the state backward needs.
        self.loss(x, t)

        # Backward pass: loss layer first, then the layers in reverse order.
        dout = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored during backward.
        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

In [75]:
from mnist import load_mnist

# Load the dataset with normalize=True and one-hot labels.
(x_train, t_train),(x_test,t_test) = \
    load_mnist(normalize = True, one_hot_label=True)

network = TwoLayerNet(input_size = 784, hidden_size= 50, output_size= 10)

# Gradient check on the first three samples only.
x_batch= x_train[:3]
# x_train.shape = (60000, 784)
t_batch = t_train[:3]
# t_train.shape = (60000, 10)

# Compute the same gradients two ways: numerically and by backpropagation.
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch,t_batch)

# For each parameter, take the absolute difference between the two gradients
# and average those absolute values.
for key in grad_numerical.keys():
    diff = np.average(np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))

W1:4.3053766153926593e-10
b1:2.7089561639432986e-09
W2:6.250787527980453e-09
b2:1.4014091879038791e-07


In [83]:
# Reload the dataset and start from a freshly initialized network.
(x_train, t_train),(x_test,t_test) = \
    load_mnist(normalize = True, one_hot_label=True)

network = TwoLayerNet(input_size = 784, hidden_size= 50, output_size= 10)

iters_num = 10000  # number of training iterations
train_size = x_train.shape[0]  # 60000
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Iterations per epoch. Floor division keeps this an integer, so the
# `i % iter_per_epoch == 0` check below still fires once per epoch when
# train_size is not an exact multiple of batch_size.
iter_per_epoch = max(train_size // batch_size, 1)
# iter_per_epoch = max(60000 // 100, 1) = 600

for i in range(iters_num):
    # Randomly pick batch_size (100) sample indices out of train_size (60000).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    grad = network.gradient(x_batch, t_batch)

    # Parameter update: in-place, so the layers that hold references to
    # these arrays see the new values.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on this batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, measure accuracy on the full train and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.07633333333333334 0.0769
0.90535 0.9079
0.9244333333333333 0.9255
0.9358 0.9353
0.94505 0.9434
0.9507666666666666 0.9459
0.9552666666666667 0.9501
0.9592333333333334 0.9534
0.96195 0.9546
0.9648666666666667 0.9598
0.9679 0.9615
0.9701666666666666 0.9618
0.97235 0.9639
0.9731166666666666 0.9651
0.9748333333333333 0.9678
0.97615 0.9659
0.9764 0.9669
