numerical_gradient

def numerical_gradient(f, x):
    """Estimate the gradient of ``f`` at ``x`` with central differences.

    Every element of ``x`` is perturbed in place by ``+h`` and then ``-h``
    and ``f`` is evaluated for both. Because ``x`` itself is modified during
    the call, ``f`` may read the array through another reference instead of
    through its argument. Each element is put back to its original value
    before the next one is processed, also when ``f`` raises.

    Args:
        f: Callable invoked as ``f(x)``; its result is used as a scalar.
        x: NumPy array of parameters. It should have a floating dtype,
            because the perturbed value is written back into ``x`` itself.

    Returns:
        An array created with ``np.zeros_like(x)`` that holds
        ``(f(x + h) - f(x - h)) / (2 * h)`` for every element of ``x``.
    """
    h = 1e-4  # 0.0001
    grad = np.zeros_like(x)

    # np.ndindex visits every multi-index of x and simply yields nothing for
    # a zero-size array, so an empty x returns an empty gradient.
    for idx in np.ndindex(x.shape):
        tmp_val = x[idx]
        try:
            x[idx] = float(tmp_val) + h
            fxh1 = f(x)  # f(x+h)

            x[idx] = tmp_val - h
            fxh2 = f(x)  # f(x-h)
        finally:
            # Restore the original value, even if f raised.
            x[idx] = tmp_val
        grad[idx] = (fxh1 - fxh2) / (2 * h)

    return grad
        
  

TwoLayerNetクラス

In [8]:
import sys,os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
#from common.gradient import numerical_gradient
from collections import OrderedDict

class TwoLayerNet:
    """Two-layer network built from Affine -> Relu -> Affine layers.

    The forward pass runs the layers stored in ``self.layers`` in insertion
    order; ``self.lastLayer`` (a ``SoftmaxWithLoss``) turns the output of the
    last Affine layer and the labels into a loss value.

    Attributes set by ``__init__``:
        params: dict with the arrays ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        layers: ``OrderedDict`` with ``'Affine1'``, ``'Relu1'``, ``'Affine2'``.
        lastLayer: the ``SoftmaxWithLoss`` instance used by ``loss``.
    """

    def __init__(self, input_size, hidden_size, output_size,
                 weight_init_std=0.01):
        """Create the parameters and the layers.

        Args:
            input_size: Number of input features (rows of ``W1``).
            hidden_size: Number of hidden units (columns of ``W1``,
                rows of ``W2``).
            output_size: Number of outputs (columns of ``W2``).
            weight_init_std: Factor applied to the standard-normal samples
                used for the initial weights.
        """
        # Initialise the weights: scaled Gaussian weights, zero biases.
        self.params = {}
        self.params['W1'] = weight_init_std * \
            np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * \
            np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers; the OrderedDict keeps them in forward order.
        self.layers = OrderedDict()
        self.layers['Affine1'] = \
            Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = \
            Affine(self.params['W2'], self.params['b2'])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Return the output of the last layer in ``self.layers`` for ``x``."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return ``self.lastLayer.forward(predict(x), t)``.

        Args:
            x: Input data.
            t: Teacher (label) data.
        """
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose argmax matches ``t``.

        Args:
            x: Input data; ``x.shape[0]`` is used as the number of samples.
            t: Labels. If ``t`` is not 1-D it is reduced with ``argmax``
                along axis 1 first (presumably one-hot labels).
        """
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)

        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """Return the gradients of the loss computed by finite differences.

        Uses the module-level ``numerical_gradient(f, x)`` function once per
        entry of ``self.params``.

        Args:
            x: Input data.
            t: Teacher (label) data.

        Returns:
            dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'``.
        """
        # The argument is deliberately ignored: numerical_gradient perturbs
        # the parameter array in place, and the loss is expected to pick the
        # change up through the layers (assumes the Affine layers keep
        # references to the arrays in self.params -- TODO confirm).
        def loss_W(W):
            return self.loss(x, t)

        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])

        return grads

    def gradient(self, x, t):
        """Return the gradients of the loss computed by backpropagation.

        Args:
            x: Input data.
            t: Teacher (label) data.

        Returns:
            dict with keys ``'W1'``, ``'b1'``, ``'W2'``, ``'b2'`` taken from
            the ``dW`` / ``db`` attributes of the matching Affine layer.
        """
        # forward
        self.loss(x, t)

        # backward: start at the loss layer, then walk the layers in reverse
        dout = 1
        dout = self.lastLayer.backward(dout)

        for layer in reversed(list(self.layers.values())):
            dout = layer.backward(dout)

        # Collect the gradients each Affine layer stored during backward.
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db

        return grads
            

勾配確認の実装

In [9]:
import sys,os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(
    normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Only the first three training samples are used for the check.
x_batch, t_batch = x_train[:3], t_train[:3]

grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Print the mean absolute difference between both gradients per parameter.
for key, numeric in grad_numerical.items():
    diff = np.mean(np.abs(grad_backprop[key] - numeric))
    print(key + ":" + str(diff))

W1:3.69170606073e-10
b1:2.23373927833e-09
W2:4.41465317642e-09
b2:1.39751208889e-07


In [10]:
import sys, os
sys.path.append(os.pardir)

import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)

network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

train_loss_list = []
train_acc_list = []
test_acc_list = []

# Integer division keeps this an int, so the `i % iter_per_epoch == 0` check
# below still fires periodically when train_size is not a multiple of
# batch_size (a float here would only ever match at i == 0 in that case).
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw batch_size random indices into the training set.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Gradient via backpropagation;
    # network.numerical_gradient(x_batch, t_batch) is the finite-difference
    # alternative.
    grad = network.gradient(x_batch, t_batch)

    # Update: `-=` modifies the parameter arrays in place rather than
    # rebinding the dict entries.
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once every iter_per_epoch iterations, record and print the accuracy on
    # the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.1234 0.1144
0.90355 0.9065
0.923666666667 0.9271
0.935916666667 0.9362
0.946783333333 0.9451
0.952433333333 0.9495
0.956633333333 0.9533
0.961683333333 0.9591
0.963916666667 0.9595
0.968083333333 0.9631
0.9669 0.9606
0.972233333333 0.965
0.973566666667 0.967
0.975966666667 0.9671
0.97615 0.9687
0.97905 0.9692
0.98005 0.9708
