# レポート課題3

1.　最もよく使う損失関数の一つであるSoftmax with lossレイヤーを実装する.

In [1]:
import numpy as np
from collections import OrderedDict

Cross Entropy Error関数
\begin{equation*}
E = -\sum_{k}t_{k}\log{y_{k}}
\end{equation*}

In [2]:
def cross_entropy_error(y, t):
    """損失関数:クロスエントロピー誤差"""
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    # 教師データがone-hot-vectorの場合、正解ラベルのインデックスに変換
    if t.size == y.size:
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t])) / batch_size

softmax関数

In [3]:
def softmax(a):
    """ソフトマックス関数"""
    c=np.max(a)
    exp_a=np.exp(a-c)
    sum_exp_a=np.sum(exp_a)
    return exp_a/sum_exp_a

sigmoid関数

In [4]:
def sigmoid(x):
    """シグモイド関数"""
    return 1/(1+np.exp(-x))

In [5]:
def numerical_grad(f, x):
    """重みパラメータに対する勾配を数値微分によって求める"""
    h = 1e-4 # 0.0001
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        idx = it.multi_index
        tmp_val = x[idx]
        x[idx] = float(tmp_val) + h
        fxh1 = f(x) # f(x+h)
        x[idx] = tmp_val - h 
        fxh2 = f(x) # f(x-h)
        grad[idx] = (fxh1 - fxh2) / (2*h)
        x[idx] = tmp_val # 値を元に戻す
        it.iternext()   
    return grad

SoftmaxWithLostレイヤはSoftmax関数と損失関数であるCrossEntropyErrorを含んでいる

In [6]:
class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None # softmaxの出力
        self.t = None # 教師データ
    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        # forwardの式
        # -sum ( t * log (y))
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss
    def backward(self, dout=1):
        # backwardの式
        # yi - ti (iはIndex)
        batch_size = self.t.shape[0]
        # Backwardを実装して、微分値をdxに代入してください
        dx = (self.y - self.t)/batch_size
        return dx

In [7]:
class Relu:
    """活性化関数ReLUレイヤ"""
    def __init__(self):
        self.mask = None
    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out
    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx

In [8]:
class Affine:
    """Affine変換レイヤ"""
    def __init__(self, W, b):
        self.W =W
        self.b = b
        self.x = None
        self.original_x_shape = None
        # 重み・バイアスパラメータの微分
        self.dW = None
        self.db = None
    def forward(self, x):
        # テンソル対応
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out
    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)  # 入力データの形状に戻す（テンソル対応）
        return dx

In [9]:
class TwoLayerNet:
    """ 入力層のニューロン数:input_size
        隠れ層のニューロン数: hidden_size
        出力層のニューロン数:output_size
        の二層ニューラルネットワーク
    """
    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        # 重みの初期化
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size) 
        self.params['b2'] = np.zeros(output_size)
        # レイヤの生成
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()        
    
    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x#softmax関数に入る前
    
    # x:入力データ, t:教師データ
    def loss(self, x, t):
        y = self.predict(x)
        return self.lastLayer.forward(y, t)#損失を返す
    
    def numerical_gradient(self, x, t):
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_grad(loss_W, self.params['W1'])
        grads['b1'] = numerical_grad(loss_W, self.params['b1'])
        grads['W2'] = numerical_grad(loss_W, self.params['W2'])
        grads['b2'] = numerical_grad(loss_W, self.params['b2'])
        return grads
    
    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t,axis=1)
        try:
            accuracy = np.sum(y==t)/float(x.shape[0])
        except:
            print("zeroDivisionError")
            print("x:",x)
            print("x.shape",x)
            exit()
        return accuracy

MNISTの読み込み

In [10]:
import numpy as np
from dataset.mnist import load_mnist
# データの読み込み
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)#TwoLayerNetの入力層のニューロン, 隠れ層のニューロン, 出力層のニューロン
x_batch = x_train[:3]
t_batch = t_train[:3]

In [11]:
x_batch

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [12]:
x_batch.shape

(3, 784)

In [13]:
#それぞれ5,0,4が正解
t_batch

array([[ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.]])

In [14]:
t_batch.shape

(3, 10)

与えられたgradient関数は動かなかったので多少変更した.

In [15]:
def gradient(network, x, t):
    # 自分で実装したSoftmax with lossクラスを使ってみてください
    #lastLayer = SoftmaxWithLoss()
    # forward
    #self.loss(x, t)
    network.loss(x, t)
    # backward
    dout = 1
    dout = network.lastLayer.backward(dout)
    #layers = list(self.layers.values())
    layers = list(network.layers.values())
    layers.reverse()
    for layer in layers:
        dout = layer.backward(dout)
    # 設定
    grads = {}
    #grads['W1'], grads['b1'] = self.layers['Affine1'].dW, self.layers['Affine1'].db
    grads['W1'], grads['b1'] = network.layers['Affine1'].dW, network.layers['Affine1'].db
    #grads['W2'], grads['b2'] = self.layers['Affine2'].dW, self.layers['Affine2'].db
    grads['W2'], grads['b2'] = network.layers['Affine2'].dW, network.layers['Affine2'].db
    return grads

## 数値微分と誤差逆伝播法で求めた勾配が同じにならない……

In [16]:
# 数値微分
grad_numerical = network.numerical_gradient(x_batch, t_batch)

In [17]:
# Backward
#grad_backprop = gradient(x_batch, t_batch)
grad_backprop = gradient(network, x_batch, t_batch)

In [18]:
for key in grad_numerical.keys():
    diff = np.average( np.abs(grad_backprop[key] - grad_numerical[key]) )
    print(key + ":" + str(diff))

W1:9.2121465976e-05
b1:0.000717251479244
W2:0.00188098131842
b2:0.0666666667859


In [19]:
import numpy as np
from dataset.mnist import load_mnist
# データの読み込み
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1
train_loss_list = []
train_acc_list = []
test_acc_list = []
iter_per_epoch = max(train_size / batch_size, 1)

In [20]:
for i in range(iters_num):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # 勾配
    #rad = network.numerical_gradient(x_batch, t_batch)
    #grad = gradient(x_batch, t_batch)
    grad = gradient(network, x_batch, t_batch)
    # 更新
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)

0.18245 0.1849




0.0987166666667 0.098
0.0986333333333 0.0958
0.0975166666667 0.0974




0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098
0.0987166666667 0.098


## 感想

勾配確認が上手く行かず,最後のaccuracyもかなり数字が小さい.どこかが間違っている.

どこが悪いのかまだ分からない.