In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Show the installed NumPy version as this cell's output.
np.__version__

'1.19.2'

In [2]:
# relu layer
class Relu:
    """Rectified linear unit layer: passes positive inputs, zeroes the rest.

    Attributes:
        mask: boolean array recorded by ``forward``; True where the input
            was <= 0. ``backward`` uses it to block the gradient there.
    """

    def __init__(self):
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every element <= 0 replaced by 0.

        Args:
            x: NumPy array of activations. It is not modified.

        Returns:
            A new array with the same shape as ``x``.
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Propagate the upstream gradient through the ReLU.

        Args:
            dout: upstream gradient, same shape as the array given to
                ``forward``. It is not modified.

        Returns:
            A new array equal to ``dout`` with zeros wherever the forward
            input was <= 0.
        """
        # Work on a copy: zeroing ``dout`` in place would silently corrupt
        # the caller's gradient array if it is reused after this call.
        dx = dout.copy()
        dx[self.mask] = 0
        return dx

In [4]:
# Small demo of the boolean mask used by Relu: True where the input is <= 0.
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
mask = np.less_equal(x, 0)
mask

array([[False,  True],
       [ True, False]])

In [5]:
# sigmoid layer
class Sigmoid:
    """Logistic sigmoid layer, ``1 / (1 + exp(-x))``.

    Attributes:
        out: the most recent forward output, cached because the backward
            pass is expressed in terms of it.
    """

    def __init__(self):
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise and remember the result."""
        denominator = 1 + np.exp(-x)
        self.out = 1 / denominator
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by ``(1 - out) * out``.

        Args:
            dout: upstream gradient, broadcastable against the cached output.

        Returns:
            Gradient with respect to the forward input.
        """
        scaled = dout * (1.0 - self.out)
        return scaled * self.out

In [6]:
# affine layer
class Affine:
    """Fully connected layer computing ``np.dot(x, W) + b``.

    Attributes:
        W: weight matrix supplied by the caller (held by reference).
        b: bias vector supplied by the caller (held by reference).
        x: input cached by ``forward`` for use in ``backward``.
        dW: gradient of the loss w.r.t. ``W``, set by ``backward``.
        db: gradient of the loss w.r.t. ``b``, set by ``backward``.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.dW = None
        self.db = None

    def forward(self, x):
        """Return ``np.dot(x, W) + b`` and cache ``x`` for the backward pass.

        Args:
            x: a batch of shape (batch, in) or a single 1-D sample of
                shape (in,).
        """
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Compute parameter gradients and return the input gradient.

        Args:
            dout: upstream gradient with the same layout as the forward
                output (batched 2-D, or 1-D for a single sample).

        Returns:
            Gradient w.r.t. the forward input, ``np.dot(dout, W.T)``.
        """
        dx = np.dot(dout, self.W.T)
        # Treat a 1-D sample as a batch of one. Without this, the transpose
        # of a 1-D array is a no-op, so np.dot(x.T, dout) is not an outer
        # product, and summing dout over axis 0 would collapse the bias
        # gradient to a scalar. 2-D inputs pass through unchanged.
        x_batch = np.atleast_2d(self.x)
        dout_batch = np.atleast_2d(dout)
        self.dW = np.dot(x_batch.T, dout_batch)
        self.db = np.sum(dout_batch, axis=0)
        return dx

In [7]:
# softmax with loss
class SoftmaxWithLoss:
    """Softmax activation fused with a cross-entropy loss.

    ``forward`` relies on ``softmax`` and ``cross_entropy_error`` being
    available in the notebook namespace at call time; they are not defined
    in this cell.

    Attributes:
        loss: value returned by the last ``forward`` call.
        y: softmax output from the last ``forward`` call.
        t: targets passed to the last ``forward`` call.
    """

    def __init__(self):
        self.loss = None
        self.y = None
        self.t = None

    def forward(self, x, t):
        """Compute and cache the loss for scores ``x`` and targets ``t``."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the scores given to ``forward``.

        Args:
            dout: upstream gradient of the loss; scales the result.

        Returns:
            Array shaped like ``self.y``: ``(y - onehot(t)) / batch_size``
            multiplied by ``dout``.
        """
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:
            # Targets have as many elements as the predictions: one-hot rows.
            dx = (self.y - self.t) / batch_size
        else:
            # Targets are integer class indices: subtracting 1 at the true
            # class of each row is equivalent to subtracting a one-hot row.
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        # Apply the upstream gradient (identity for the default dout=1).
        return dx * dout

In [9]:
class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        Args:
            params: dict mapping parameter name to value; updated via ``-=``,
                so NumPy arrays are modified in place.
            grads: dict with a gradient for every key in ``params``.
        """
        for name in params:
            step = self.lr * grads[name]
            params[name] -= step

In [18]:
class Momentum:
    """SGD with momentum: ``v = momentum * v - lr * grad; param += v``.

    Attributes:
        lr: learning rate.
        momentum: decay factor applied to the previous velocity.
        v: per-parameter velocities, created lazily on the first update.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        """Advance every parameter by its freshly updated velocity.

        Args:
            params: dict mapping parameter name to array; updated via ``+=``.
            grads: dict with a gradient for every key in ``params``.
        """
        if self.v is None:
            # First call: start each velocity at zero, shaped like its parameter.
            self.v = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            velocity = self.momentum * self.v[name] - self.lr * grads[name]
            self.v[name] = velocity
            params[name] += velocity

In [20]:
class AdaGrad:
    """AdaGrad optimizer: per-element step scaled by accumulated squared gradients.

    Attributes:
        lr: base learning rate.
        h: per-parameter running sum of squared gradients, created lazily
            on the first update.
    """

    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        """Accumulate squared gradients, then apply the scaled step.

        Args:
            params: dict mapping parameter name to array; updated via ``-=``.
            grads: dict with a gradient for every key in ``params``.
        """
        if self.h is None:
            # First call: start each accumulator at zero, shaped like its parameter.
            self.h = {name: np.zeros_like(value) for name, value in params.items()}

        for name in params:
            grad = grads[name]
            self.h[name] += grad * grad
            # The 1e-7 keeps the division finite while an accumulator is still zero.
            params[name] -= self.lr * grad / (np.sqrt(self.h[name]) + 1e-7)

In [13]:
from functions import *
from gradient import *

class TwoLayerNet:
    """Two-layer network: affine -> sigmoid -> affine -> softmax.

    ``sigmoid``, ``softmax``, ``cross_entropy_error`` and the module-level
    ``numerical_gradient`` are looked up in the notebook namespace at call
    time; they are not defined in this cell.

    Attributes:
        params: dict with keys 'W1', 'b1', 'W2', 'b2' holding the weights
            and biases of the two affine transforms.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        # Weights are scaled standard-normal draws (W1 is drawn before W2);
        # biases start at zero.
        first_weights = weight_init_std * np.random.randn(input_size, hidden_size)
        second_weights = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': first_weights,
            'b1': np.zeros(hidden_size),
            'W2': second_weights,
            'b2': np.zeros(output_size),
        }

    def predict(self, x):
        """Run the forward pass and return the softmax output for ``x``."""
        weights = self.params
        hidden = sigmoid(x @ weights['W1'] + weights['b1'])
        scores = hidden @ weights['W2'] + weights['b2']
        return softmax(scores)

    def loss(self, x, t):
        """Return ``cross_entropy_error`` between the prediction for ``x`` and ``t``."""
        return cross_entropy_error(self.predict(x), t)

    def accuracy(self, x, t):
        """Return the fraction of rows in ``x`` whose argmax prediction matches ``t``.

        ``t`` may be 1-D class indices, or a higher-dimensional array that is
        reduced with argmax along axis 1.
        """
        predicted = np.argmax(self.predict(x), axis=1)
        labels = t if t.ndim == 1 else np.argmax(t, axis=1)
        return np.sum(predicted == labels) / float(x.shape[0])

    def numerical_gradient(self, x, t):
        """Return a dict of gradients for every parameter, keyed like ``params``.

        Delegates to the module-level ``numerical_gradient`` helper, passing
        each parameter array together with a loss closure.
        """
        def loss_W(W):
            # The argument is ignored: the loss reads the live arrays in
            # self.params, which the helper receives directly.
            return self.loss(x, t)

        return {
            name: numerical_gradient(loss_W, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

In [14]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Load the scikit-learn digits dataset as a (features, labels) pair.
# Note: this rebinds the notebook-level name `x` used by the earlier mask demo.
x, t = load_digits(return_X_y=True)
# Show both shapes as the cell output.
x.shape, t.shape

((1797, 64), (1797,))

In [15]:
# Hold out 20% of the samples for testing; random_state=0 makes the split repeatable.
x_train, x_test, t_train, t_test = train_test_split(x, t, test_size=.2, random_state=0)
# Show the four resulting shapes as the cell output.
x_train.shape, t_train.shape, x_test.shape, t_test.shape

((1437, 64), (1437,), (360, 64), (360,))

In [21]:
# Histories collected during training: one loss per iteration, and one
# train/test accuracy pair per evaluation step.
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Hyperparameters for this run.
iters_num = 10
train_size = x_train.shape[0]
batch_size = 100
# NOTE(review): learning_rate is assigned but never used below; AdaGrad() is
# built with its default lr. Confirm whether AdaGrad(lr=learning_rate) was intended.
learning_rate = 0.1
network = TwoLayerNet(input_size=64, hidden_size=50, output_size=10)
optimizer = AdaGrad()

# Fixed at 1 so accuracy is evaluated on every iteration; the commented-out
# expression is the batches-per-epoch formula it replaces.
iter_per_epoch = 1 #max(train_size / batch_size, 1)

for i in range(iters_num):
    # Draw batch_size random row indices (np.random.choice samples with
    # replacement by default, and no seed is set, so runs are not repeatable).
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    # print(x_batch.shape, t_batch.shape)

    # Gradients come from TwoLayerNet.numerical_gradient; the optimizer then
    # updates network.params through the shared dict.
    grads = network.numerical_gradient(x_batch, t_batch)
    params = network.params
    optimizer.update(params, grads)

    # Loss is measured after the update, on the same mini-batch.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)
    print(f'{i} iter, loss: {loss}')

    # Evaluate on the full train and test sets at each evaluation step.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(f'train acc:{train_acc}, test acc: {test_acc}')

0 iter, loss: 2.238775792255852
train acc:0.18580375782881003, test acc: 0.15
1 iter, loss: 2.2050677831712426
train acc:0.16423103688239388, test acc: 0.18888888888888888
2 iter, loss: 2.178150780014593
train acc:0.3535142658315936, test acc: 0.2833333333333333
3 iter, loss: 2.083125412147497
train acc:0.24147529575504523, test acc: 0.20555555555555555
4 iter, loss: 2.0929829338320562
train acc:0.2776617954070981, test acc: 0.22777777777777777
5 iter, loss: 1.9739716135111387
train acc:0.40292275574112735, test acc: 0.3472222222222222
6 iter, loss: 1.9668504216003768
train acc:0.38552540013917885, test acc: 0.3388888888888889
7 iter, loss: 1.9231551403937563
train acc:0.43910925539318024, test acc: 0.41944444444444445
8 iter, loss: 1.8998027460727547
train acc:0.5142658315935977, test acc: 0.475
9 iter, loss: 1.8569994578793487
train acc:0.5469728601252609, test acc: 0.5222222222222223
