# 1.3.1 Loss Function

In [1]:
class SigmoidWithLoss:
    """Sigmoid activation combined with a cross-entropy loss.

    ``forward`` caches the sigmoid output and the targets so that
    ``backward`` can produce the input gradient from them.  The layer
    holds no trainable parameters.
    """

    def __init__(self):
        # No weights: both lists stay empty.
        self.params, self.grads = [], []
        self.loss = None
        self.y = None  # sigmoid output cached by forward()
        self.t = None  # targets cached by forward()

    def forward(self, x, t):
        """Compute and return the loss of scores ``x`` against targets ``t``.

        The sigmoid output ``y`` is stacked column-wise as ``[1 - y, y]``
        and passed, together with ``t``, to ``cross_entropy_error`` (a
        helper defined outside this cell).
        """
        self.t = t
        self.y = 1 / (1 + np.exp(-x))

        two_column_probs = np.c_[1 - self.y, self.y]
        self.loss = cross_entropy_error(two_column_probs, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return ``(y - t) * dout / batch_size`` using the cached ``y`` and ``t``.

        ``batch_size`` is the length of the first axis of ``t``.
        """
        n_samples = self.t.shape[0]
        residual = self.y - self.t
        return residual * dout / n_samples

# 1.3.2 Gradient

$ \frac{\partial L}{\partial X} = \left(\frac{\partial L}{\partial x_1}, \frac{\partial L}{\partial x_2}, \ldots, \frac{\partial L}{\partial x_n}\right)$

# 1.3.3 Chain Rule

$ \frac{\partial X}{\partial Z} = \frac{\partial X}{\partial Y} \frac{\partial Y}{\partial Z}$

# 1.3.4 Computation Graph

## 1.3.4.1 Multiplication

## 1.3.4.2 Branch

## 1.3.4.3 Repeat

In [2]:
import numpy as np

In [3]:
# Repeat node: the forward pass copies a single row N times; the
# backward pass sums the N incoming gradient rows back into one row.
D, N = 8, 7
x = np.random.randn(1, D)
y = x.repeat(N, axis=0)  # forward: (1, D) -> (N, D)
dy = np.random.randn(N, D)  # gradient flowing back into the node
dx = dy.sum(axis=0, keepdims=True)  # backward: (N, D) -> (1, D)
print("x:\n{}\n".format(x))
print("y:\n{}\n".format(y))
print("dy:\n{}\n".format(dy))
print("dx:\n{}\n".format(dx))

x:
[[ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]]

y:
[[ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]
 [ 1.95483672  0.77489131  0.26713495  1.41444588 -0.48027678 -0.77894386
   1.19416766 -0.45450741]]

dy:
[[ 0.16811354  1.00425853 -0.81644633  0.37138064  0.19343741  0.77807566
   0.93497386  1.4104675 ]
 [ 0.51592855  0.29221703  1.29427547 -0.04570911  0.41562169  0.4281932
  -0

## 1.3.4.4 Sum

In [4]:
# Sum node: the forward pass collapses N rows into one by summation; the
# backward pass copies the single incoming gradient row to all N rows.
D, N = 8, 7
x = np.random.randn(N, D)
y = x.sum(axis=0, keepdims=True)  # forward: (N, D) -> (1, D)
dy = np.random.randn(1, D)  # gradient flowing back into the node
dx = dy.repeat(N, axis=0)  # backward: (1, D) -> (N, D)
print("x:\n{}\n".format(x))
print("y:\n{}\n".format(y))
print("dy:\n{}\n".format(dy))
print("dx:\n{}\n".format(dx))

x:
[[ 0.51508121  0.29003736  0.42900966 -0.49818843 -0.37028231 -1.44660693
  -1.95696758  0.10767352]
 [-1.06022734 -1.06835591  0.01333767  0.82094468 -0.39445195 -0.6912514
   0.22374962 -0.34125034]
 [ 0.43767875 -2.16004108  0.76272881  0.85319377  0.40110396  0.17212696
   0.82244438  0.38426659]
 [ 1.23347096  1.31467389  0.50216413 -0.52750641 -0.37798191  0.27052371
   0.03309953  0.18289856]
 [ 0.34142316  1.49135943  0.54762872  1.83318555  1.25830794 -0.58098627
  -0.24456501 -0.25168757]
 [ 1.0913231   1.48800177  0.23052024  1.62312557 -0.36053911 -0.1086271
  -1.29462005 -1.32395419]
 [ 0.0613258   0.95575305  0.57256438 -2.18571073  0.08369243 -0.26954473
   1.02159193  0.10833606]]

y:
[[ 2.62007564  2.31142849  3.05795361  1.919044    0.23984905 -2.65436575
  -1.39526718 -1.13371737]]

dy:
[[-0.86079817 -0.06423127 -0.26390105 -0.94597227  1.18503516 -0.51447055
  -0.25379368 -1.08943648]]

dx:
[[-0.86079817 -0.06423127 -0.26390105 -0.94597227  1.18503516 -0.51447055

## 1.3.4.5 MatMul

In [5]:
class MatMul:
    """Layer that multiplies its input by a weight matrix ``W``.

    ``params`` holds ``W`` and ``grads`` holds a same-shaped buffer that
    ``backward`` overwrites in place.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.x = None  # input cached by forward() for use in backward()

    def forward(self, x):
        """Return ``np.dot(x, W)`` and remember ``x``."""
        (weight,) = self.params
        product = np.dot(x, weight)
        self.x = x
        return product

    def backward(self, dout):
        """Store the weight gradient and return the input gradient.

        The input gradient is ``np.dot(dout, W.T)``; the weight gradient
        ``np.dot(x.T, dout)`` is copied into the existing ``grads[0]``
        array rather than replacing it.
        """
        (weight,) = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        # Ellipsis assignment keeps the same array object in self.grads.
        self.grads[0][...] = grad_weight
        return grad_input

# 1.3.5 Backward Propagation

## 1.3.5.1 Sigmoid 

In [6]:
class Sigmoid:
    """Element-wise sigmoid layer with no trainable parameters."""

    def __init__(self):
        self.params, self.grads = [], []
        self.out = None  # activation cached by forward()

    def forward(self, x):
        """Return ``1 / (1 + exp(-x))`` and cache it for ``backward``."""
        activation = 1 / (1 + np.exp(-x))
        self.out = activation
        return activation

    def backward(self, dout):
        """Return ``dout * (1 - out) * out`` using the cached activation."""
        cached = self.out
        return dout * (1.0 - cached) * cached

## 1.3.5.2 Affine 

In [7]:
class Affine:
    """Layer computing ``np.dot(x, W) + b``.

    ``params`` holds ``[W, b]``; ``grads`` holds two same-shaped buffers
    that ``backward`` overwrites in place.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None  # input cached by forward() for use in backward()

    def forward(self, x):
        """Return ``np.dot(x, W) + b`` and remember ``x``."""
        weight, bias = self.params
        result = np.dot(x, weight) + bias
        self.x = x
        return result

    def backward(self, dout):
        """Fill the gradient buffers and return the input gradient.

        Computes ``dx = np.dot(dout, W.T)``, ``dW = np.dot(x.T, dout)``
        and ``db = dout`` summed over axis 0.  ``dW`` and ``db`` are
        copied into the existing ``grads`` arrays.
        """
        weight, _ = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        grad_bias = np.sum(dout, axis=0)

        # Ellipsis assignment keeps the same array objects in self.grads.
        weight_buf, bias_buf = self.grads
        weight_buf[...] = grad_weight
        bias_buf[...] = grad_bias
        return grad_input

## 1.3.5.3 Softmax with Loss

In [8]:
class SoftmaxWithLoss:
    """Softmax activation combined with a cross-entropy loss.

    ``forward`` caches the softmax output and the target indices so that
    ``backward`` can produce the input gradient from them.  The layer
    holds no trainable parameters.
    """

    def __init__(self):
        self.params, self.grads = [], []
        self.y = None  # softmax output cached by forward()
        self.t = None  # targets cached by forward()

    def forward(self, x, t):
        """Return the loss of scores ``x`` against targets ``t``.

        ``softmax`` and ``cross_entropy_error`` are helpers defined
        outside this cell.
        """
        self.t = t
        self.y = softmax(x)

        # When t has as many elements as y (presumably one-hot rows),
        # reduce it to the per-row argmax index.
        same_size = self.t.size == self.y.size
        if same_size:
            self.t = self.t.argmax(axis=1)

        return cross_entropy_error(self.y, self.t)

    def backward(self, dout=1):
        """Return the gradient with respect to the scores.

        Starts from a copy of the cached ``y``, subtracts 1 at each
        row's target index, scales by ``dout`` and divides by the batch
        size (the length of the first axis of ``t``).
        """
        n_samples = self.t.shape[0]
        row_index = np.arange(n_samples)

        grad = self.y.copy()  # copy so the cached output is not modified
        grad[row_index, self.t] -= 1
        grad *= dout
        return grad / n_samples

# 1.3.6 Updating Weight

In [9]:
class SGD:
    """Gradient-descent optimizer: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        """Subtract ``lr * grads[i]`` from every ``params[i]``."""
        for idx in range(len(params)):
            step = self.lr * grads[idx]
            params[idx] -= step

In [10]:
class Momentum:
    """SGD with a velocity term.

    For each parameter: ``v = momentum * v - lr * grad`` and then
    ``param += v``.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocities, created on first update()

    def update(self, params, grads):
        """Advance the velocities and apply them to ``params``."""
        if self.v is None:
            # First call: one zero velocity per parameter.
            self.v = [np.zeros_like(param) for param in params]

        for idx in range(len(params)):
            velocity = self.momentum * self.v[idx] - self.lr * grads[idx]
            self.v[idx] = velocity
            params[idx] += velocity

In [11]:
class Nesterov:
    """Momentum-style optimizer with a Nesterov-type look-ahead step.

    For each parameter the update performs, in this order::

        v     *= momentum
        v     -= lr * grad
        param += momentum * momentum * v
        param -= (1 + momentum) * lr * grad
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocities, created on first update()

    def update(self, params, grads):
        """Advance the velocities and apply the update to ``params``."""
        if self.v is None:
            # First call: one zero velocity per parameter.
            self.v = [np.zeros_like(param) for param in params]

        lr, mu = self.lr, self.momentum
        for idx in range(len(params)):
            self.v[idx] *= mu
            self.v[idx] -= lr * grads[idx]
            # The parameter step uses the velocity that was just updated.
            params[idx] += mu * mu * self.v[idx]
            params[idx] -= (1 + mu) * lr * grads[idx]

In [12]:
class AdaGrad:
    """AdaGrad optimizer.

    Accumulates the element-wise squared gradients in ``h`` and scales
    each step by ``1 / (sqrt(h) + eps)``.

    Args:
        lr: Learning rate.
        eps: Small constant added to ``sqrt(h)`` so the division never
            hits zero.  Defaults to ``1e-7``.
    """

    def __init__(self, lr=0.01, eps=1e-7):
        self.lr = lr
        self.eps = eps
        self.h = None  # per-parameter squared-gradient sums, created lazily

    def update(self, params, grads):
        """Update every ``params[i]`` in place from ``grads[i]``."""
        if self.h is None:
            # First call: one zero accumulator per parameter.
            self.h = [np.zeros_like(param) for param in params]

        for i in range(len(params)):
            self.h[i] += grads[i] * grads[i]
            params[i] -= self.lr * grads[i] / (np.sqrt(self.h[i]) + self.eps)

In [13]:
class RMSprop:
    """RMSprop optimizer.

    Keeps an exponential moving average ``h`` of the element-wise squared
    gradients and scales each step by ``1 / (sqrt(h) + eps)``.

    Args:
        lr: Learning rate.
        decay_rate: Weight kept from the previous ``h`` on each update;
            the new squared gradient gets ``1 - decay_rate``.
        eps: Small constant added to ``sqrt(h)`` so the division never
            hits zero.  Defaults to ``1e-7``.
    """

    def __init__(self, lr=0.01, decay_rate=0.99, eps=1e-7):
        self.lr = lr
        self.decay_rate = decay_rate
        self.eps = eps
        self.h = None  # per-parameter moving averages, created lazily

    def update(self, params, grads):
        """Update every ``params[i]`` in place from ``grads[i]``."""
        if self.h is None:
            # First call: one zero accumulator per parameter.
            self.h = [np.zeros_like(param) for param in params]

        for i in range(len(params)):
            self.h[i] *= self.decay_rate
            self.h[i] += (1 - self.decay_rate) * grads[i] * grads[i]
            params[i] -= self.lr * grads[i] / (np.sqrt(self.h[i]) + self.eps)

In [14]:
class Adam:
    """Adam optimizer.

    Keeps moving averages of the gradient (``m``) and of the squared
    gradient (``v``).  The bias correction for both is folded into a
    per-step learning rate ``lr_t``; the step is
    ``lr_t * m / (sqrt(v) + eps)``.

    Args:
        lr: Base learning rate.
        beta1: Decay rate of the first-moment average ``m``.
        beta2: Decay rate of the second-moment average ``v``.
        eps: Small constant added to ``sqrt(v)`` so the division never
            hits zero.  Defaults to ``1e-7``.
    """

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999, eps=1e-7):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.eps = eps
        self.iter = 0  # number of update() calls so far
        self.m = None  # first-moment averages, created lazily
        self.v = None  # second-moment averages, created lazily

    def update(self, params, grads):
        """Update every ``params[i]`` in place from ``grads[i]``."""
        if self.m is None:
            # First call: one zero buffer per parameter for each moment.
            self.m = [np.zeros_like(param) for param in params]
            self.v = [np.zeros_like(param) for param in params]

        self.iter += 1
        # Learning rate with both bias-correction factors applied.
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for i in range(len(params)):
            # Incremental form of m = beta1 * m + (1 - beta1) * grad.
            self.m[i] += (1 - self.beta1) * (grads[i] - self.m[i])
            # Incremental form of v = beta2 * v + (1 - beta2) * grad**2.
            self.v[i] += (1 - self.beta2) * (grads[i]**2 - self.v[i])
            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + self.eps)