# 1.3.1 Loss Function

In [1]:
class Softmax:
    """Softmax activation layer with the forward/backward layer interface.

    The layer owns no trainable parameters, so ``params`` and ``grads`` are
    empty lists. ``forward`` relies on a ``softmax`` helper that is not
    defined in this class and must be available in the notebook namespace.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.out = None  # result of the latest forward pass, reused by backward

    def forward(self, x):
        """Apply ``softmax`` to ``x``, cache the result and return it."""
        self.out = softmax(x)
        return self.out

    def backward(self, dout):
        """Propagate the upstream gradient ``dout`` through the cached output.

        Returns ``out * dout - out * sum(out * dout, axis=1)``. The reduction
        runs over axis 1, so the cached output is presumably laid out as
        (batch, classes) -- confirm against the ``softmax`` helper.
        """
        weighted = self.out * dout
        row_totals = np.sum(weighted, axis=1, keepdims=True)
        return weighted - self.out * row_totals

# 1.3.2 Gradient

$ \frac{\partial L}{\partial X} = \left(\frac{\partial L}{\partial x_1}, \frac{\partial L}{\partial x_2}, \ldots, \frac{\partial L}{\partial x_n}\right)$

# 1.3.3 Chain Rule

$ \frac{\partial X}{\partial Z} = \frac{\partial X}{\partial Y} \frac{\partial Y}{\partial Z}$

# 1.3.4 Computation Graph

## 1.3.4.1 Multiplication

## 1.3.4.2 Branch

## 1.3.4.3 Repeat

In [2]:
import numpy as np

In [3]:
# Repeat node demo: the forward pass copies one row N times, so the backward
# pass must add up the N upstream gradients that flow back into that row.
D = 8
N = 7
# Local fixed-seed generator so the printed values are identical on every
# Restart & Run All (and the global NumPy random state is left untouched).
rng = np.random.RandomState(0)
x = rng.randn(1, D)
y = np.repeat(x, N, axis=0) # forward
dy = rng.randn(N, D) # gradient
dx = np.sum(dy, axis=0, keepdims=True) # backward
# The gradient w.r.t. x must have the same shape as x itself.
assert dx.shape == x.shape
print("x:\n{}\n".format(x))
print("y:\n{}\n".format(y))
print("dy:\n{}\n".format(dy))
print("dx:\n{}\n".format(dx))

x:
[[ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]]

y:
[[ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]
 [ 1.28405252  0.58775303 -0.32320597 -0.24592034 -1.58843073  0.71912418
   0.21437496  0.10524601]]

dy:
[[ 2.26892824e-01  8.78954825e-02  9.95837408e-01 -1.23953429e+00
  -1.96653140e-01 -5.65898803e-01  1.11190767e+00  1.10404209e-01]
 [-5.75998879e-01  4.07007862e-01  7.94090381

## 1.3.4.4 Sum

In [4]:
# Sum node demo: the forward pass adds N rows into one, so the backward pass
# hands the same upstream gradient to each of the N input rows.
D = 8
N = 7
# Local fixed-seed generator so the printed values are identical on every
# Restart & Run All (and the global NumPy random state is left untouched).
rng = np.random.RandomState(0)
x = rng.randn(N, D)
y = np.sum(x, axis=0, keepdims=True) # forward
dy = rng.randn(1, D) # gradient
dx = np.repeat(dy, N, axis=0) # backward
# The gradient w.r.t. x must have the same shape as x itself.
assert dx.shape == x.shape
print("x:\n{}\n".format(x))
print("y:\n{}\n".format(y))
print("dy:\n{}\n".format(dy))
print("dx:\n{}\n".format(dx))

x:
[[-7.83305840e-01  1.92923045e+00  1.01195494e+00 -2.48230715e+00
  -4.64231228e-01  5.91175188e-01 -1.05315748e+00 -2.43263158e-01]
 [ 4.05977843e-01 -1.29147865e-01 -6.43595989e-01 -3.56481684e-01
   6.20339673e-01 -9.44876426e-01  1.25960861e-01 -1.21731849e+00]
 [ 9.37413472e-01 -1.22735298e-01  8.84076947e-01 -2.21324316e+00
   5.83968267e-01 -5.52866284e-01 -4.12653854e-01  6.11186712e-01]
 [ 4.36352801e-01  8.65204086e-02  2.38662097e+00  1.85428891e+00
  -6.86274353e-01  1.34136547e+00 -8.79451006e-02  4.47950860e-01]
 [ 8.97627315e-01 -1.01200808e+00  1.03082072e+00 -4.54829598e-01
  -2.60245819e+00 -3.38569064e-01 -1.45478630e+00  8.42820579e-01]
 [ 1.39205722e+00  6.04158962e-01  1.74783174e+00  4.62525015e-01
  -1.70016011e-01  3.12720185e-01 -1.84952344e+00 -7.45711631e-01]
 [-2.17191295e-01 -3.59215003e-02 -8.34022953e-02 -1.43487915e+00
  -2.13531550e+00 -1.56574643e+00  4.76681662e-04  7.08486003e-01]]

y:
[[ 3.06893152  1.32009708  6.33430703 -4.62492682 -4.85398734

## 1.3.4.5 MatMul

In [5]:
class MatMul:
    """Matrix-multiplication layer computing ``np.dot(x, W)``.

    ``params`` holds the weight ``W`` and ``grads`` holds a same-shaped buffer
    that ``backward`` overwrites in place, so external references to the
    buffer stay valid.
    """

    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.x = None  # input of the latest forward pass, needed by backward

    def forward(self, x):
        """Return ``np.dot(x, W)`` and remember ``x`` for the backward pass."""
        (weight,) = self.params
        product = np.dot(x, weight)
        self.x = x
        return product

    def backward(self, dout):
        """Store the weight gradient and return the gradient w.r.t. the input.

        ``dout`` is the upstream gradient of the forward output.
        """
        (weight,) = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        # Ellipsis assignment copies into the existing buffer instead of
        # rebinding the list entry.
        self.grads[0][...] = grad_weight
        return grad_input

# 1.3.5 Backward Propagation

## 1.3.5.1 Sigmoid 

In [6]:
class Sigmoid:
    """Element-wise logistic sigmoid layer, ``1 / (1 + exp(-x))``.

    The layer has no trainable parameters, so ``params`` and ``grads`` are
    empty lists.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.out = None  # activation of the latest forward pass

    def forward(self, x):
        """Compute the sigmoid of ``x``, cache it and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - out) * out`` using the cached activation."""
        scaled = dout * (1.0 - self.out)
        return scaled * self.out

## 1.3.5.2 Affine 

In [7]:
class Affine:
    """Fully connected layer computing ``np.dot(x, W) + b``.

    ``params`` holds ``[W, b]`` and ``grads`` holds same-shaped buffers that
    ``backward`` fills in place.
    """

    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None  # input of the latest forward pass, needed by backward

    def forward(self, x):
        """Return the affine transform of ``x`` and remember ``x``."""
        weight, bias = self.params
        result = np.dot(x, weight) + bias
        self.x = x
        return result

    def backward(self, dout):
        """Fill the gradient buffers and return the gradient w.r.t. the input.

        The bias gradient is ``dout`` summed over axis 0, the axis along which
        the bias was broadcast in the forward pass.
        """
        weight, _bias = self.params
        grad_input = np.dot(dout, weight.T)
        grad_weight = np.dot(self.x.T, dout)
        grad_bias = np.sum(dout, axis=0)

        # Copy into the existing buffers rather than rebinding the list items.
        self.grads[0][...] = grad_weight
        self.grads[1][...] = grad_bias
        return grad_input

## 1.3.5.3 Softmax with Loss

In [8]:
class SoftmaxWithLoss:
    """Softmax followed by a cross-entropy loss, fused into a single layer.

    ``forward`` relies on ``softmax`` and ``cross_entropy_error`` helpers that
    are not defined in this class and must exist in the notebook namespace.
    The layer has no trainable parameters.
    """

    def __init__(self):
        self.params = []
        self.grads = []
        self.y = None  # softmax output of the latest forward pass
        self.t = None  # targets of the latest forward pass, as class indices

    def forward(self, x, t):
        """Return the loss of scores ``x`` against targets ``t``.

        When ``t`` has as many elements as the softmax output it is treated as
        one-hot encoded and reduced to class indices via ``argmax(axis=1)``.
        """
        self.t = t
        self.y = softmax(x)

        is_one_hot = self.t.size == self.y.size
        if is_one_hot:
            self.t = self.t.argmax(axis=1)

        return cross_entropy_error(self.y, self.t)

    def backward(self, dout=1):
        """Return ``(y - one_hot(t)) * dout / batch_size``.

        ``batch_size`` is the length of the stored target index array.
        """
        n_samples = self.t.shape[0]

        grad = self.y.copy()  # copy so the cached softmax output is untouched
        rows = np.arange(n_samples)
        grad[rows, self.t] -= 1
        grad *= dout
        return grad / n_samples

# 1.3.6 Updating Weight

In [9]:
class SGD:
    """Plain stochastic gradient descent: ``param -= lr * grad``."""

    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        """Apply one descent step to every entry of ``params``.

        ``params`` and ``grads`` are index-aligned sequences; each parameter
        is updated through ``params[i] -= ...`` so arrays change in place.
        """
        for idx, _ in enumerate(params):
            step = self.lr * grads[idx]
            params[idx] -= step

In [10]:
class Momentum:
    """SGD with momentum: ``v = momentum * v - lr * grad; param += v``."""

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocities, created on the first update

    def update(self, params, grads):
        """Advance the velocities and apply them to ``params``.

        ``params`` and ``grads`` are index-aligned sequences. Velocities start
        at zero, shaped like the parameters seen on the first call.
        """
        if self.v is None:
            self.v = [np.zeros_like(param) for param in params]

        for idx, _ in enumerate(params):
            velocity = self.momentum * self.v[idx] - self.lr * grads[idx]
            self.v[idx] = velocity
            params[idx] += velocity

In [11]:
class Nesterov:
    """Nesterov's accelerated gradient in its "look-ahead parameter" form.

    The stored parameters are the look-ahead point
    ``Theta = theta + momentum * v``, which leads to the update

        Theta <- Theta + momentum**2 * v_prev - (1 + momentum) * lr * grad
        v     <- momentum * v_prev - lr * grad

    where ``grad`` is evaluated at the stored parameters.
    """

    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None  # per-parameter velocities, created on the first update

    def update(self, params, grads):
        """Apply one Nesterov step to every entry of ``params``.

        ``params`` and ``grads`` are index-aligned sequences. Velocities start
        at zero, shaped like the parameters seen on the first call.
        """
        if self.v is None:
            self.v = [np.zeros_like(param) for param in params]

        for i in range(len(params)):
            # The parameter step must use the velocity from the PREVIOUS
            # iteration, so it is applied before the velocity is advanced.
            # Advancing the velocity first would double-count the current
            # gradient in the momentum**2 term.
            params[i] += self.momentum * self.momentum * self.v[i]
            params[i] -= (1 + self.momentum) * self.lr * grads[i]
            self.v[i] *= self.momentum
            self.v[i] -= self.lr * grads[i]

In [12]:
class AdaGrad:
    """AdaGrad: scales each step by the root of the summed squared gradients."""

    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None  # running sums of squared gradients, one per parameter

    def update(self, params, grads):
        """Accumulate squared gradients and apply the scaled step.

        ``params`` and ``grads`` are index-aligned sequences. The accumulators
        start at zero, shaped like the parameters seen on the first call.
        """
        if self.h is None:
            self.h = [np.zeros_like(param) for param in params]

        for idx, _ in enumerate(params):
            grad = grads[idx]
            self.h[idx] += grad * grad
            # 1e-7 keeps the division finite while the accumulator is zero.
            denom = np.sqrt(self.h[idx]) + 1e-7
            params[idx] -= self.lr * grad / denom

In [13]:
class RMSprop:
    """RMSprop: scales each step by a decaying average of squared gradients."""

    def __init__(self, lr=0.01, decay_rate = 0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None  # decaying averages of squared gradients, one per parameter

    def update(self, params, grads):
        """Refresh the squared-gradient averages and apply the scaled step.

        ``params`` and ``grads`` are index-aligned sequences. The averages
        start at zero, shaped like the parameters seen on the first call.
        """
        if self.h is None:
            self.h = [np.zeros_like(param) for param in params]

        for idx, _ in enumerate(params):
            grad = grads[idx]
            self.h[idx] *= self.decay_rate
            self.h[idx] += (1 - self.decay_rate) * grad * grad
            # 1e-7 keeps the division finite while the average is zero.
            denom = np.sqrt(self.h[idx]) + 1e-7
            params[idx] -= self.lr * grad / denom

In [14]:
class Adam:
    """Adam optimizer with bias correction folded into the step size."""

    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0  # number of update calls performed so far
        self.m = None  # first-moment estimates, one per parameter
        self.v = None  # second-moment estimates, one per parameter

    def update(self, params, grads):
        """Advance both moment estimates and apply one step to ``params``.

        ``params`` and ``grads`` are index-aligned sequences. The moment
        buffers start at zero, shaped like the parameters seen on the first
        call.
        """
        if self.m is None:
            self.m = [np.zeros_like(param) for param in params]
            self.v = [np.zeros_like(param) for param in params]

        self.iter += 1
        # Both bias corrections are applied once through the step size
        # instead of per element on m and v.
        correction1 = 1.0 - self.beta1**self.iter
        correction2 = 1.0 - self.beta2**self.iter
        step_size = self.lr * np.sqrt(correction2) / correction1

        for idx, _ in enumerate(params):
            grad = grads[idx]
            # Moving averages written as "old + (1 - beta) * (new - old)".
            self.m[idx] += (1 - self.beta1) * (grad - self.m[idx])
            self.v[idx] += (1 - self.beta2) * (grad**2 - self.v[idx])
            params[idx] -= step_size * self.m[idx] / (np.sqrt(self.v[idx]) + 1e-7)