In [1]:
import numpy as np

In [2]:
def softmax_function(x):
    """Numerically stable softmax taken along the last axis.

    Each 1-D slice along the last axis is normalised independently, so a
    batch of score rows yields one probability distribution per row
    (a 1-D input is treated as a single distribution).

    Args:
        x: array-like of scores; a scalar, a vector, or a batch of vectors.

    Returns:
        Array of the same shape as ``x`` whose last-axis slices sum to 1.
    """
    x = np.asarray(x)
    if x.ndim == 0 or x.size == 0:
        # No axis to reduce over (scalar) or nothing to reduce (empty):
        # a lone score maps to 1.0, an empty input stays empty.
        return np.exp(x - x)
    # Subtracting the per-row maximum leaves the result unchanged but keeps
    # exp() from overflowing on large scores.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)

def sigmoid_function(x):
    """Element-wise logistic sigmoid, 1 / (1 + e^(-x))."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def dfunc(f, x):
    """Central-difference numerical gradient of ``f`` with respect to ``x``.

    Every element of ``x`` is perturbed in place by +h and -h (h = 1e-4),
    ``f(x)`` is evaluated for both, and the element is restored afterwards.
    The perturbation happens in place on purpose: callers may pass an ``f``
    that ignores its argument and reads the same array through another
    reference.

    Args:
        f: callable taking ``x`` and returning a scalar.
        x: floating-point ``numpy.ndarray``; temporarily modified in place.

    Returns:
        Array with the shape and dtype of ``x`` holding the estimated
        partial derivatives.

    Raises:
        TypeError: if ``x`` is not a floating-point ndarray. An integer
            array would silently truncate the +/-h perturbation and the
            resulting gradient.
    """
    if not isinstance(x, np.ndarray) or not np.issubdtype(x.dtype, np.floating):
        raise TypeError("dfunc requires a floating-point numpy.ndarray for x")

    h = 1e-4
    grad = np.zeros_like(x)
    # np.ndindex visits every index tuple and yields nothing for a
    # zero-size array.
    for idx in np.ndindex(*x.shape):
        original = float(x[idx])
        try:
            x[idx] = original + h
            f_plus = f(x)   # f(x + h)
            x[idx] = original - h
            f_minus = f(x)  # f(x - h)
        finally:
            # Always put the element back, even if f raises.
            x[idx] = original
        grad[idx] = (f_plus - f_minus) / (2 * h)
    return grad

def cross_entropy_err(y_hat, y):
    """Cross-entropy between predictions ``y_hat`` and targets ``y``.

    Returns ``-sum(y * log(y_hat + 1e-8))`` over all elements; the result
    is a sum, not an average.
    """
    eps = 1e-8  # keeps log() finite when a predicted value is exactly 0
    log_pred = np.log(y_hat + eps)
    return -np.sum(y * log_pred)

In [3]:
class ANN:
    """Two-layer network (affine -> sigmoid -> affine -> softmax) whose
    gradients are estimated numerically with ``dfunc``."""

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create scaled Gaussian weights and zero biases for both layers."""
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params = {
            'W1': w1,
            'b1': np.zeros(hidden_size),
            'W2': w2,
            'b2': np.zeros(output_size),
        }

    def predict(self, x):
        """Run the forward pass and return the output of ``softmax_function``."""
        p = self.params
        hidden = sigmoid_function(np.dot(x, p['W1']) + p['b1'])
        scores = np.dot(hidden, p['W2']) + p['b2']
        return softmax_function(scores)

    def loss(self, x, y):
        """Cross-entropy error between ``predict(x)`` and the targets ``y``."""
        return cross_entropy_err(self.predict(x), y)

    def numerical_gradient(self, x, y):
        """Estimate d(loss)/d(param) for every parameter array.

        The callable handed to ``dfunc`` ignores its argument: ``dfunc``
        perturbs the parameter array in place, and ``self.loss`` reads that
        same array through ``self.params``.
        """
        def current_loss(_perturbed):
            return self.loss(x, y)

        return {
            name: dfunc(current_loss, self.params[name])
            for name in ('W1', 'b1', 'W2', 'b2')
        }

In [4]:
# Build the numerical-gradient network: 4 inputs, 5 hidden units, 3 outputs.
net = ANN(input_size=4, hidden_size=5, output_size=3)

In [5]:
# Display the randomly initialised first-layer weight matrix.
net.params['W1']

array([[-0.00999212,  0.00023952,  0.01225109, -0.00578119, -0.00530105],
       [ 0.01837809, -0.0034991 ,  0.00137951, -0.01298947,  0.00884884],
       [-0.01104285,  0.01123044,  0.00848096,  0.00127694,  0.00101045],
       [ 0.00277641,  0.0043066 ,  0.00626061, -0.01373687, -0.00482733]])

In [6]:
from sklearn.datasets import load_iris
# Load the iris dataset shipped with scikit-learn and keep its `data`
# array as the network input `x`.
iris = load_iris()
x= iris.data

In [7]:
# Forward one hand-written sample (a single row of four features) through
# the untrained network.
input_x = np.array([[5.1, 3.5, 1.4, 0.2]])
net.predict(input_x)

array([[0.33370794, 0.33434379, 0.33194826]])

In [8]:
# One-hot encode the integer labels: row i gets a 1 in column iris.target[i].
y = np.zeros((len(iris.target), 3))
y[np.arange(len(iris.target)), iris.target] = 1

In [9]:
# One-hot target vector selecting class 0, used with `input_x` in the loss check.
output_y = np.array([1., 0., 0.])

In [10]:
# Cross-entropy loss of the untrained network on the single sample.
net.loss(input_x,output_y)

1.0974890652248435

In [11]:
import time
epochs = 3000
lr = 0.01
# time.clock() was removed in Python 3.8; time.perf_counter() is the
# supported high-resolution timer for measuring elapsed time.
t0 = time.perf_counter()
train_loss = []
for i in range(epochs):
    # Differentiate the loss numerically w.r.t. every parameter, using the
    # whole of x / y on every step.
    grad = net.numerical_gradient(x, y)
    for key in ('W1', 'b1', 'W2', 'b2'):
        net.params[key] = net.params[key] - lr * grad[key]
    loss = net.loss(x, y)
    train_loss.append(loss)
print(time.perf_counter() - t0)

  after removing the cwd from sys.path.


21.684755000000003


  if sys.path[0] == '':


## Back Propagation

In [12]:
class Relu:
    """ReLU activation layer with a cached mask for the backward pass."""

    def __init__(self):
        # Boolean mask from the most recent forward(): True where input <= 0.
        self.cache = None

    def forward(self, x):
        """Return ``max(0, x)`` element-wise and remember where ``x <= 0``."""
        self.cache = (x <= 0)
        return np.maximum(0, x)

    def backward(self, dout):
        """Return the upstream gradient with entries zeroed where the
        forward input was <= 0.

        Works on a copy so the caller's ``dout`` array is left untouched.
        """
        dx = np.array(dout)  # np.array copies by default
        dx[self.cache] = 0
        return dx

In [13]:
# Standalone ReLU layer for the forward/backward demonstration.
f = Relu()

In [14]:
# Use a dedicated name so the global `x` (the iris feature matrix used by
# the numerical-gradient training cell) is not overwritten by this demo.
relu_input = np.array([-1, 1, 2])
f.forward(relu_input)

array([0, 1, 2])

In [15]:
# Boolean mask stored by forward(): True where the input was <= 0.
f.cache

array([ True, False, False])

In [16]:
# Upstream gradient to push back through the ReLU layer.
out = np.array([2,2,2])

In [17]:
# Backward pass: entries whose forward input was <= 0 come back as 0.
f.backward(out)

array([0, 2, 2])

- Reference — derivative of 1/x, used when deriving the sigmoid gradient below: https://www.wolframalpha.com/input/?i=derivative+1%2Fx

In [18]:
class Sigmoid:
    """Sigmoid activation layer that keeps its last output for backprop."""

    def __init__(self):
        # Result of the most recent forward() call.
        self.out = None

    def forward(self, x):
        """Compute 1 / (1 + e^(-x)) element-wise and cache the result."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Scale the upstream gradient by the local slope ``out * (1 - out)``."""
        cached = self.out
        return dout * cached * (1 - cached)

In [19]:
# Standalone sigmoid layer for the forward/backward demonstration.
s = Sigmoid()

In [20]:
# Forward a small sample vector; the layer caches the output for backward().
ary = np.array([-1,1,2])
s.forward(ary)

array([0.26894142, 0.73105858, 0.88079708])

In [21]:
# Backpropagate an upstream gradient of 2 per element through the sigmoid.
out = np.array([2,2,2])
s.backward(out)

array([0.39322387, 0.39322387, 0.20998717])

In [22]:
# Hand check of the first backward entry: dout * y * (1 - y) with dout = 2
# and y = 0.26894142 (the sigmoid output for -1).
2 * 0.26894142 * (1- 0.26894142)

0.39322386521676717

In [23]:
class Affine:
    """Fully connected layer computing ``x @ W + b``.

    ``W`` and ``b`` are stored by reference, so in-place updates made to
    those arrays elsewhere are seen by this layer.
    """

    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None   # input of the most recent forward()
        self.dW = None  # gradient w.r.t. W, set by backward()
        self.db = None  # gradient w.r.t. b, set by backward()

    def forward(self, x):
        """Return ``x @ W + b`` and remember ``x`` for the backward pass."""
        self.x = x
        return np.dot(self.x, self.W) + self.b

    def backward(self, dout):
        """Backpropagate ``dout`` through the layer.

        Stores ``dW`` (same shape as ``W``) and ``db`` (one entry per
        output unit, summed over the batch axis) and returns the gradient
        with respect to the input, shaped like the input given to
        ``forward``. A single 1-D sample is treated as a batch of one.
        """
        x2d = np.atleast_2d(self.x)
        dout2d = np.atleast_2d(dout)
        dx = np.dot(dout2d, self.W.T).reshape(np.shape(self.x))
        self.dW = np.dot(x2d.T, dout2d)
        # Sum over the batch axis only: the bias gradient needs one value
        # per output unit, not a single scalar for the whole layer.
        self.db = np.sum(dout2d, axis=0)
        return dx

In [24]:
class SoftmaxWithLoss:
    """Final layer: softmax followed by batch-averaged cross-entropy.

    ``forward`` returns the cross-entropy divided by the batch size, and
    ``backward`` returns ``(y_hat - y) / batch_size``, so the two describe
    the same averaged quantity.
    """

    def __init__(self):
        self.loss = None   # averaged loss from the most recent forward()
        self.y_hat = None  # output of softmax_function from forward()
        self.y = None      # targets passed to forward()

    def _batch_size(self):
        # A 1-D target is a single sample; otherwise one sample per row.
        return 1 if np.ndim(self.y) < 2 else np.shape(self.y)[0]

    def forward(self, x, y):
        """Return the cross-entropy of ``softmax_function(x)`` against ``y``,
        divided by the batch size."""
        self.y = y
        self.y_hat = softmax_function(x)
        self.loss = cross_entropy_err(self.y_hat, self.y) / self._batch_size()
        return self.loss

    def backward(self, dout=1):
        """Return ``dout * (y_hat - y) / batch_size``.

        NOTE(review): this closed form assumes ``softmax_function``
        normalises each sample's scores separately and that each target
        sums to 1 - confirm for batched input.
        """
        dx = dout * (self.y_hat - self.y) / self._batch_size()
        return dx

In [25]:
import numpy as np
from collections import OrderedDict

class ANN:
    """Two-layer network assembled from layer objects and trained with
    backpropagation (Affine -> Sigmoid -> Affine, then SoftmaxWithLoss)."""

    def __init__(self, input_size, hidden_size, output_size, weight_init_std = 0.01):
        """Create the parameters and wire them into the layer stack."""
        w1 = weight_init_std * np.random.randn(input_size, hidden_size)
        b1 = np.zeros(hidden_size)
        w2 = weight_init_std * np.random.randn(hidden_size, output_size)
        b2 = np.zeros(output_size)
        self.params = {'W1': w1, 'b1': b1, 'W2': w2, 'b2': b2}

        # The Affine layers receive the very same array objects stored in
        # self.params.
        self.layers = OrderedDict([
            ('Affine1', Affine(w1, b1)),
            ('Sigmoid1', Sigmoid()),
            ('Affine2', Affine(w2, b2)),
        ])

        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Pass ``x`` through every layer in order and return the output of
        the last one (the softmax is applied by ``lastLayer``, not here)."""
        out = x
        for name in self.layers:
            out = self.layers[name].forward(out)
        return out

    def loss(self, x, y):
        """Loss reported by ``lastLayer`` for the scores of ``x`` against ``y``."""
        scores = self.predict(x)
        return self.lastLayer.forward(scores, y)

    def gradient(self, x, y):
        """Return the gradients of all four parameters via backpropagation."""
        # Forward pass so that every layer caches what backward() needs.
        self.loss(x, y)

        # Backward pass: start at the loss layer, then walk the stack in reverse.
        upstream = self.lastLayer.backward(1)
        for layer in reversed(list(self.layers.values())):
            upstream = layer.backward(upstream)

        first, second = self.layers['Affine1'], self.layers['Affine2']
        return {
            'W1': first.dW,
            'b1': first.db,
            'W2': second.dW,
            'b2': second.db,
        }

In [26]:
import random
import time

epochs     = 3000
lr         = 0.01
batch_size = 30

net = ANN(input_size=4, hidden_size=5, output_size=3)

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# supported high-resolution timer for measuring elapsed time.
t0 = time.perf_counter()
train_loss = []
for i in range(epochs):
    # Draw a random mini-batch of row indices (np.random.choice samples
    # with replacement by default).
    idx = np.random.choice(iris.data.shape[0], batch_size)
    x_batch = iris.data[idx]
    y_batch = y[idx]
    grad = net.gradient(x_batch, y_batch)
    for key in ('W1', 'b1', 'W2', 'b2'):
        # In-place update on purpose: the Affine layers hold references to
        # these same arrays.
        net.params[key] -= lr * grad[key]
    loss = net.loss(x_batch, y_batch)
    train_loss.append(loss)

time.perf_counter() - t0

  # Remove the CWD from sys.path while we load stuff.


0.5818330000000032