In [33]:
import numpy as np 
import _pickle as cPickle
import gzip
import os
from sklearn.utils import shuffle
from tqdm import tqdm

In [34]:
# utility functions

def one_hot_encoded(y, num_class):
    n = y.shape[0]
    onehot = np.zeros((n, num_class), dtype="int32")
    for i in range(n):
        idx = y[i]
        onehot[i][idx] = 1
    return onehot


def check_accuracy(y_true, y_pred):
    return np.mean(y_pred == y_true)  # both are not one hot encoded


def softmax(x):
    max_x = np.max(x, axis=1, keepdims=True)
    exp_x = np.exp(x - max_x)
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)
        
# l2 regularization
def l2_reg(layers, lam=0.001):
    reg_loss = 0.0
    for layer in layers:
        if hasattr(layer, 'W'):
            reg_loss += 0.5 * lam * np.sum(layer.W * layer.W)
    return reg_loss


# l2 regularization grad
def delta_l2_reg(layers, grads, lam=0.001):
    for layer, grad in zip(layers, reversed(grads)):
        if hasattr(layer, 'W'):
            grad[0] += lam * layer.W
    return grads


In [35]:
softmax(np.array([[1,2,3]]) - 1)

array([[0.09003057, 0.24472847, 0.66524096]])

In [36]:
def eval_numerical_gradient(f, x, verbose=False, h=0.00001):
    """Evaluates gradient df/dx via finite differences:
    df/dx ~ (f(x+h) - f(x-h)) / 2h
    Adopted from https://github.com/ddtm/dl-course/
    """
    fx = f(x) # evaluate function value at original point
    grad = np.zeros_like(x)
    # iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:

        # evaluate function at x+h
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h # increment by h
        fxph = f(x) # evalute f(x + h)
        x[ix] = oldval - h
        fxmh = f(x) # evaluate f(x - h)
        x[ix] = oldval # restore

        # compute the partial derivative with centered formula
        grad[ix] = (fxph - fxmh) / (2 * h) # the slope
        if verbose:
            print (ix, grad[ix])
        it.iternext() # step to next dimension

    return grad

In [37]:
class ReLU():
    def __init__(self):
        self.params = []
        self.gradInput = None

    def forward(self, X, mode):
        self.X = X
        return np.maximum(X, 0)
    
    def backward(self, dout, mode):
        self.gradInput = dout.copy()
        self.gradInput[self.X <= 0] = 0
        return self.gradInput, []

In [38]:
points = np.linspace(-1, 1, 10*12).reshape([10, 12])
relu = ReLU()
f = lambda x: relu.forward(x, mode='train').sum(axis=1).sum()
res = f(points)
numeric_grads = eval_numerical_gradient(f, points)
print(numeric_grads)
inp_grad = np.ones(shape=(10, 12))
grads = relu.backward(inp_grad, mode='train')[0]
assert np.allclose(grads, numeric_grads, rtol=1e-3, atol=0)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


In [39]:
class Linear():

    def __init__(self, in_size, out_size):

        # Xavier init
        self.W = np.random.randn(in_size, out_size) / np.sqrt(in_size + out_size/ 2.)
        self.b = np.zeros((1, out_size))
        self.params = [self.W, self.b]
        self.gradW = None
        self.gradB = None
        self.gradInput = None

    def forward(self, X, mode):
        self.X = X
        return X.dot(self.W) + self.b
    
    def backward(self, dout, mode):
        self.gradW = self.X.T.dot(dout)        
        self.gradInput = dout.dot(self.W.T)
        self.gradB = dout.sum(axis=0)
        return self.gradInput, [self.gradW, self.gradB]

In [40]:
points = np.linspace(-1, 1, 10*12).reshape([10, 12])
relu = Linear(12, 5)
f = lambda x: relu.forward(x, mode='train').sum(axis=1).sum()
res = f(points)
numeric_grads = eval_numerical_gradient(f, points)
print(numeric_grads)
inp_grad = np.ones(shape=(10, 5))
grads = relu.backward(inp_grad, mode='train')[0]
assert np.allclose(grads, numeric_grads, rtol=1e-3, atol=0)

[[-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.74234317  0.31638473 -0.01036975  0.18493633]
 [-0.06351219  0.03755917  0.90200542 -0.11404127  1.21183226  1.45712454
  -0.04556915  0.02058435 -0.742

In [41]:
class CrossEntropyLoss(object):

    def forward(self, X, y):
        self.m = y.shape[0] # self.m == X.shape[0], y.shape == (X.shape[0], )
        self.p = softmax(X)
        cross_entropy = -np.log(self.p[range(self.m), y])
        loss = np.sum(cross_entropy) / self.m
        return loss
    
    def backward(self, X, y):
        dx = self.p.copy()
        dx[range(self.m), y] -= 1
        dx /= self.m
        return dx

## NN implementation

In [42]:
class NN:

    def __init__(self, loss_func=CrossEntropyLoss(), mode = 'train'):
    
        self.layers = []
        self.params = []
        self.loss_func = loss_func
        self.grads = []
        self.mode = mode

    def add_layer(self,layer):
        self.layers.append(layer)
        self.params.append(layer.params)

    def forward(self, X):
        for layer in self.layers:
            X = layer.forward(X, self.mode)
        return X

    def backward(self, dout):
        self.clear_grad_param()
        for layer in reversed(self.layers):
            dout, grad = layer.backward(dout, self.mode)
            self.grads.append(grad)
        return self.grads

    def train_step(self, X, y):
        out = self.forward(X)
        loss = self.loss_func.forward(out,y)
        dout = self.loss_func.backward(out,y)
        loss += l2_reg(self.layers)
        grads = self.backward(dout)
        grads = delta_l2_reg(self.layers, grads)
        return loss, grads

    def predict(self, X):
        X = self.forward(X)
        return np.argmax(softmax(X), axis=1)


    def dispGradParam():
        print(self.grads)
    

    def clear_grad_param(self):
        self.grads = []

In [43]:
# SGD with momentum
def update(velocity, params, grads, learning_rate=0.001, mu=0.9):
    for v, p, g, in zip(velocity, params, reversed(grads)):
        for i in range(len(g)):
            v[i] = mu * v[i] + learning_rate * g[i]
            p[i] -= v[i]


# get minibatches
def minibatch(X, y, minibatch_size):
    n = X.shape[0]
    minibatches = []
    X, y = shuffle(X, y)

    for i in range(0, n , minibatch_size):
        X_batch = X[i:i + minibatch_size, ...]
        y_batch = y[i:i + minibatch_size, ...]

        minibatches.append((X_batch, y_batch))
    return minibatches


In [44]:
def train(net, X_train, y_train, minibatch_size, epoch, learning_rate, mu=0.9,
                 verbose=True, X_val=None, y_val=None, nesterov=True):
    val_loss_epoch = []
    minibatches = minibatch(X_train, y_train, minibatch_size)
    minibatches_val = minibatch(X_val, y_val, minibatch_size)

    c = 0 
    for i in range(epoch):
        loss_batch = []
        val_loss_batch = []
        velocity = []
        for param_layer in net.params:
            p = [np.zeros_like(param) for param in list(param_layer)]
            velocity.append(p)

        if verbose:
            print("Epoch {0}".format(i + 1))

        # iterate over mini batches
        for X_mini, y_mini in tqdm(minibatches):

            loss, grads = net.train_step(X_mini, y_mini)
            loss_batch.append(loss)
            update(velocity, net.params, grads,
                            learning_rate=learning_rate, mu=mu)

        for X_mini_val, y_mini_val in tqdm(minibatches_val):
            val_loss, _ = net.train_step(X_mini, y_mini)
            val_loss_batch.append(val_loss)


        # accuracy of model at end of epoch after all mini batch updates   

        if verbose:
            m_train = X_train.shape[0]
            m_val = X_val.shape[0]
            y_train_pred = np.array([], dtype="int64")
            y_val_pred = np.array([], dtype="int64")

            for i in range(0, m_train, minibatch_size):
                X_tr = X_train[i:i + minibatch_size, : ]
                y_tr = y_train[i:i + minibatch_size, ]
                y_train_pred = np.append(y_train_pred, net.predict(X_tr))

            for i in range(0, m_val, minibatch_size):
                X_va = X_val[i:i + minibatch_size, : ]
                y_va = y_val[i:i + minibatch_size, ]
                y_val_pred = np.append(y_val_pred, net.predict(X_va))

            train_acc = check_accuracy(y_train, y_train_pred)
            val_acc = check_accuracy(y_val, y_val_pred)

            mean_train_loss = sum(loss_batch) / float(len(loss_batch))
            mean_val_loss = sum(val_loss_batch) / float(len(val_loss_batch))


            # early stopping with patience = 5 on val loss

            if len(val_loss_epoch) == 0:
                val_loss_epoch.append(mean_val_loss)
            else:
                for j in val_loss_epoch[-5:]:
                    if mean_val_loss > j:
                        c += 1
                    else:
                        c = 0
                if c > 5:
                    print('Early stopping')
                    return net
                else:
                    c = 0
                    val_loss_epoch.append(mean_val_loss)    


            print("Loss = {0} | Training Accuracy = {1} | Val Loss = {2} | Val Accuracy = {3}".format(
                mean_train_loss, train_acc, mean_val_loss, val_acc))
    return net


In [45]:

# Get preprocessed training and validation data

X_train = np.array([
    [1, 2, 1, 2],
    [2, 4, 2, 4],
    [2, 1, 2, 1],
    [4, 2, 4, 2],
])

y_train = np.array([0, 1, 0, 1])
X_val = X_train.copy()
y_val = y_train.copy()

print(X_train.shape)
print(X_val.shape)


# define neural net
model = NN()

# add some layers
model.add_layer(Linear(X_train.shape[1], 100))
model.add_layer(ReLU())
model.add_layer(Linear(100, 100))
model.add_layer(ReLU())
model.add_layer(Linear(100, 2))

model = train(model, X_train , y_train, minibatch_size=4, epoch=100,
           learning_rate=0.1, X_val=X_val, y_val=y_val)




100%|██████████| 1/1 [00:00<00:00, 28.58it/s]
100%|██████████| 1/1 [00:00<00:00, 44.69it/s]
100%|██████████| 1/1 [00:00<00:00, 93.09it/s]
100%|██████████| 1/1 [00:00<00:00, 41.53it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

(4, 4)
(4, 4)
Epoch 1
Loss = 0.6847141240483717 | Training Accuracy = 0.5 | Val Loss = 0.675468781449579 | Val Accuracy = 0.5
Epoch 2
Loss = 0.675468781449579 | Training Accuracy = 0.5 | Val Loss = 0.6690872950978174 | Val Accuracy = 0.5
Epoch 3


100%|██████████| 1/1 [00:00<00:00, 43.30it/s]
100%|██████████| 1/1 [00:00<00:00, 41.18it/s]
100%|██████████| 1/1 [00:00<00:00, 39.47it/s]
100%|██████████| 1/1 [00:00<00:00, 43.56it/s]
100%|██████████| 1/1 [00:00<00:00, 43.14it/s]
100%|██████████| 1/1 [00:00<00:00, 43.96it/s]
100%|██████████| 1/1 [00:00<00:00, 43.14it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.6690872950978174 | Training Accuracy = 0.5 | Val Loss = 0.6627959585567144 | Val Accuracy = 0.5
Epoch 4
Loss = 0.6627959585567144 | Training Accuracy = 0.5 | Val Loss = 0.6565707991653421 | Val Accuracy = 0.5
Epoch 5
Loss = 0.6565707991653421 | Training Accuracy = 0.5 | Val Loss = 0.6504001709657109 | Val Accuracy = 0.5
Epoch 6


100%|██████████| 1/1 [00:00<00:00, 40.77it/s]
100%|██████████| 1/1 [00:00<00:00, 42.16it/s]
100%|██████████| 1/1 [00:00<00:00, 42.90it/s]
100%|██████████| 1/1 [00:00<00:00, 42.30it/s]
100%|██████████| 1/1 [00:00<00:00, 42.22it/s]
100%|██████████| 1/1 [00:00<00:00, 43.23it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.6504001709657109 | Training Accuracy = 0.5 | Val Loss = 0.6442802421362464 | Val Accuracy = 0.5
Epoch 7
Loss = 0.6442802421362464 | Training Accuracy = 0.5 | Val Loss = 0.6381843849785025 | Val Accuracy = 0.5
Epoch 8
Loss = 0.6381843849785025 | Training Accuracy = 0.5 | Val Loss = 0.6321222500579095 | Val Accuracy = 0.5
Epoch 9


100%|██████████| 1/1 [00:00<00:00, 43.67it/s]
100%|██████████| 1/1 [00:00<00:00, 43.72it/s]
100%|██████████| 1/1 [00:00<00:00, 42.51it/s]
100%|██████████| 1/1 [00:00<00:00, 43.89it/s]
100%|██████████| 1/1 [00:00<00:00, 42.38it/s]
100%|██████████| 1/1 [00:00<00:00, 40.93it/s]

Loss = 0.6321222500579095 | Training Accuracy = 0.5 | Val Loss = 0.6260909800175898 | Val Accuracy = 0.5
Epoch 10
Loss = 0.6260909800175898 | Training Accuracy = 0.5 | Val Loss = 0.6200941629344972 | Val Accuracy = 0.5
Epoch 11
Loss = 0.6200941629344972 | Training Accuracy = 0.5 | Val Loss = 0.6141272329730562 | Val Accuracy = 0.5
Epoch 12



100%|██████████| 1/1 [00:00<00:00, 42.05it/s]
100%|██████████| 1/1 [00:00<00:00, 45.73it/s]
100%|██████████| 1/1 [00:00<00:00, 75.03it/s]
100%|██████████| 1/1 [00:00<00:00, 117.51it/s]
100%|██████████| 1/1 [00:00<00:00, 46.51it/s]
100%|██████████| 1/1 [00:00<00:00, 48.15it/s]
100%|██████████| 1/1 [00:00<00:00, 513.32it/s]
100%|██████████| 1/1 [00:00<00:00, 174.68it/s]
100%|██████████| 1/1 [00:00<00:00, 111.10it/s]
100%|██████████| 1/1 [00:00<00:00, 210.49it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.6141272329730562 | Training Accuracy = 0.5 | Val Loss = 0.6081905817198308 | Val Accuracy = 0.5
Epoch 13
Loss = 0.6081905817198308 | Training Accuracy = 0.5 | Val Loss = 0.6022678969802266 | Val Accuracy = 0.5
Epoch 14
Loss = 0.6022678969802266 | Training Accuracy = 0.5 | Val Loss = 0.5963631524270542 | Val Accuracy = 0.5
Epoch 15
Loss = 0.5963631524270542 | Training Accuracy = 0.5 | Val Loss = 0.5904729939702433 | Val Accuracy = 0.5
Epoch 16
Loss = 0.5904729939702433 | Training Accuracy = 0.5 | Val Loss = 0.5845944494627041 | Val Accuracy = 0.5
Epoch 17


100%|██████████| 1/1 [00:00<00:00, 72.08it/s]
100%|██████████| 1/1 [00:00<00:00, 247.89it/s]
100%|██████████| 1/1 [00:00<00:00, 295.69it/s]
100%|██████████| 1/1 [00:00<00:00, 94.09it/s]
100%|██████████| 1/1 [00:00<00:00, 146.20it/s]
100%|██████████| 1/1 [00:00<00:00, 111.67it/s]
100%|██████████| 1/1 [00:00<00:00, 112.35it/s]
100%|██████████| 1/1 [00:00<00:00, 121.27it/s]
100%|██████████| 1/1 [00:00<00:00, 142.17it/s]
100%|██████████| 1/1 [00:00<00:00, 228.88it/s]
100%|██████████| 1/1 [00:00<00:00, 436.91it/s]
100%|██████████| 1/1 [00:00<00:00, 103.44it/s]

Loss = 0.5845944494627041 | Training Accuracy = 0.5 | Val Loss = 0.5787247783386 | Val Accuracy = 0.5
Epoch 18
Loss = 0.5787247783386 | Training Accuracy = 0.5 | Val Loss = 0.5728616030813889 | Val Accuracy = 0.5
Epoch 19
Loss = 0.5728616030813889 | Training Accuracy = 0.5 | Val Loss = 0.5670027279624712 | Val Accuracy = 0.5
Epoch 20
Loss = 0.5670027279624712 | Training Accuracy = 0.5 | Val Loss = 0.5611451861476011 | Val Accuracy = 0.5
Epoch 21
Loss = 0.5611451861476011 | Training Accuracy = 0.5 | Val Loss = 0.5552875242588289 | Val Accuracy = 0.5
Epoch 22
Loss = 0.5552875242588289 | Training Accuracy = 0.5 | Val Loss = 0.5494279108907204 | Val Accuracy = 0.5
Epoch 23



100%|██████████| 1/1 [00:00<00:00, 138.13it/s]
100%|██████████| 1/1 [00:00<00:00, 134.19it/s]
100%|██████████| 1/1 [00:00<00:00, 203.54it/s]
100%|██████████| 1/1 [00:00<00:00, 188.53it/s]
100%|██████████| 1/1 [00:00<00:00, 203.04it/s]
100%|██████████| 1/1 [00:00<00:00, 84.71it/s]
100%|██████████| 1/1 [00:00<00:00, 141.30it/s]
100%|██████████| 1/1 [00:00<00:00, 460.86it/s]
100%|██████████| 1/1 [00:00<00:00, 87.34it/s]
100%|██████████| 1/1 [00:00<00:00, 243.43it/s]
100%|██████████| 1/1 [00:00<00:00, 192.58it/s]
100%|██████████| 1/1 [00:00<00:00, 43.07it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.5494279108907204 | Training Accuracy = 0.5 | Val Loss = 0.5435646674733633 | Val Accuracy = 0.5
Epoch 24
Loss = 0.5435646674733633 | Training Accuracy = 0.5 | Val Loss = 0.5376973618864757 | Val Accuracy = 0.5
Epoch 25
Loss = 0.5376973618864757 | Training Accuracy = 1.0 | Val Loss = 0.5318227755472206 | Val Accuracy = 1.0
Epoch 26
Loss = 0.5318227755472206 | Training Accuracy = 1.0 | Val Loss = 0.5259363571864359 | Val Accuracy = 1.0
Epoch 27
Loss = 0.5259363571864359 | Training Accuracy = 1.0 | Val Loss = 0.5199406866832365 | Val Accuracy = 1.0
Epoch 28
Loss = 0.5199406866832365 | Training Accuracy = 1.0 | Val Loss = 0.5140276963299152 | Val Accuracy = 1.0
Epoch 29


100%|██████████| 1/1 [00:00<00:00, 45.54it/s]
100%|██████████| 1/1 [00:00<00:00, 42.49it/s]
100%|██████████| 1/1 [00:00<00:00, 45.72it/s]
100%|██████████| 1/1 [00:00<00:00, 201.99it/s]
100%|██████████| 1/1 [00:00<00:00, 164.17it/s]
100%|██████████| 1/1 [00:00<00:00, 182.88it/s]
100%|██████████| 1/1 [00:00<00:00, 125.98it/s]
100%|██████████| 1/1 [00:00<00:00, 164.86it/s]
100%|██████████| 1/1 [00:00<00:00, 80.33it/s]
100%|██████████| 1/1 [00:00<00:00, 355.51it/s]
100%|██████████| 1/1 [00:00<00:00, 137.14it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.5140276963299152 | Training Accuracy = 1.0 | Val Loss = 0.508112836453486 | Val Accuracy = 1.0
Epoch 30
Loss = 0.508112836453486 | Training Accuracy = 1.0 | Val Loss = 0.5021864402842487 | Val Accuracy = 1.0
Epoch 31
Loss = 0.5021864402842487 | Training Accuracy = 1.0 | Val Loss = 0.4962361264876561 | Val Accuracy = 1.0
Epoch 32
Loss = 0.4962361264876561 | Training Accuracy = 1.0 | Val Loss = 0.490277272232778 | Val Accuracy = 1.0
Epoch 33
Loss = 0.490277272232778 | Training Accuracy = 1.0 | Val Loss = 0.4843059240695413 | Val Accuracy = 1.0
Epoch 34
Loss = 0.4843059240695413 | Training Accuracy = 1.0 | Val Loss = 0.4783218655342962 | Val Accuracy = 1.0
Epoch 35


100%|██████████| 1/1 [00:00<00:00, 86.64it/s]
100%|██████████| 1/1 [00:00<00:00, 138.65it/s]
100%|██████████| 1/1 [00:00<00:00, 280.67it/s]
100%|██████████| 1/1 [00:00<00:00, 229.03it/s]
100%|██████████| 1/1 [00:00<00:00, 134.74it/s]
100%|██████████| 1/1 [00:00<00:00, 225.89it/s]
100%|██████████| 1/1 [00:00<00:00, 171.15it/s]
100%|██████████| 1/1 [00:00<00:00, 90.34it/s]
100%|██████████| 1/1 [00:00<00:00, 177.59it/s]
100%|██████████| 1/1 [00:00<00:00, 268.38it/s]
100%|██████████| 1/1 [00:00<00:00, 172.53it/s]
100%|██████████| 1/1 [00:00<00:00, 209.24it/s]
100%|██████████| 1/1 [00:00<00:00, 58.72it/s]
100%|██████████| 1/1 [00:00<00:00, 91.38it/s]
100%|██████████| 1/1 [00:00<00:00, 82.43it/s]

Loss = 0.4783218655342962 | Training Accuracy = 1.0 | Val Loss = 0.4723250616753573 | Val Accuracy = 1.0
Epoch 36
Loss = 0.4723250616753573 | Training Accuracy = 1.0 | Val Loss = 0.4663156306655387 | Val Accuracy = 1.0
Epoch 37
Loss = 0.4663156306655387 | Training Accuracy = 1.0 | Val Loss = 0.4602938424419696 | Val Accuracy = 1.0
Epoch 38
Loss = 0.4602938424419696 | Training Accuracy = 1.0 | Val Loss = 0.45426011988950166 | Val Accuracy = 1.0
Epoch 39
Loss = 0.45426011988950166 | Training Accuracy = 1.0 | Val Loss = 0.4482150406078141 | Val Accuracy = 1.0
Epoch 40
Loss = 0.4482150406078141 | Training Accuracy = 1.0 | Val Loss = 0.44217608617621873 | Val Accuracy = 1.0
Epoch 41
Loss = 0.44217608617621873 | Training Accuracy = 1.0 | Val Loss = 0.43611822542674195 | Val Accuracy = 1.0
Epoch 42



100%|██████████| 1/1 [00:00<00:00, 314.56it/s]
100%|██████████| 1/1 [00:00<00:00, 304.80it/s]
100%|██████████| 1/1 [00:00<00:00, 234.65it/s]
100%|██████████| 1/1 [00:00<00:00, 178.57it/s]
100%|██████████| 1/1 [00:00<00:00, 161.60it/s]
100%|██████████| 1/1 [00:00<00:00, 96.39it/s]
100%|██████████| 1/1 [00:00<00:00, 298.97it/s]
100%|██████████| 1/1 [00:00<00:00, 151.26it/s]
100%|██████████| 1/1 [00:00<00:00, 136.14it/s]
100%|██████████| 1/1 [00:00<00:00, 261.05it/s]
100%|██████████| 1/1 [00:00<00:00, 222.97it/s]
100%|██████████| 1/1 [00:00<00:00, 121.36it/s]
100%|██████████| 1/1 [00:00<00:00, 279.64it/s]
100%|██████████| 1/1 [00:00<00:00, 139.02it/s]
100%|██████████| 1/1 [00:00<00:00, 290.95it/s]
100%|██████████| 1/1 [00:00<00:00, 130.55it/s]

Loss = 0.43611822542674195 | Training Accuracy = 1.0 | Val Loss = 0.43004206143760254 | Val Accuracy = 1.0
Epoch 43
Loss = 0.43004206143760254 | Training Accuracy = 1.0 | Val Loss = 0.42396015622036415 | Val Accuracy = 1.0
Epoch 44
Loss = 0.42396015622036415 | Training Accuracy = 1.0 | Val Loss = 0.41790263233523095 | Val Accuracy = 1.0
Epoch 45
Loss = 0.41790263233523095 | Training Accuracy = 1.0 | Val Loss = 0.41190407755654934 | Val Accuracy = 1.0
Epoch 46
Loss = 0.41190407755654934 | Training Accuracy = 1.0 | Val Loss = 0.4057579769021639 | Val Accuracy = 1.0
Epoch 47
Loss = 0.4057579769021639 | Training Accuracy = 1.0 | Val Loss = 0.3996352807660049 | Val Accuracy = 1.0
Epoch 48
Loss = 0.3996352807660049 | Training Accuracy = 1.0 | Val Loss = 0.39353313562626574 | Val Accuracy = 1.0
Epoch 49
Loss = 0.39353313562626574 | Training Accuracy = 1.0 | Val Loss = 0.3874696914090189 | Val Accuracy = 1.0
Epoch 50



100%|██████████| 1/1 [00:00<00:00, 237.30it/s]
100%|██████████| 1/1 [00:00<00:00, 48.10it/s]
100%|██████████| 1/1 [00:00<00:00, 126.24it/s]
100%|██████████| 1/1 [00:00<00:00, 77.94it/s]
100%|██████████| 1/1 [00:00<00:00, 85.83it/s]
100%|██████████| 1/1 [00:00<00:00, 191.06it/s]
100%|██████████| 1/1 [00:00<00:00, 130.18it/s]
100%|██████████| 1/1 [00:00<00:00, 264.67it/s]
100%|██████████| 1/1 [00:00<00:00, 276.25it/s]
100%|██████████| 1/1 [00:00<00:00, 408.13it/s]
100%|██████████| 1/1 [00:00<00:00, 144.70it/s]
100%|██████████| 1/1 [00:00<00:00, 184.19it/s]

Loss = 0.3874696914090189 | Training Accuracy = 1.0 | Val Loss = 0.3814430957233613 | Val Accuracy = 1.0
Epoch 51
Loss = 0.3814430957233613 | Training Accuracy = 1.0 | Val Loss = 0.37535529008373913 | Val Accuracy = 1.0
Epoch 52
Loss = 0.37535529008373913 | Training Accuracy = 1.0 | Val Loss = 0.36929108283024226 | Val Accuracy = 1.0
Epoch 53
Loss = 0.36929108283024226 | Training Accuracy = 1.0 | Val Loss = 0.363239937764867 | Val Accuracy = 1.0
Epoch 54
Loss = 0.363239937764867 | Training Accuracy = 1.0 | Val Loss = 0.3572046736311055 | Val Accuracy = 1.0
Epoch 55
Loss = 0.3572046736311055 | Training Accuracy = 1.0 | Val Loss = 0.35118821397321537 | Val Accuracy = 1.0
Epoch 56



100%|██████████| 1/1 [00:00<00:00, 184.19it/s]
100%|██████████| 1/1 [00:00<00:00, 148.68it/s]
100%|██████████| 1/1 [00:00<00:00, 131.20it/s]
100%|██████████| 1/1 [00:00<00:00, 130.81it/s]
100%|██████████| 1/1 [00:00<00:00, 115.04it/s]
100%|██████████| 1/1 [00:00<00:00, 247.79it/s]
100%|██████████| 1/1 [00:00<00:00, 449.45it/s]
100%|██████████| 1/1 [00:00<00:00, 181.75it/s]
100%|██████████| 1/1 [00:00<00:00, 218.34it/s]
100%|██████████| 1/1 [00:00<00:00, 237.13it/s]
100%|██████████| 1/1 [00:00<00:00, 255.05it/s]
100%|██████████| 1/1 [00:00<00:00, 168.43it/s]
100%|██████████| 1/1 [00:00<00:00, 375.40it/s]
100%|██████████| 1/1 [00:00<00:00, 113.58it/s]
100%|██████████| 1/1 [00:00<00:00, 263.71it/s]
100%|██████████| 1/1 [00:00<00:00, 310.92it/s]

Loss = 0.35118821397321537 | Training Accuracy = 1.0 | Val Loss = 0.3451939099793846 | Val Accuracy = 1.0
Epoch 57
Loss = 0.3451939099793846 | Training Accuracy = 1.0 | Val Loss = 0.3392293899237293 | Val Accuracy = 1.0
Epoch 58
Loss = 0.3392293899237293 | Training Accuracy = 1.0 | Val Loss = 0.333298543147059 | Val Accuracy = 1.0
Epoch 59
Loss = 0.333298543147059 | Training Accuracy = 1.0 | Val Loss = 0.3274849789538702 | Val Accuracy = 1.0
Epoch 60
Loss = 0.3274849789538702 | Training Accuracy = 1.0 | Val Loss = 0.3216502889467744 | Val Accuracy = 1.0
Epoch 61
Loss = 0.3216502889467744 | Training Accuracy = 1.0 | Val Loss = 0.3158549598721529 | Val Accuracy = 1.0
Epoch 62
Loss = 0.3158549598721529 | Training Accuracy = 1.0 | Val Loss = 0.3101118401950801 | Val Accuracy = 1.0
Epoch 63
Loss = 0.3101118401950801 | Training Accuracy = 1.0 | Val Loss = 0.3043205749847308 | Val Accuracy = 1.0
Epoch 64



100%|██████████| 1/1 [00:00<00:00, 216.07it/s]
100%|██████████| 1/1 [00:00<00:00, 102.08it/s]
100%|██████████| 1/1 [00:00<00:00, 398.05it/s]
100%|██████████| 1/1 [00:00<00:00, 359.19it/s]
100%|██████████| 1/1 [00:00<00:00, 186.25it/s]
100%|██████████| 1/1 [00:00<00:00, 153.16it/s]
100%|██████████| 1/1 [00:00<00:00, 143.10it/s]
100%|██████████| 1/1 [00:00<00:00, 141.02it/s]
100%|██████████| 1/1 [00:00<00:00, 335.44it/s]
100%|██████████| 1/1 [00:00<00:00, 484.22it/s]
100%|██████████| 1/1 [00:00<00:00, 174.89it/s]
100%|██████████| 1/1 [00:00<00:00, 38.37it/s]

Loss = 0.3043205749847308 | Training Accuracy = 1.0 | Val Loss = 0.2985832366402187 | Val Accuracy = 1.0
Epoch 65
Loss = 0.2985832366402187 | Training Accuracy = 1.0 | Val Loss = 0.2929100894372488 | Val Accuracy = 1.0
Epoch 66
Loss = 0.2929100894372488 | Training Accuracy = 1.0 | Val Loss = 0.2873159005682637 | Val Accuracy = 1.0
Epoch 67
Loss = 0.2873159005682637 | Training Accuracy = 1.0 | Val Loss = 0.28179807942183566 | Val Accuracy = 1.0
Epoch 68
Loss = 0.28179807942183566 | Training Accuracy = 1.0 | Val Loss = 0.2763970336115318 | Val Accuracy = 1.0
Epoch 69
Loss = 0.2763970336115318 | Training Accuracy = 1.0 | Val Loss = 0.27122320999177324 | Val Accuracy = 1.0
Epoch 70



100%|██████████| 1/1 [00:00<00:00, 140.20it/s]
100%|██████████| 1/1 [00:00<00:00, 70.47it/s]
100%|██████████| 1/1 [00:00<00:00, 141.52it/s]
100%|██████████| 1/1 [00:00<00:00, 62.86it/s]
100%|██████████| 1/1 [00:00<00:00, 56.86it/s]
100%|██████████| 1/1 [00:00<00:00, 265.48it/s]
100%|██████████| 1/1 [00:00<00:00, 62.07it/s]
100%|██████████| 1/1 [00:00<00:00, 87.35it/s]
100%|██████████| 1/1 [00:00<00:00, 60.17it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.27122320999177324 | Training Accuracy = 1.0 | Val Loss = 0.26698950739042016 | Val Accuracy = 1.0
Epoch 71
Loss = 0.26698950739042016 | Training Accuracy = 1.0 | Val Loss = 0.262147682888571 | Val Accuracy = 1.0
Epoch 72
Loss = 0.262147682888571 | Training Accuracy = 1.0 | Val Loss = 0.2585590891371026 | Val Accuracy = 1.0
Epoch 73
Loss = 0.2585590891371026 | Training Accuracy = 1.0 | Val Loss = 0.2534368845754335 | Val Accuracy = 1.0
Epoch 74
Loss = 0.2534368845754335 | Training Accuracy = 1.0 | Val Loss = 0.25001683383611734 | Val Accuracy = 1.0
Epoch 75


100%|██████████| 1/1 [00:00<00:00, 53.49it/s]
100%|██████████| 1/1 [00:00<00:00, 197.74it/s]
100%|██████████| 1/1 [00:00<00:00, 67.34it/s]
100%|██████████| 1/1 [00:00<00:00, 132.63it/s]
100%|██████████| 1/1 [00:00<00:00, 97.12it/s]
100%|██████████| 1/1 [00:00<00:00, 189.72it/s]
100%|██████████| 1/1 [00:00<00:00, 82.63it/s]
100%|██████████| 1/1 [00:00<00:00, 99.84it/s]
100%|██████████| 1/1 [00:00<00:00, 218.72it/s]
100%|██████████| 1/1 [00:00<00:00, 145.49it/s]
100%|██████████| 1/1 [00:00<00:00, 60.99it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.25001683383611734 | Training Accuracy = 1.0 | Val Loss = 0.24449831025099122 | Val Accuracy = 1.0
Epoch 76
Loss = 0.24449831025099122 | Training Accuracy = 1.0 | Val Loss = 0.2432071271732523 | Val Accuracy = 1.0
Epoch 77
Loss = 0.2432071271732523 | Training Accuracy = 1.0 | Val Loss = 0.23724331368386004 | Val Accuracy = 1.0
Epoch 78
Loss = 0.23724331368386004 | Training Accuracy = 1.0 | Val Loss = 0.23555377292319116 | Val Accuracy = 1.0
Epoch 79
Loss = 0.23555377292319116 | Training Accuracy = 1.0 | Val Loss = 0.22994759951690777 | Val Accuracy = 1.0
Epoch 80


100%|██████████| 1/1 [00:00<00:00, 53.75it/s]
100%|██████████| 1/1 [00:00<00:00, 77.32it/s]
100%|██████████| 1/1 [00:00<00:00, 397.11it/s]
100%|██████████| 1/1 [00:00<00:00, 325.85it/s]
100%|██████████| 1/1 [00:00<00:00, 261.64it/s]
100%|██████████| 1/1 [00:00<00:00, 173.88it/s]
100%|██████████| 1/1 [00:00<00:00, 239.11it/s]
100%|██████████| 1/1 [00:00<00:00, 135.47it/s]
100%|██████████| 1/1 [00:00<00:00, 304.22it/s]
100%|██████████| 1/1 [00:00<00:00, 149.73it/s]
100%|██████████| 1/1 [00:00<00:00, 143.51it/s]
100%|██████████| 1/1 [00:00<00:00, 117.52it/s]
100%|██████████| 1/1 [00:00<00:00, 235.74it/s]
  0%|          | 0/1 [00:00<?, ?it/s]

Loss = 0.22994759951690777 | Training Accuracy = 1.0 | Val Loss = 0.23197366524627702 | Val Accuracy = 1.0
Epoch 81
Loss = 0.23197366524627702 | Training Accuracy = 1.0 | Val Loss = 0.22554830311888244 | Val Accuracy = 1.0
Epoch 82
Loss = 0.22554830311888244 | Training Accuracy = 1.0 | Val Loss = 0.23242521124116186 | Val Accuracy = 1.0
Epoch 83
Loss = 0.23242521124116186 | Training Accuracy = 1.0 | Val Loss = 0.22154811340942782 | Val Accuracy = 1.0
Epoch 84
Loss = 0.22154811340942782 | Training Accuracy = 1.0 | Val Loss = 0.23260503087773998 | Val Accuracy = 1.0
Epoch 85
Loss = 0.23260503087773998 | Training Accuracy = 1.0 | Val Loss = 0.21854131574214608 | Val Accuracy = 1.0
Epoch 86
Loss = 0.21854131574214608 | Training Accuracy = 1.0 | Val Loss = 0.2362781688769759 | Val Accuracy = 1.0
Epoch 87


100%|██████████| 1/1 [00:00<00:00, 142.34it/s]
100%|██████████| 1/1 [00:00<00:00, 131.95it/s]
100%|██████████| 1/1 [00:00<00:00, 268.97it/s]
100%|██████████| 1/1 [00:00<00:00, 62.36it/s]
100%|██████████| 1/1 [00:00<00:00, 179.02it/s]
100%|██████████| 1/1 [00:00<00:00, 218.10it/s]
100%|██████████| 1/1 [00:00<00:00, 131.68it/s]
100%|██████████| 1/1 [00:00<00:00, 115.29it/s]
100%|██████████| 1/1 [00:00<00:00, 319.20it/s]
100%|██████████| 1/1 [00:00<00:00, 115.54it/s]
100%|██████████| 1/1 [00:00<00:00, 194.37it/s]
100%|██████████| 1/1 [00:00<00:00, 106.71it/s]
100%|██████████| 1/1 [00:00<00:00, 167.03it/s]
100%|██████████| 1/1 [00:00<00:00, 189.80it/s]

Loss = 0.2362781688769759 | Training Accuracy = 1.0 | Val Loss = 0.21358718741315022 | Val Accuracy = 1.0
Epoch 88
Loss = 0.21358718741315022 | Training Accuracy = 1.0 | Val Loss = 0.22592985844357003 | Val Accuracy = 1.0
Epoch 89
Loss = 0.22592985844357003 | Training Accuracy = 1.0 | Val Loss = 0.2034612160325174 | Val Accuracy = 1.0
Epoch 90
Loss = 0.2034612160325174 | Training Accuracy = 1.0 | Val Loss = 0.21676220783560746 | Val Accuracy = 1.0
Epoch 91
Loss = 0.21676220783560746 | Training Accuracy = 1.0 | Val Loss = 0.19280531823601466 | Val Accuracy = 1.0
Epoch 92
Loss = 0.19280531823601466 | Training Accuracy = 1.0 | Val Loss = 0.1969110655540892 | Val Accuracy = 1.0
Epoch 93



100%|██████████| 1/1 [00:00<00:00, 412.42it/s]
100%|██████████| 1/1 [00:00<00:00, 103.26it/s]
100%|██████████| 1/1 [00:00<00:00, 118.08it/s]
100%|██████████| 1/1 [00:00<00:00, 210.89it/s]
100%|██████████| 1/1 [00:00<00:00, 103.33it/s]
100%|██████████| 1/1 [00:00<00:00, 332.80it/s]
100%|██████████| 1/1 [00:00<00:00, 70.79it/s]
100%|██████████| 1/1 [00:00<00:00, 95.91it/s]
100%|██████████| 1/1 [00:00<00:00, 317.37it/s]
100%|██████████| 1/1 [00:00<00:00, 190.36it/s]
100%|██████████| 1/1 [00:00<00:00, 332.99it/s]
100%|██████████| 1/1 [00:00<00:00, 150.97it/s]

Loss = 0.1969110655540892 | Training Accuracy = 1.0 | Val Loss = 0.1796401347055328 | Val Accuracy = 1.0
Epoch 94
Loss = 0.1796401347055328 | Training Accuracy = 1.0 | Val Loss = 0.1797182149730849 | Val Accuracy = 1.0
Epoch 95
Loss = 0.1797182149730849 | Training Accuracy = 1.0 | Val Loss = 0.16826837244524384 | Val Accuracy = 1.0
Epoch 96
Loss = 0.16826837244524384 | Training Accuracy = 1.0 | Val Loss = 0.1661202946381182 | Val Accuracy = 1.0
Epoch 97
Loss = 0.1661202946381182 | Training Accuracy = 1.0 | Val Loss = 0.15856545289348267 | Val Accuracy = 1.0
Epoch 98
Loss = 0.15856545289348267 | Training Accuracy = 1.0 | Val Loss = 0.1556272311083304 | Val Accuracy = 1.0
Epoch 99



100%|██████████| 1/1 [00:00<00:00, 121.29it/s]
100%|██████████| 1/1 [00:00<00:00, 132.12it/s]


Loss = 0.1556272311083304 | Training Accuracy = 1.0 | Val Loss = 0.15042958031156511 | Val Accuracy = 1.0
Epoch 100
Loss = 0.15042958031156511 | Training Accuracy = 1.0 | Val Loss = 0.1473825340878747 | Val Accuracy = 1.0


## Mnist training

In [47]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [48]:
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

In [49]:
y = y.astype(np.int32)
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.25,
                                                  shuffle=True,
                                                  random_state=0)


In [59]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [60]:
# visualize data

def vis(img, label):
    plt.imshow(img, cmap='gray')
    plt.title(label)
    plt.axis('off')
    plt.show()

vis_idx = 1
vis(X_val[vis_idx].reshape(-1, 28), y_val[vis_idx])

<IPython.core.display.Javascript object>

In [55]:
print(X_train.shape)
print(X_val.shape)


# define neural net
model = NN()

# add some layers
model.add_layer(Linear(X.shape[1], 100))
model.add_layer(ReLU())
model.add_layer(Linear(100, 100))
model.add_layer(ReLU())
model.add_layer(Linear(100, 10))


model = train(model, X_train , y_train, minibatch_size=128, epoch=10,
           learning_rate=0.001, X_val=X_val, y_val=y_val)




(52500, 784)
(17500, 784)


  3%|▎         | 12/411 [00:00<00:03, 113.18it/s]

Epoch 1


100%|██████████| 411/411 [00:02<00:00, 152.99it/s]
100%|██████████| 137/137 [00:00<00:00, 442.79it/s]
  4%|▎         | 15/411 [00:00<00:02, 148.94it/s]

Loss = 1.1420023682410956 | Training Accuracy = 0.9049714285714285 | Val Loss = 0.3875152644645091 | Val Accuracy = 0.8949142857142857
Epoch 2


100%|██████████| 411/411 [00:02<00:00, 148.16it/s]
100%|██████████| 137/137 [00:00<00:00, 341.22it/s]
  4%|▎         | 15/411 [00:00<00:02, 149.00it/s]

Loss = 0.35939623320347 | Training Accuracy = 0.9221714285714285 | Val Loss = 0.37786606294773195 | Val Accuracy = 0.9110857142857143
Epoch 3


100%|██████████| 411/411 [00:03<00:00, 130.06it/s]
100%|██████████| 137/137 [00:00<00:00, 306.44it/s]
  4%|▍         | 16/411 [00:00<00:02, 151.49it/s]

Loss = 0.30354123682398443 | Training Accuracy = 0.9327047619047619 | Val Loss = 0.24354684825784356 | Val Accuracy = 0.9215428571428571
Epoch 4


100%|██████████| 411/411 [00:02<00:00, 139.67it/s]
100%|██████████| 137/137 [00:00<00:00, 451.01it/s]
  4%|▎         | 15/411 [00:00<00:02, 148.47it/s]

Loss = 0.27435821175710207 | Training Accuracy = 0.9415238095238095 | Val Loss = 0.21460904383617696 | Val Accuracy = 0.9289714285714286
Epoch 5


100%|██████████| 411/411 [00:02<00:00, 138.56it/s]
100%|██████████| 137/137 [00:00<00:00, 428.64it/s]
  3%|▎         | 14/411 [00:00<00:02, 139.65it/s]

Loss = 0.25493889554212595 | Training Accuracy = 0.9474095238095238 | Val Loss = 0.16863544165394634 | Val Accuracy = 0.9331428571428572
Epoch 6


100%|██████████| 411/411 [00:02<00:00, 145.37it/s]
100%|██████████| 137/137 [00:00<00:00, 437.66it/s]
  1%|          | 3/411 [00:00<00:14, 29.11it/s]

Loss = 0.23949449856027757 | Training Accuracy = 0.956704761904762 | Val Loss = 0.1594868977917347 | Val Accuracy = 0.9411428571428572
Epoch 7


100%|██████████| 411/411 [00:02<00:00, 145.82it/s]
100%|██████████| 137/137 [00:00<00:00, 243.12it/s]
  3%|▎         | 13/411 [00:00<00:03, 125.11it/s]

Loss = 0.226968501697708 | Training Accuracy = 0.9589333333333333 | Val Loss = 0.1443400719258107 | Val Accuracy = 0.9432
Epoch 8


100%|██████████| 411/411 [00:02<00:00, 141.38it/s]
100%|██████████| 137/137 [00:00<00:00, 261.67it/s]
  3%|▎         | 14/411 [00:00<00:02, 134.65it/s]

Loss = 0.21690351248020487 | Training Accuracy = 0.9620190476190477 | Val Loss = 0.13664973892734303 | Val Accuracy = 0.9462857142857143
Epoch 9


100%|██████████| 411/411 [00:02<00:00, 149.11it/s]
100%|██████████| 137/137 [00:00<00:00, 423.51it/s]
  3%|▎         | 14/411 [00:00<00:02, 137.88it/s]

Loss = 0.20875436002957148 | Training Accuracy = 0.9644761904761905 | Val Loss = 0.11619665169314077 | Val Accuracy = 0.9470285714285714
Epoch 10


100%|██████████| 411/411 [00:03<00:00, 125.31it/s]
100%|██████████| 137/137 [00:00<00:00, 372.07it/s]


Loss = 0.20097941819627788 | Training Accuracy = 0.967352380952381 | Val Loss = 0.11332531069233018 | Val Accuracy = 0.9490285714285714


In [66]:
# visualize prediction 

vis_idx = 1000

pred = model.predict(X_val[vis_idx])
vis(X_val[vis_idx].reshape(-1, 28), pred[0])

<IPython.core.display.Javascript object>

# TODO:
1) Add computational graph instead of list, model saving/loading, more optimizers, shedulers, loss functions, operations, gpu support, utility tools ...

... Or simply use Pytorch/TF/whatever