In [1]:
import numpy as np 
import _pickle as cPickle
import gzip
import os
from sklearn.utils import shuffle
from tqdm import tqdm

In [2]:
%config IPCompleter.greedy=True

In [91]:
# utility functions

def one_hot_encoded(y, num_class):
    n = y.shape[0]
    onehot = np.zeros((n, num_class), dtype="int32")
    for i in range(n):
        idx = y[i]
        onehot[i][idx] = 1
    return onehot


def check_accuracy(y_true, y_pred):
    return np.mean(y_pred == y_true)  # both are not one hot encoded


def softmax(x):
    exp_x = np.exp(x - np.max(x, axis = 1, keepdims = True)) # (n, m)
    return (exp_x / np.sum(exp_x, axis = 1, keepdims = True))
    
# l2 regularization
def l2_reg(layers, lam=0.001):
    reg_loss = 0.0
    for layer in layers:
        if hasattr(layer, 'W'):
            reg_loss += 0.5 * lam * np.sum(layer.W * layer.W)
    return reg_loss


# l2 regularization grad
def delta_l2_reg(layers, grads, lam=0.001):
    for layer, grad in zip(layers, reversed(grads)):
        if hasattr(layer, 'W'):
            grad[0] += lam * layer.W
    return grads


In [4]:
def eval_numerical_gradient(f, x, verbose=False, h=0.00001):
    """Evaluates gradient df/dx via finite differences:
    df/dx ~ (f(x+h) - f(x-h)) / 2h
    Adopted from https://github.com/ddtm/dl-course/
    """
    fx = f(x) # evaluate function value at original point
    grad = np.zeros_like(x)
    # iterate over all indexes in x
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:

        # evaluate function at x+h
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h # increment by h
        fxph = f(x) # evalute f(x + h)
        x[ix] = oldval - h
        fxmh = f(x) # evaluate f(x - h)
        x[ix] = oldval # restore

        # compute the partial derivative with centered formula
        grad[ix] = (fxph - fxmh) / (2 * h) # the slope
        if verbose:
            print (ix, grad[ix])
        it.iternext() # step to next dimension

    return grad

In [112]:
class ReLU():
    def __init__(self):
        self.params = []
        self.gradInput = None

    def forward(self, X, mode):
        # YOUR CODE HERE
        self.X = X
        return np.maximum(X, 0)
    
    def backward(self, dout, mode):
        # YOUR CODE HERE
        self.gradInput = dout.copy() # dout - апстрим градиент
        self.gradInput[self.X <= 0] = 0
        
        return self.gradInput, []

In [29]:
points = np.linspace(-1, 1, 10*12).reshape([10, 12])
relu = ReLU()
f = lambda x: relu.forward(x, mode='train').sum(axis=1).sum()
res = f(points)
numeric_grads = eval_numerical_gradient(f, points)
print(numeric_grads)
inp_grad = np.ones(shape=(10, 12))
grads = relu.backward(inp_grad, mode='train')[0]
assert np.allclose(grads, numeric_grads, rtol=1e-3, atol=0)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


In [92]:
class Linear():

    def __init__(self, in_size, out_size):

        # Xavier init
        self.W = np.random.randn(in_size, out_size) / np.sqrt(in_size + out_size/ 2.)
        self.b = np.zeros((1, out_size))
        self.params = [self.W, self.b]
        self.gradW = None
        self.gradB = None
        self.gradInput = None

    def forward(self, X, mode):
        # YOUR CODE HERE
        self.X = X
        output = self.X.dot(self.W) + self.b
        return output
    
    def backward(self, dout, mode):
        # YOUR CODE HERE
#         dout - примеры Х
        self.gradW = self.X.T.dot(dout)
        self.gradB = np.mean(dout, axis = 0)
        self.gradInput = dout.dot(self.W.T)
        return self.gradInput, [self.gradW, self.gradB]

In [52]:
points = np.linspace(-1, 1, 10*12).reshape([10, 12])
linear = Linear(12, 5)
f = lambda x: linear.forward(x, mode='train').sum(axis=1).sum()
res = f(points)
numeric_grads = eval_numerical_gradient(f, points)
print(numeric_grads)
inp_grad = np.ones(shape=(10, 5))
grads = linear.backward(inp_grad, mode='train')[0]
assert np.allclose(grads, numeric_grads, rtol=1e-3, atol=0)

[[ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.89768952  0.57644973  0.1149812  -0.58963592]
 [ 0.04121997 -0.59634506 -0.32930073  0.18663707 -0.47043889  0.53259939
  -0.0373722   1.01738207  0.897

In [93]:
class CrossEntropyLoss(object):
#     https://deepnotes.io/softmax-crossentropy

    def forward(self, X, y):
        # YOUR CODE HERE
#       X(3, 4) 3 rows, 4 col
#       y(3, 1)
        self.m = y.shape[0] # m - кол-во примеров, y(m, 1), y.shape=[m-строк, 1-колонка],  m - const
        self.p = softmax(X) # p - (3, 2) 2 classes
#         -log(p_true)
# кросс-энтропия для мультиклассовой классификации
        cross_entropy = -np.log(self.p[range(self.m), y]) #cross_entropy(1, m), cross_entropy - (1, 3)
        loss = np.sum(cross_entropy) / self.m
        return loss
    
    def backward(self, X, y):
        # YOUR CODE HERE
        dx = self.p.copy()
        dx[range(self.m), y] -= 1
        dx /= self.m
        return dx

## NN implementation

In [117]:
class NN:

    def __init__(self, loss_func=CrossEntropyLoss(), mode = 'train'):
    
        self.layers = []
        self.params = []
        self.loss_func = loss_func
        self.grads = []
        self.mode = mode

    def add_layer(self,layer):
        self.layers.append(layer)
        self.params.append(layer.params)

    def forward(self, X):
        print('0', X)
        for layer in self.layers:
            X = layer.forward(X, self.mode)
            print('1', X)
        return X

    def backward(self, dout):
        self.clear_grad_param()
        for layer in reversed(self.layers):
            dout, grad = layer.backward(dout, self.mode)
            self.grads.append(grad)
        return self.grads

    def train_step(self, X, y):
#         print('0', X) (3, 4)
        out = self.forward(X)
        loss = self.loss_func.forward(out,y)
#         print('1', out) (3, 2)
        dout = self.loss_func.backward(out,y)
        loss += l2_reg(self.layers)
        grads = self.backward(dout)
        grads = delta_l2_reg(self.layers, grads)
        return loss, grads

    def predict(self, X):
        X = self.forward(X)
        return np.argmax(softmax(X), axis=1)


    def dispGradParam():
        print(self.grads)
    

    def clear_grad_param(self):
        self.grads = []

In [118]:
# SGD with momentum
def update(velocity, params, grads, learning_rate=0.001, mu=0.9):
    for v, p, g, in zip(velocity, params, reversed(grads)):
        for i in range(len(g)):
            v[i] = mu * v[i] + learning_rate * g[i]
            p[i] -= v[i]


# get minibatches
def minibatch(X, y, minibatch_size):
    n = X.shape[0]
    minibatches = []
    X, y = shuffle(X, y)

    for i in range(0, n , minibatch_size):
        X_batch = X[i:i + minibatch_size, ...]
        y_batch = y[i:i + minibatch_size, ...]

        minibatches.append((X_batch, y_batch))
    return minibatches


In [119]:
def train(net, X_train, y_train, minibatch_size, epoch, learning_rate, mu=0.9,
                 verbose=True, X_val=None, y_val=None, nesterov=True):
    val_loss_epoch = []
    minibatches = minibatch(X_train, y_train, minibatch_size)
    minibatches_val = minibatch(X_val, y_val, minibatch_size)
    c = 0 

    for i in range(epoch):
        loss_batch = []
        val_loss_batch = []
        velocity = []
        for param_layer in net.params:
            p = [np.zeros_like(param) for param in list(param_layer)]
            velocity.append(p)

        if verbose:
            print("Epoch {0}".format(i + 1))

        # iterate over mini batches
        for X_mini, y_mini in tqdm(minibatches):

            loss, grads = net.train_step(X_mini, y_mini)
            loss_batch.append(loss)
            update(velocity, net.params, grads,
                            learning_rate=learning_rate, mu=mu)

        for X_mini_val, y_mini_val in tqdm(minibatches_val):
            val_loss, _ = net.train_step(X_mini, y_mini)
            val_loss_batch.append(val_loss)


        # accuracy of model at end of epoch after all mini batch updates   

        if verbose:
            m_train = X_train.shape[0]
            m_val = X_val.shape[0]
            y_train_pred = np.array([], dtype="int64")
            y_val_pred = np.array([], dtype="int64")

            for i in range(0, m_train, minibatch_size):
                X_tr = X_train[i:i + minibatch_size, : ]
                y_tr = y_train[i:i + minibatch_size, ]
                y_train_pred = np.append(y_train_pred, net.predict(X_tr))

            for i in range(0, m_val, minibatch_size):
                X_va = X_val[i:i + minibatch_size, : ]
                y_va = y_val[i:i + minibatch_size, ]
                y_val_pred = np.append(y_val_pred, net.predict(X_va))

            train_acc = check_accuracy(y_train, y_train_pred)
            val_acc = check_accuracy(y_val, y_val_pred)

            mean_train_loss = sum(loss_batch) / float(len(loss_batch))
            mean_val_loss = sum(val_loss_batch) / float(len(val_loss_batch))


            # early stopping with patience = 5 on val loss

            if len(val_loss_epoch) == 0:
                val_loss_epoch.append(mean_val_loss)
            else:
                for j in val_loss_epoch[-5:]:
                    if mean_val_loss > j:
                        c += 1
                    else:
                        c = 0
                if c > 5:
                    print('Early stopping')
                    return net
                else:
                    c = 0
                    val_loss_epoch.append(mean_val_loss)    


            print("Loss = {0} | Training Accuracy = {1} | Val Loss = {2} | Val Accuracy = {3}".format(
                mean_train_loss, train_acc, mean_val_loss, val_acc))
    return net


In [122]:

# Get preprocessed training and validation data

X_train = np.array([
    [1, 2, 1, 2],
    [2, 4, 2, 4],
    [2, 1, 2, 1],
#     [4, 2, 4, 2],
])

y_train = np.array([0, 1, 0])
X_val = X_train.copy()
y_val = y_train.copy()

print(X_train.shape)
print(X_val.shape)


# define neural net
model = NN()

# add some layers
# YOUR CODE HERE
model.add_layer(Linear(4, 10))
model.add_layer(ReLU())

model.add_layer(Linear(10, 100))
model.add_layer(ReLU())

model.add_layer(Linear(100, 2))

model = train(model, X_train , y_train, minibatch_size=4, epoch=10,
           learning_rate=0.001, X_val=X_val, y_val=y_val)




  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

(3, 4)
(3, 4)
Epoch 1
0 [[1 2 1 2]
 [2 1 2 1]
 [2 4 2 4]]
1 [[-0.30770505  1.52829203  0.91186921  0.35304932 -0.58492541 -0.05842863
   1.19154208  1.15979249  0.22739964  0.51616224  1.93064404  0.65287408
   0.12097362 -0.02523942  0.87559968  0.5293284   0.61203781 -0.91741981
   0.02203616 -0.44816322]
 [-1.16281283  0.50000812 -0.25917544  0.7983146   0.14464546 -0.7454694
   1.46351355  0.12485205 -0.76148193  0.68795916  2.15273857  0.43319261
  -0.37533438 -0.42505939  0.81207161  0.68576382  0.50167556 -1.09776936
   0.2959638  -1.09295816]
 [-0.6154101   3.05658406  1.82373842  0.70609865 -1.16985082 -0.11685726
   2.38308417  2.31958499  0.45479928  1.03232448  3.86128809  1.30574817
   0.24194725 -0.05047884  1.75119937  1.05865679  1.22407562 -1.83483961
   0.04407232 -0.89632643]]
0 [[-0.30770505  1.52829203  0.91186921  0.35304932 -0.58492541 -0.05842863
   1.19154208  1.15979249  0.22739964  0.51616224  1.93064404  0.65287408
   0.12097362 -0.02523942  0.87559968  0.52




ValueError: shapes (3,20) and (10,100) not aligned: 20 (dim 1) != 10 (dim 0)

## Mnist training

In [55]:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

In [56]:
X, y = fetch_openml('mnist_784', version=1, return_X_y=True)

In [65]:
y = y.astype(np.int32)
X_train, X_val, y_train, y_val = train_test_split(X, y, 
                                                  test_size=0.25,
                                                  shuffle=True,
                                                  random_state=0)




In [105]:
# X_train = X_train.to_numpy()
# X_val = X_val.to_numpy()
y_train = y_train.to_numpy()
y_val = y_val.to_numpy()

In [66]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [107]:
# visualize data

def vis(img, label):
    plt.imshow(img, cmap='gray')
    plt.title(label)
    plt.axis('off')
    plt.show()

vis_idx = 2
vis(X_val[vis_idx].reshape(-1, 28), y_val[vis_idx])

<IPython.core.display.Javascript object>

In [112]:
print(X_train.shape)
print(X_val.shape)


# define neural net
model = NN()

# add some layers
# YOUR CODE HERE
model.add_layer(Linear(X.shape[1], 100))
model.add_layer(ReLU())

model.add_layer(Linear(100, 100))
model.add_layer(ReLU())

model.add_layer(Linear(100, 10))

model = train(model, X_train , y_train, minibatch_size=128, epoch=10, learning_rate=0.001, X_val=X_val, y_val=y_val)




(52500, 784)
(17500, 784)


  2%|█▌                                                                                | 8/411 [00:00<00:05, 75.50it/s]

Epoch 1


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:04<00:00, 101.53it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 306.02it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 86.58it/s]

Loss = 1.0084065711265628 | Training Accuracy = 0.9184761904761904 | Val Loss = 0.34976904521519697 | Val Accuracy = 0.9091428571428571
Epoch 2


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:04<00:00, 101.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 304.62it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 89.16it/s]

Loss = 0.3165736507510502 | Training Accuracy = 0.9419428571428572 | Val Loss = 0.2416673052187373 | Val Accuracy = 0.9276571428571428
Epoch 3


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:03<00:00, 103.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 185.13it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 87.43it/s]

Loss = 0.2636624918109807 | Training Accuracy = 0.9541333333333334 | Val Loss = 0.22899000156628424 | Val Accuracy = 0.9373142857142858
Epoch 4


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:04<00:00, 102.24it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 303.57it/s]
  2%|█▌                                                                                | 8/411 [00:00<00:05, 73.76it/s]

Loss = 0.23569676053639618 | Training Accuracy = 0.9593714285714285 | Val Loss = 0.2347103851578067 | Val Accuracy = 0.9402857142857143
Epoch 5


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:03<00:00, 106.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 281.18it/s]
  2%|█▉                                                                               | 10/411 [00:00<00:04, 93.51it/s]

Loss = 0.215880444413979 | Training Accuracy = 0.9645142857142858 | Val Loss = 0.21028766774396598 | Val Accuracy = 0.9451428571428572
Epoch 6


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:03<00:00, 105.83it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 265.14it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 83.38it/s]

Loss = 0.20172151934038504 | Training Accuracy = 0.9679809523809524 | Val Loss = 0.1995315820585262 | Val Accuracy = 0.948
Epoch 7


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:04<00:00, 102.25it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 282.63it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 84.16it/s]

Loss = 0.19053165471251673 | Training Accuracy = 0.9715428571428572 | Val Loss = 0.16568498491099934 | Val Accuracy = 0.9513142857142857
Epoch 8


100%|████████████████████████████████████████████████████████████████████████████████| 411/411 [00:04<00:00, 85.66it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 283.06it/s]
  2%|█▌                                                                                | 8/411 [00:00<00:05, 79.64it/s]

Loss = 0.18102290361296117 | Training Accuracy = 0.9735238095238096 | Val Loss = 0.15144307740076704 | Val Accuracy = 0.9516571428571429
Epoch 9


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:03<00:00, 104.35it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 270.37it/s]
  2%|█▊                                                                                | 9/411 [00:00<00:04, 88.29it/s]

Loss = 0.17285792527341803 | Training Accuracy = 0.9746095238095238 | Val Loss = 0.14575162131708194 | Val Accuracy = 0.9522285714285714
Epoch 10


100%|███████████████████████████████████████████████████████████████████████████████| 411/411 [00:03<00:00, 107.11it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 137/137 [00:00<00:00, 261.92it/s]


Loss = 0.1656912392314272 | Training Accuracy = 0.9777333333333333 | Val Loss = 0.1107158288552554 | Val Accuracy = 0.9541714285714286


In [120]:
# visualize prediction 

vis_idx = 9
pred = model.predict(X_val[vis_idx])
vis(X_val[vis_idx].reshape(-1, 28), pred[0])

<IPython.core.display.Javascript object>

# TODO:
1) Add computational graph instead of list, model saving/loading, more optimizers, shedulers loss functions, operations, gpu support, utility tools ...

... Or simply use Pytorch/TF/whatever