# Neural Network

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from torch import tensor



In [2]:
# Display settings: grayscale colormap for images, compact two-decimal printing.
mpl.rcParams['image.cmap'] = 'gray'
np.set_printoptions(linewidth=140, precision=2)
torch.set_printoptions(linewidth=140, precision=2, sci_mode=False)

# Location of the gzip-compressed pickle holding the dataset splits.
path_data = Path('data')
path_gz = path_data / 'mnist.pkl.gz'

# NOTE(review): pickle.load can execute arbitrary code - only load a trusted copy of this file.
with gzip.open(path_gz, 'rb') as f:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert each loaded split into a torch tensor.
x_train, y_train, x_valid, y_valid = (tensor(arr) for arr in (x_train, y_train, x_valid, y_valid))

## Foundations version

### Basic architecture

In [4]:
# n = number of rows of x_train, m = number of columns; c = largest label value plus one.
n,m = x_train.shape
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

In [5]:
# Number of hidden units: the output width of w1 and the input width of w2.
nh = 50

In [6]:
# Layer 1 maps the m input columns to nh hidden units; layer 2 maps them to one output.
# Weights are standard-normal draws, biases start at zero.
w1, b1 = torch.randn(m, nh), torch.zeros(nh)
w2, b2 = torch.randn(nh, 1), torch.zeros(1)

In [7]:
def lin(x, w, b):
    """Affine layer: matrix-multiply `x` by `w`, then add `b`."""
    projected = x @ w
    return projected + b

In [8]:
# Apply the first linear layer to the validation inputs and show the result's shape.
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

### The Activation Function

The activation function provides non-linearity. ReLU does this by replacing every negative value with zero and leaving positive values unchanged.

In [9]:
def relu(x):
    """Rectified linear unit: every element of `x` below zero becomes zero."""
    floor = 0.
    return x.clamp_min(floor)

In [10]:
# First linear layer followed by ReLU on the validation inputs; negatives show up as 0.00.
t = relu(lin(x_valid, w1, b1))
t

tensor([[15.53, 14.29,  3.67,  ...,  0.00,  1.16,  0.00],
        [ 8.17,  0.00,  3.31,  ...,  0.00, 12.33, 11.94],
        [ 6.52,  0.00, 14.41,  ...,  0.00,  3.25,  0.74],
        ...,
        [ 8.92,  1.56,  1.90,  ...,  0.00,  8.35, 15.75],
        [ 3.06,  0.00,  9.02,  ...,  0.00,  3.50,  0.00],
        [ 0.00,  0.00,  8.24,  ...,  0.00,  3.28,  5.19]])

### The model
Tie the linear and non-linear layers together into a network.

In [11]:
def model(xb):
    """Run `xb` through linear -> ReLU -> linear using the notebook-level w1, b1, w2, b2."""
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [12]:
# Model output for the validation inputs; one column per row of x_valid.
res = model(x_valid)
res.shape

torch.Size([10000, 1])

We need to get rid of that trailing (,1), in order to use `mse`.

In [13]:
# Selecting column 0 drops the trailing unit axis.
res[:,0].shape

torch.Size([10000])

### Loss function: 

The loss function measures the error of the predictions. This value is used to adjust the weights and biases in order to reduce the error.



#### Mean squared error

[mse](https://en.wikipedia.org/wiki/Mean_squared_error)

$$\color{chocolate}{ MSE=\frac{1}{n} \sum_{i=1}^n \left(Y_i-\hat{Y_i}\right)^2 }$$

---


(Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use `mse` for now to keep things simple.)

In [14]:
def mse(output, targ):
    """Mean squared error between column 0 of `output` and the vector `targ`."""
    squared = (output[:, 0] - targ).pow(2)
    return squared.mean()

In [15]:
# Cast both target vectors to floating point.
y_train = y_train.float()
y_valid = y_valid.float()

In [16]:
# Model output for the training inputs.
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [17]:
# Mean squared error of the training predictions against the training targets.
mse(preds, y_train)

tensor(2588.87)

## The forward and backward passes

- The forward pass
Starting from the input layer, each layer combines its input with its weights and bias and passes the result through the activation function. The result is then forwarded to the next layer, and so on until the last layer, which outputs the prediction.

- The backward pass
After the prediction is made, the error is computed using the loss function. The gradient of this error is passed back to the previous layer using the chain rule from calculus, layer by layer, until it reaches the input layer.
These gradients are then used to adjust the weights and biases so that the next prediction has a smaller error.

- The forward-backward loop then starts again and repeats until the required number of epochs is reached.


### Gradients
The gradient tells us how the error changes as each parameter changes. Moving the parameters a small step against the gradient reduces the error, and repeating this moves the network towards its lowest error.

In [21]:
from sympy import symbols,diff
# Symbolic derivative of x**2 with respect to x.
x,y = symbols('x y')
diff(x**2, x)

2*x

In [20]:
def lin_grad(inp, out, w, b):
    """Back-propagate through `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the resulting
    gradients as `inp.g`, `w.g` and `b.g`. Returns nothing.
    """
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    # Sum over rows of the outer products inp[i] (x) out.g[i], written as a
    # single matmul so no (rows, n_in, n_out) intermediate is materialised.
    w.g = inp.t() @ out.g
    # The bias is added to every row, so its gradient sums over rows.
    b.g = out.g.sum(0)

In [22]:
def forward_and_backward(inp, targ):
    """Run one forward pass and one hand-written backward pass of the two-layer net.

    Uses the notebook-level parameters w1, b1, w2, b2. Nothing is returned; the
    gradients are stored as `.g` attributes on `inp`, `w1`, `b1`, `w2` and `b2`.
    """
    # forward pass:
    l1 = inp @ w1 + b1
    l2 = relu(l1)
    out = l2 @ w2 + b2
    # Named `err` so the sympy `diff` imported at notebook level is not shadowed.
    err = out[:,0]-targ
    # Loss of this batch, computed from this batch's own error. It is not needed
    # for the gradients below; it only completes the forward pass.
    loss = err.pow(2).mean()

    # backward pass:
    # d(mean(err^2))/d(out) = 2*err / number of rows, with a unit axis to match `out`.
    out.g = 2.*err[:,None] / inp.shape[0]
    lin_grad(l2, out, w2, b2)
    # ReLU passes the gradient through only where its input was positive.
    l1.g = (l1>0).float() * l2.g
    lin_grad(inp, l1, w1, b1)

In [23]:
# Populate the .g gradient attributes using the full training set.
forward_and_backward(x_train, y_train)

In [24]:
# Save for testing against later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

We cheat a little bit and use PyTorch autograd to check our results.

In [25]:
# Independent copies of the input and parameters, each tracked by autograd.
xt2, w12, w22, b12, b22 = (
    t.clone().requires_grad_(True) for t in (x_train, w1, w2, b1, b2)
)

In [26]:
def forward(inp, targ):
    """Forward pass using the autograd-tracked copies w12, b12, w22, b22; returns the MSE loss."""
    hidden = relu(inp @ w12 + b12)
    preds = hidden @ w22 + b22
    return mse(preds, targ)

In [27]:
# Let autograd fill in .grad on xt2, w12, w22, b12 and b22.
loss = forward(xt2, y_train)
loss.backward()

In [28]:
from fastcore.test import test_close

In [29]:
# Autograd's gradients must agree with the saved hand-computed ones to within eps.
test_close(w22.grad, w2g, eps=0.01)
test_close(b22.grad, b2g, eps=0.01)
test_close(w12.grad, w1g, eps=0.01)
test_close(b12.grad, b1g, eps=0.01)
test_close(xt2.grad, ig , eps=0.01)

### Refactor model

### Layers as classes

In Python, layers and other components can be defined as classes for convenience.

In [30]:
class Relu():
    """ReLU layer that remembers its input and output for the backward pass."""

    def __call__(self, inp):
        self.inp, self.out = inp, inp.clamp_min(0.)
        return self.out

    def backward(self):
        """Set `inp.g` to `out.g` where the input was positive and to zero elsewhere."""
        passes = (self.inp > 0).float()
        self.inp.g = passes * self.out.g

In [31]:
class Lin():
    """Affine layer `inp @ w + b` with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        self.inp = inp
        self.out = self.inp @ self.w + self.b
        return self.out

    def backward(self):
        """Store gradients on `inp.g`, `w.g` and `b.g` from the upstream `out.g`."""
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        self.w.g = self.inp.t() @ upstream
        self.b.g = upstream.sum(0)

In [33]:
class Mse():
    """Mean squared error between squeezed predictions and a target vector."""

    def __call__(self, inp, targ):
        self.inp, self.targ = inp, targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        """Store the loss gradient on `inp.g`, with a trailing unit axis added back."""
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2. * residual.unsqueeze(-1) / self.targ.shape[0]

In [34]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss on top."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1,b1), Relu(), Lin(w2,b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        """Feed `x` through each layer in order and return the loss against `targ`."""
        acts = x
        for layer in self.layers:
            acts = layer(acts)
        return self.loss(acts, targ)

    def backward(self):
        """Back-propagate: the loss first, then the layers from last to first."""
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [35]:
# Rebinds `model` (previously a function) to a class-based model sharing w1, b1, w2, b2.
model = Model(w1, b1, w2, b2)

In [36]:
# Time the forward pass (layers plus loss) on the training set.
%time loss = model(x_train, y_train)

CPU times: user 486 ms, sys: 12 ms, total: 498 ms
Wall time: 127 ms


In [37]:
# Time the hand-written backward pass.
%time model.backward()

CPU times: user 637 ms, sys: 310 ms, total: 947 ms
Wall time: 289 ms


In [38]:
# The class-based backward pass must reproduce the gradients saved earlier.
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Module.forward()

In [39]:
class Module():
    """Base class for layers: records call arguments and output for `backward`.

    Subclasses implement `forward(*args)` to compute the output and
    `bwd(out, *args)` to store gradients.
    """

    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        """Compute the layer output; must be overridden.

        Accepts the call arguments so that a missing override reports
        'not implemented' instead of failing with a TypeError on arity.
        """
        raise NotImplementedError('not implemented')

    def bwd(self, out, *args):
        """Store gradients from the recorded output and inputs; must be overridden."""
        raise NotImplementedError('not implemented')

    def backward(self):
        """Run `bwd` with the output and arguments recorded by the most recent call."""
        self.bwd(self.out, *self.args)

In [40]:
class Relu(Module):
    """ReLU expressed through the `Module` forward/bwd protocol."""

    def forward(self, inp):
        return inp.clamp_min(0.)

    def bwd(self, out, inp):
        positive = (inp > 0).float()
        inp.g = positive * out.g

In [41]:
class Lin(Module):
    """Affine layer `inp @ w + b` expressed through the `Module` forward/bwd protocol."""

    def __init__(self, w, b): self.w,self.b = w,b

    def forward(self, inp): return inp@self.w + self.b

    def bwd(self, out, inp):
        """Store gradients on `inp.g`, `self.w.g` and `self.b.g` from the upstream `out.g`."""
        # Use the `out` argument rather than reaching into self.out, so the
        # method honours what it is given (as Relu.bwd and Mse.bwd do).
        inp.g = out.g @ self.w.t()
        self.w.g = inp.t() @ out.g
        self.b.g = out.g.sum(0)

In [42]:
class Mse(Module):
    """Mean squared error expressed through the `Module` forward/bwd protocol."""

    def forward(self, inp, targ):
        residual = inp.squeeze() - targ
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        residual = inp.squeeze() - targ
        inp.g = 2 * residual.unsqueeze(-1) / targ.shape[0]

In [43]:
# Rebuild the model so it picks up the Module-based Relu, Lin and Mse classes.
model = Model(w1, b1, w2, b2)

In [44]:
# Time the forward pass of the Module-based model on the training set.
%time loss = model(x_train, y_train)

CPU times: user 412 ms, sys: 7.79 ms, total: 420 ms
Wall time: 124 ms


In [45]:
# Time the backward pass of the Module-based model.
%time model.backward()

CPU times: user 618 ms, sys: 299 ms, total: 917 ms
Wall time: 279 ms


In [46]:
# The Module-based backward pass must reproduce the gradients saved earlier.
test_close(w2g, w2.g, eps=0.01)
test_close(b2g, b2.g, eps=0.01)
test_close(w1g, w1.g, eps=0.01)
test_close(b1g, b1.g, eps=0.01)
test_close(ig, x_train.g, eps=0.01)

### Pytorch Gradient

PyTorch provides the autograd feature, which computes gradients automatically.

In [47]:
from torch import nn
import torch.nn.functional as F

In [48]:
class Linear(nn.Module):
    """Affine layer `inp @ w + b` whose gradients are computed by autograd.

    `w` has shape (n_in, n_out) and is drawn from a standard normal; `b` has
    shape (n_out,) and starts at zero.
    """

    def __init__(self, n_in, n_out):
        super().__init__()
        # nn.Parameter registers the tensors with the module (so parameters(),
        # state_dict() and .to() include them) and enables gradient tracking.
        self.w = nn.Parameter(torch.randn(n_in,n_out))
        self.b = nn.Parameter(torch.zeros(n_out))

    def forward(self, inp): return inp@self.w + self.b

In [49]:
class Model(nn.Module):
    """Linear -> ReLU -> Linear network whose call returns the MSE loss against `targ`."""

    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (not a plain list) registers the layers as submodules.
        self.layers = nn.ModuleList([Linear(n_in,nh), nn.ReLU(), Linear(nh,n_out)])

    # Defined as `forward` so nn.Module.__call__ dispatches to it; `model(x, targ)`
    # keeps working and the standard call machinery is not bypassed.
    def forward(self, x, targ):
        for l in self.layers: x = l(x)
        # targ gets a trailing unit axis so its shape lines up with the (rows, 1) output.
        return F.mse_loss(x, targ[:,None])

In [50]:
# Build the autograd model (m inputs, nh hidden units, 1 output), compute the
# loss on the training set and back-propagate it.
model = Model(m, nh, 1)
loss = model(x_train, y_train)
loss.backward()

In [51]:
# Inspect the gradient that autograd stored on the first layer's bias.
l0 = model.layers[0]
l0.b.grad

tensor([ -15.35,   -7.43,   -9.78,   -8.08,    2.08, -122.04,  -28.62,   67.35,  -10.98,  -38.75,   19.60,    2.30,  -40.48,   28.39,
         -77.67,   58.37,   64.10,    9.94, -117.07,   15.46,   61.28,  116.95,  -12.61,   98.53,   -4.03,   -2.10,   -1.81,  -11.04,
           9.51,    3.32,  -19.60,    6.68,  -18.60,   -8.50,  -31.13,   30.38,  -53.98,  110.97,  -14.69,   73.12,  105.20,   57.39,
          46.34,   46.75,  -60.66,   53.70,  -26.48,  102.15,   51.43,   49.50])