In [6]:
#| default_exp model

# model

> A simple API for creating and using playing cards

In [7]:
#| export
import torch
from fastcore.utils import *
from fastcore.net import *
from tinypytorch.core import *
from tinypytorch.data import get_sample_data

In [8]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

#### Samples Data

In [9]:
xb, yb = get_sample_data()

### Model

In [10]:
# #| export
# class Model():
#     def __init__(self, m, nh: "number of hidden"):
#         self.m, self.nh = m, nh
    
#     def forward(self, xb: 'training batch'):
#         w1, b1, w2, b2 = self.initialize_parameters()
#         l1 = Lin(xb, w1, b1)
#         l2 = ReLU(l1)
#         l3 = Lin(l2, w2, b2)
        
#         return l3
    
#     def initialize_parameters(self):
#         # kaiming init / he init for relu
#         w1 = torch.randn(self.m, self.nh)*math.sqrt(2./self.m)
#         b1 = torch.zeros(self.nh)
#         w2 = torch.randn(self.nh, 1)/math.sqrt(self.nh)
#         b2 = torch.zeros
#         return w1, b1, w2, b2

In [11]:
# #| export
# class Model():
#     def __init__(self, )

In [12]:
# #| export
# class Model():
#     def __init__(self, w1, b1, w2, b2):
#         self.layers = []

In [13]:
# #| export
# def get_model():
#     model = nn.Sequential(nn.Linear(m, nh), nn.ReLU(), nn.Linear(nh, 10))
#     return model

In [14]:
#| export
def initialize_parameters(m, nh: "number of hidden layers"):
    # kaiming init / he init for relu
    w1 = torch.randn(m, nh)*math.sqrt(2./m)
    b1 = torch.zeros(nh)
    w2 = torch.randn(nh, 1)/math.sqrt(nh)
    b2 = torch.zeros(1)
    return w1, b1, w2, b2

In [15]:
#| export
class Model():
    
    def __init__(self, n_in, nh, n_out):
        self.w1, self.b1 = self.init_params(n_in, nh)
        self.w2, self.b2 = self.init_params(nh, n_out)
        self.layers = [Lin(self.w1, self.b1), ReLU(), Lin(self.w2, self.b2)]

    def __call__(self, x):
        for l in self.layers:
            # print("Model.__call__")
            # print(f"l={l}")
            x = l(x)
            # print(f"x.shape={x.shape}")
        
        return x
    
    def init_params(self, n, nh: "number of hidden layers"):
        w = torch.randn(n, nh)*math.sqrt(2./n)
        b = torch.zeros(nh)
        return w, b
    
    def backward(self):
        #self.loss.backward()
        for l in reversed(self.layers):
            l.backward()

### Module

In [16]:
#| export
class Module():
    def __call__(self, *args):
        self.args = args # it will call self.inp, and self.targ...
        self.out = self.forward(*args)
        return self.out
    
    def forward(self):
        raise Exception("Not implemented")
    
    def backward(self):
        self.bwd(self.out, *self.args)

### Activation Functions

#### ReLU Function

In [17]:
#| export
class ReLU(Module):
    # def __call__(self, inp: 'input'):
    #     self.inp = inp
    #     self.out = inp.clamp_min(0.) - 0.5
    #     return self.out
    
    def forward(self, inp: 'input'):
        return inp.clamp_min(0.) - 0.5
    
    def bwd(self, out, inp):
        inp.g = (inp > 0).float() * out.g

### Loss Functions

#### Mean Squared Error

In [18]:
#| export
class MSE(Module):
    
#     def __call__(self, inp: 'input', targ: 'target'):
#         self.inp = inp
#         self.targ = targ
        
#         print("MSE.forward")
#         print(f"inp.shape={inp.shape}")
#         print(f"inp.squeeze().shape={inp.squeeze(-1).shape}")
#         print(f"targ.shape={targ.shape}")
        
#         temp = (inp.squeeze() - targ)
#         print(f"temp={temp}")
        
#         return torch.pow(temp, 2).mean()
    
    def forward(self, inp, targ):
        
        # print("MSE.forward")
        # print(f"inp.shape={inp.shape}")
        # print(f"inp.squeeze().shape={inp.squeeze(-1).shape}")
        # print(f"targ.shape={targ.shape}")
        
        temp = (inp.squeeze() - targ)
        # print(f"temp={temp}")
        
        return torch.pow(temp, 2).mean()

    def bwd(self, out, inp, targ):
        inp.g = 2*(inp.squeeze() - targ).unsqueeze(-1) / targ.shape[0]
        
    # def backward(self):
    #     self.inp.g = 2. * (self.inp.squeeze() - self.targ).unsqueeze(-1) / self.targ.shape[0]

In [19]:
mse = MSE()

In [20]:
tensor_a = torch.randn(size=[4, 1])

In [21]:
tensor_a

tensor([[ 1.3878],
        [-2.8296],
        [ 1.1481],
        [-0.3518]])

In [22]:
tensor_b = torch.rand_like(tensor_a)

In [23]:
tensor_b

tensor([[0.0700],
        [0.1494],
        [0.7702],
        [0.4558]])

In [24]:
mse.forward(tensor_a, tensor_b)

tensor(3.1673)

#### Cross-entropy Loss

Cross-entropy is a derivation from Kull-back divergence

##### Log Softmax

$$\hbox{softmax(x)}_{i} = \frac{e^{x_{i}}}{e^{x_{0}} + e^{x_{1}} + \cdots + e^{x_{n-1}}}$$

In [25]:
#| export
def log_softmax(x, dim=-1):
    return (x.exp()/(x.exp().sum(dim, keepdim=True))).log()

In [26]:
sm = log_softmax(xb)

In [33]:
#check_near(log_softmax())

##### Negative Log Likelihood

In [34]:
targ = torch.tensor([5, 0, 4])

In [35]:
sm_pred = torch.tensor([[0, 1, 2], [5, 0, 4]])

In [36]:
sm_pred

tensor([[0, 1, 2],
        [5, 0, 4]])

In [37]:
sm_pred.shape

torch.Size([2, 3])

In [38]:
sm_pred[[0, 1], [2, 1]]

tensor([2, 0])

##### Negative Log Likelihood

In [39]:
yb.shape

torch.Size([64])

In [40]:
xb.shape

torch.Size([64, 784])

In [41]:
xb[range(yb.shape[0]), yb]

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [44]:
#nll(xb, yb)

In [45]:
#| export
def nll(inp: 'input', targ: 'target'):
    print(f"inp.shape={inp.shape}")
    print(f"targ.shape={targ.shape}")
    return -inp[range(targ.shape[0]), targ].mean()

##### Cross-entropy loss

In [46]:
#| export
def cross_entropy(pred: 'prediction', targ: 'target'):
    sm_pred = log_softmax(pred)
    return nll(sm_pred, targ)

In [47]:
#| export
class CrossEntropy(Module):
    def __init__(self, debug=False):
        self.debug = debug
        pass

    def __call__(self, pred: 'predictions', targ: 'targets'):
        
        if self.debug == True: print(f"pred.shape={pred.shape}")
        
        sm_pred = log_softmax(pred)
        
        if self.debug == True: print(f"sm_pred.shape={sm_pred.shape}")
        
        return nll(sm_pred, targ)
    
    def bwd(self):
        pass

In [48]:
pred = torch.tensor([[0, 1, 2], [5, 0, 4]])

In [49]:
targ = torch.tensor([2, 1])

In [50]:
cross_entropy(pred, targ)

inp.shape=torch.Size([2, 3])
targ.shape=torch.Size([2])


tensor(2.8629)

### Linear layer

In [51]:
# #| export
# class Lin():
#     def __init__(self, w, b):
#         self.w = w
#         self.b = b
    
#     def __call__(self, inp):
#         self.inp = inp
#         self.out = inp @ self.w + self.b
#         return self.out
    
#     def backward(self):
#         self.inp.g = self.out.g @ self.w.t()
#         self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
#         self.b.g = self.out.g.sum(0)

#### The gradient of Linear Layer

In [52]:
#| export
class Lin(Module):
    def __init__(self, w: 'weight', b: 'bias'):
        self.w, self.b = w, b
    
    # def __call__(self, inp):
    #     self.inp = inp
    #     self.out = inp @ self.w + self.b
    #     return self.out
    
    def forward(self, inp: 'input'):
        print("Lin.forward")
        print(f"inp={inp.shape}")
        print(f"w={self.w.shape}")
        print(f"b={self.b.shape}")
        
        output = inp @ self.w + self.b
        print(f"output.shape={output.shape}")
        return output
    
    def bwd(self, out, inp):
        inp.g = out.g @ self.w.t()
        # self.w.g = torch.einsum("bi,bj->ij", inp, out.g)
        self.w.g = inp.t() @ out.g
        # self.w.g = (self.inp.unsqueeze(-1) * self.out.g.unsqueeze(1)).sum(0)
        self.b.g = out.g.sum(0)
        #self.w.g = torch.einsum("bi,bj->ij", self.inp, self.out.g)
        #self.b.g = out.g.sum(0)