 # Experiments for the basics of nn and torch implementations

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from fastcore.test import test_close

torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f: ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin1')
x_train, y_train, x_valid, y_valid = map(torch.tensor, [x_train, y_train, x_valid, y_valid])

In [57]:
# Dataset info
n,m = x_train.shape
n, n_feats = x_train.shape
c = (y_train.max() + 1).item()
nh = n_hidden = 50

In [5]:
class Model(nn.Module):
    def __init__(self, n_in, nh, n_out):
        self.layers = [nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)]

    def __call__(self, x):
        for layer in self.layers: x = layer(x)
        return x

In [6]:
model = Model(m, nh, c)
preds = model(x_train)
preds.shape

torch.Size([50000, 10])

 ## Experimenting with the basics for logsoftmax -- yet again

In [7]:
# setup some vars
n_train = 64
bs = 4
n_classes = 10
probs = torch.randn((bs, n_classes))
y = torch.randint(0, 9, (bs,))

In [8]:
x = probs
a = x.max(1, keepdim=True).values
x_norm = x - a
numerator = x_norm
logsumexp = x_norm.exp().sum(1, keepdim=True).log()
numerator.shape, logsumexp.shape
logsoftmax = numerator - logsumexp
logsoftmax

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [9]:
# Check against a version that doesn't use the normalisation with max
(torch.exp(x) / torch.exp(x).sum(1, keepdim=True)).log()

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [10]:
F.log_softmax(x, 1)

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [11]:
# wrap into function
def log_softmax(x):
    x_norm = x - x.max(1, keepdim=True).values
    numerator = x_norm
    logsumexp = x_norm.exp().sum(1, keepdim=True).log()
    logsoftmax = numerator - logsumexp
    return logsoftmax

assert torch.allclose(log_softmax(probs), F.log_softmax(probs, 1)), 'did not find equality with torch in log_softmax implementation'

In [12]:
log_likelihood = log_softmax(probs)
preds = log_likelihood[range(log_likelihood.shape[0]), y]
nll = -preds.mean()

print(nll, F.nll_loss(F.log_softmax(probs, 1), y), F.cross_entropy(probs, y))

tensor(3.22) tensor(3.22) tensor(3.22)


In [13]:
# wrap into function
def cross_entropy(probs, targets):
    log_likelihood = log_softmax(probs)
    preds = log_likelihood[range(log_likelihood.shape[0]), targets]
    nll = -preds.mean()
    return nll

assert torch.allclose(cross_entropy(probs, y), F.cross_entropy(probs, y)), 'did not find equality with torch in cross_entropy implementation'

 # Basic training loop
 Can now just use the torch implementation of log_softmax and cross_entropy
 - Get model preds
 - compare against labels and calculate loss
 - calculate gradient of loss with respece to model params
 - update params

In [14]:
loss_func = F.cross_entropy

In [15]:
bs = 50
xb = x_train[:bs]
preds = model(xb)
preds[0], preds.shape

(tensor([-0.09, -0.21, -0.08,  0.10, -0.04,  0.08, -0.04, -0.03,  0.01,  0.06], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [16]:
yb = y_train[:bs]
yb.shape

torch.Size([50])

In [17]:
loss_func(preds, yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [18]:
# About the same as this for random init model and c classes
-np.log(1/c)

2.3025850929940455

In [19]:
preds.argmax(1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])

In [20]:
def accuracy(preds, labels): return (preds.argmax(dim=1) == labels).float().mean()

In [27]:
accuracy(preds, yb)

tensor(0.08)

In [28]:
lr = 0.5
epochs = 3

In [37]:
def report(loss, preds, yb): print(f'loss: {loss:.2f}, accuracy: {accuracy(preds, yb):.2f}')

In [38]:
xb, yb = x_train[:bs], y_train[:bs]
preds = model(xb)
report(loss_func(preds, yb), preds, yb)

loss: 0.21, accuracy: 0.94


In [39]:
for epoch in range(epochs):
    for i in range(0, n, bs):
        s = slice(i, min(n, i+bs))
        xb, yb = x_train[s], y_train[s]
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        with torch.no_grad():
            for layer in model.layers:
                if hasattr(layer, 'weight'):
                    layer.weight -= lr * layer.weight.grad
                    layer.bias -= lr * layer.bias.grad
                    layer.weight.grad.zero_()  # TODO test if can just set to None for simplicity 
                    layer.bias.grad.zero_()
        report(loss, preds, yb)

loss: 0.21, accuracy: 0.94
loss: 0.19, accuracy: 0.98
loss: 0.29, accuracy: 0.94
loss: 0.12, accuracy: 0.98
loss: 0.07, accuracy: 0.98
loss: 0.09, accuracy: 0.94
loss: 0.03, accuracy: 0.98
loss: 0.03, accuracy: 1.00
loss: 0.06, accuracy: 1.00
loss: 0.24, accuracy: 0.96
loss: 0.21, accuracy: 0.96
loss: 0.05, accuracy: 1.00
loss: 0.30, accuracy: 0.94
loss: 0.18, accuracy: 0.98
loss: 0.22, accuracy: 0.94
loss: 0.14, accuracy: 0.96
loss: 0.05, accuracy: 1.00
loss: 0.28, accuracy: 0.94
loss: 0.21, accuracy: 0.98
loss: 0.20, accuracy: 0.94
loss: 0.21, accuracy: 0.94
loss: 0.46, accuracy: 0.86
loss: 0.31, accuracy: 0.84
loss: 0.06, accuracy: 0.98
loss: 0.48, accuracy: 0.94
loss: 0.34, accuracy: 0.86
loss: 0.09, accuracy: 0.98
loss: 0.32, accuracy: 0.94
loss: 0.16, accuracy: 0.96
loss: 0.02, accuracy: 1.00
loss: 0.13, accuracy: 0.98
loss: 0.24, accuracy: 0.94
loss: 0.24, accuracy: 0.96
loss: 0.08, accuracy: 1.00
loss: 0.07, accuracy: 0.98
loss: 0.15, accuracy: 0.94
loss: 0.03, accuracy: 1.00
l

## Parameters in nn.Module

In [47]:
m1 = nn.Module()
m1.foo = nn.Linear(3, 4)
print(m1)
print(list(m1.parameters()))

Module(
  (foo): Linear(in_features=3, out_features=4, bias=True)
)
[Parameter containing:
tensor([[ 0.49,  0.11,  0.22],
        [-0.20,  0.09, -0.14],
        [ 0.48,  0.21,  0.56],
        [-0.01,  0.38,  0.07]], requires_grad=True), Parameter containing:
tensor([ 0.52, -0.18,  0.30,  0.37], requires_grad=True)]


In [52]:
list(m1.named_children())

[('foo', Linear(in_features=3, out_features=4, bias=True))]

In [76]:
class MLP(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_out)
        self.relu = nn.ReLU()
    
    def forward(self, x): return self.l2(self.relu(self.l1(x)))

In [81]:
model = MLP(n_feats, nh, c)
model

MLP(
  (l1): Linear(in_features=784, out_features=50, bias=True)
  (l2): Linear(in_features=50, out_features=10, bias=True)
  (relu): ReLU()
)

In [62]:
for name, layer in model.named_children(): print(f'{name}: {layer}')

l1: Linear(in_features=784, out_features=50, bias=True)
l2: Linear(in_features=50, out_features=10, bias=True)
relu: ReLU()


In [66]:
for p in model.parameters(): print(p.shape)

torch.Size([50, 784])
torch.Size([50])
torch.Size([10, 50])
torch.Size([10])


In [82]:
def fit():
    for epoch in range(3):
        for i in range(0, n, bs):
            s = slice(i, min(n, i+bs))
            xb, yb = x_train[s], y_train[s]
            preds = model(xb)
            loss = loss_func(preds, yb)
            loss.backward()
            with torch.no_grad():
                for p in model.parameters(): p -= lr * p.grad
                model.zero_grad()
        report(loss, preds, yb)

In [83]:
fit()

loss: 0.13, accuracy: 0.94
loss: 0.10, accuracy: 0.96
loss: 0.04, accuracy: 1.00
