 # Experiments for the basics of nn and torch implementations

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl,numpy as np,matplotlib.pyplot as plt
from pathlib import Path

In [2]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from fastcore.test import test_close

# Display and reproducibility settings
torch.set_printoptions(precision=2, linewidth=140, sci_mode=False)
torch.manual_seed(1)
mpl.rcParams['image.cmap'] = 'gray'

# Load the pickled, gzipped dataset splits and convert each array to a tensor.
# NOTE: pickle.load executes arbitrary code; only use it on a file you trust.
path_data = Path('data')
path_gz = path_data/'mnist.pkl.gz'
with gzip.open(path_gz, 'rb') as f:
    (x_train, y_train), (x_valid, y_valid), _ = pickle.load(f, encoding='latin1')
x_train, y_train, x_valid, y_valid = (torch.tensor(arr) for arr in (x_train, y_train, x_valid, y_valid))

In [4]:
# Dataset dimensions taken from the training split
n, m = x_train.shape           # rows and columns of x_train
c = y_train.max().item() + 1   # largest label + 1 (assumes labels run 0..c-1)
nh = 50                        # hidden layer width

In [5]:
class Model(nn.Module):
    """Small MLP: Linear(n_in, nh) -> ReLU -> Linear(nh, n_out).

    Args:
        n_in: number of input features.
        nh: width of the hidden layer.
        n_out: number of outputs.
    """

    def __init__(self, n_in, nh, n_out):
        # nn.Module must be initialised before submodules are assigned,
        # otherwise nothing gets registered.
        super().__init__()
        # ModuleList (rather than a plain list) registers each layer, so
        # parameters(), to(), state_dict() etc. can see the weights.
        self.layers = nn.ModuleList([nn.Linear(n_in, nh), nn.ReLU(), nn.Linear(nh, n_out)])

    def forward(self, x):
        """Apply every layer in order and return the final activations."""
        # Define forward (not __call__) so nn.Module.__call__ still runs hooks.
        for layer in self.layers: x = layer(x)
        return x

In [6]:
# Build the model and push the full training set through it as a smoke test
model = Model(n_in=m, nh=nh, n_out=c)
preds = model(x_train)
preds.shape

torch.Size([50000, 10])

 ## Experimenting with the basics for logsoftmax -- yet again

In [7]:
# setup some vars
n_train = 64
bs = 4
n_classes = 10
# Despite the name, these are unnormalised scores (logits), not probabilities
probs = torch.randn((bs, n_classes))
# randint's upper bound is exclusive, so use n_classes to cover every class index
y = torch.randint(0, n_classes, (bs,))

In [8]:
# Numerically stable log-softmax, one step at a time
x = probs
# Subtracting the per-row max keeps exp() from overflowing; the result is unchanged
a = torch.max(x, dim=1, keepdim=True).values
x_norm = x - a
numerator = x_norm
logsumexp = torch.log(torch.exp(x_norm).sum(dim=1, keepdim=True))
logsoftmax = numerator - logsumexp
logsoftmax

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [9]:
# Cross-check: the naive form, without subtracting the row max first
torch.log(x.exp() / x.exp().sum(dim=1, keepdim=True))

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [10]:
# Reference result from torch's own implementation
F.log_softmax(x, dim=1)

tensor([[-3.63, -4.00, -2.91, -2.85, -3.18, -1.69, -1.21, -1.24, -5.36, -3.80],
        [-2.45, -1.72, -3.00, -2.75, -1.03, -3.16, -4.40, -2.63, -2.25, -3.46],
        [-2.01, -4.50, -2.86, -3.37, -2.39, -1.48, -2.49, -1.64, -2.12, -3.04],
        [-2.34, -4.65, -2.51, -2.66, -2.41, -3.98, -3.22, -2.17, -1.06, -2.02]])

In [11]:
# wrap into function
def log_softmax(x):
    """Return the log-softmax of `x` taken along dim 1.

    The per-row maximum is subtracted before exponentiating so that exp()
    cannot overflow; the shift cancels out in the final result.
    """
    shifted = x - x.max(1, keepdim=True).values
    log_norm = shifted.exp().sum(1, keepdim=True).log()
    return shifted - log_norm

# The hand-rolled version must agree with torch's implementation
assert torch.allclose(
    log_softmax(probs),
    F.log_softmax(probs, dim=1),
), 'did not find equality with torch in log_softmax implementation'

In [12]:
# Negative log-likelihood by hand: take each row's log-prob at its target index
log_likelihood = log_softmax(probs)
preds = log_likelihood[range(len(log_likelihood)), y]
nll = -preds.mean()

# All three values should match
print(nll, F.nll_loss(F.log_softmax(probs, dim=1), y), F.cross_entropy(probs, y))

tensor(3.22) tensor(3.22) tensor(3.22)


In [13]:
# wrap into function
def cross_entropy(probs, targets):
    """Return the mean negative log-likelihood of `targets`.

    `probs` is passed through `log_softmax`; for each row the entry at that
    row's target index is selected, and the negated mean is returned.
    """
    log_probs = log_softmax(probs)
    rows = range(log_probs.shape[0])
    picked = log_probs[rows, targets]
    return -picked.mean()

# The hand-rolled loss must agree with torch's implementation
assert torch.allclose(
    cross_entropy(probs, y),
    F.cross_entropy(probs, y),
), 'did not find equality with torch in cross_entropy implementation'

 # Basic training loop
 Can now just use the torch implementation of log_softmax and cross_entropy
 - Get model preds
 - compare against labels and calculate loss
 - calculate gradient of loss with respect to model params
 - update params

In [14]:
# Use torch's built-in cross-entropy as the loss from here on
loss_func = F.cross_entropy

In [15]:
# Take the first mini-batch of inputs and run it through the model
bs = 50
xb = x_train[0:bs]
preds = model(xb)
preds[0], preds.shape

(tensor([-0.09, -0.21, -0.08,  0.10, -0.04,  0.08, -0.04, -0.03,  0.01,  0.06], grad_fn=<SelectBackward0>),
 torch.Size([50, 10]))

In [16]:
# Labels matching the first mini-batch
yb = y_train[0:bs]
yb.shape

torch.Size([50])

In [17]:
# Loss of the untrained model on the first mini-batch
loss_func(input=preds, target=yb)

tensor(2.30, grad_fn=<NllLossBackward0>)

In [18]:
# Sanity check: -log(1/c) is the cross-entropy of a uniform prediction over c classes,
# so a randomly initialised model should score about the same as this
-np.log(1/c)

2.3025850929940455

In [19]:
# Predicted class per sample = index of the largest score in each row
preds.argmax(dim=1)

tensor([3, 9, 3, 8, 5, 9, 3, 9, 3, 9, 5, 3, 9, 9, 3, 9, 9, 5, 8, 7, 9, 5, 3, 8, 9, 5, 9, 5, 5, 9, 3, 5, 9, 7, 5, 7, 9, 9, 3, 9, 3, 5, 3, 8,
        3, 5, 9, 5, 9, 5])