In [1]:
# | default_exp training

In [2]:
# |export

import torch
from torch.utils.data import DataLoader

# Minibatch training

## Data

In [3]:
# Download the MNIST dataset
import gzip
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
from pathlib import Path
from urllib.request import urlretrieve

MNIST_URL = "https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true"

data_path = Path("data/mnist")
data_path.mkdir(exist_ok=True)
gz_path = data_path / "mnist.pkl.gz"

mpl.rcParams["image.cmap"] = "gray"

if not gz_path.exists():
    urlretrieve(MNIST_URL, gz_path)

# File contains a tuple of tuples for the x and y, train and validation data
# Images are 28x28
with gzip.open(gz_path, "rb") as file:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(file, encoding="latin-1")

# Put into tensors
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Model

In [4]:
# Lets define some constants that describe the data
n_examples, n_pixels = x_train.shape
possible_values = y_train.max() + 1

# How many nodes/activations/line thingys
n_hidden = 50

n_examples, n_pixels, possible_values

(50000, 784, tensor(10))

In [5]:
from torch import nn
import torch.nn.functional as F


class Model(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, n_hidden), nn.ReLU(), nn.Linear(n_hidden, n_out)]

    def __call__(self, inp):
        res = inp
        for layer in self.layers:
            res = layer(res)

        return res

In [6]:
model = Model(n_pixels, n_hidden, 10)
pred = model(x_train)
pred.shape

torch.Size([50000, 10])

## Cross entropy loss

We need to improve our loss function as MSE doesn't make sense, the distance between incorrecd predictions doesn't indicate how good/bad the prediction is. For example if the target is 2, then 3 isnt a better guess than 9.

We are outputting a pred for each possible number. As only one is possible at a time our targets are a one hot encoded matrix. If our targets are going to sum to 1, then it makes sense that our preds do too. We can calculate the softmax for each pred. Then for our loss func we compare the softmaxes to the 1 hot encoded targets. As we are 1 hot encoded we can ignore the other targets and just take the log.

In [7]:
def log_softmax(x):
    return (x.exp() / (x.exp().sum(-1, keepdim=True))).log()


log_softmax(pred)

tensor([[-2.2133, -2.4044, -2.4511,  ..., -2.2895, -2.1803, -2.3687],
        [-2.2759, -2.4379, -2.4651,  ..., -2.2692, -2.2075, -2.3586],
        [-2.3169, -2.4823, -2.4626,  ..., -2.1798, -2.1605, -2.2436],
        ...,
        [-2.2795, -2.4837, -2.4338,  ..., -2.2089, -2.1821, -2.2838],
        [-2.2791, -2.4615, -2.4212,  ..., -2.1493, -2.1313, -2.3059],
        [-2.4028, -2.4987, -2.5289,  ..., -2.1593, -2.0554, -2.2555]],
       grad_fn=<LogBackward0>)

In [8]:
# As log(a/b) = log(a) - log(b), we can simplify things to:


def log_softmax(x):
    return x - x.exp().sum(-1, keepdim=True).log()


log_softmax(pred)

tensor([[-2.2133, -2.4044, -2.4511,  ..., -2.2895, -2.1803, -2.3687],
        [-2.2759, -2.4379, -2.4651,  ..., -2.2692, -2.2075, -2.3586],
        [-2.3169, -2.4823, -2.4626,  ..., -2.1798, -2.1605, -2.2436],
        ...,
        [-2.2795, -2.4837, -2.4338,  ..., -2.2089, -2.1821, -2.2838],
        [-2.2791, -2.4615, -2.4212,  ..., -2.1493, -2.1313, -2.3059],
        [-2.4028, -2.4987, -2.5289,  ..., -2.1593, -2.0554, -2.2555]],
       grad_fn=<SubBackward0>)

In [9]:
# Its possible for the sum of the exponentials of big activations to overflow
# pytorch uses some tricks to solve this for use in one function
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)


sm_pred = log_softmax(pred)
pred

tensor([[ 0.1150, -0.0761, -0.1229,  ...,  0.0388,  0.1480, -0.0405],
        [ 0.0448, -0.1172, -0.1444,  ...,  0.0515,  0.1132, -0.0379],
        [-0.0048, -0.1703, -0.1506,  ...,  0.1323,  0.1515,  0.0684],
        ...,
        [ 0.0600, -0.1443, -0.0943,  ...,  0.1306,  0.1574,  0.0557],
        [ 0.0350, -0.1474, -0.1071,  ...,  0.1648,  0.1828,  0.0082],
        [-0.0907, -0.1867, -0.2168,  ...,  0.1528,  0.2567,  0.0566]],
       grad_fn=<AddmmBackward0>)

In [10]:
# How do we index into the preds we want
# The y values tell us targets, which are also the index
y_train[:5]

tensor([5, 0, 4, 1, 9])

In [11]:
# So for each i we want row i and col y_train[i]. eg for pred at row 0
sm_pred[0, y_train[0]]

tensor(-2.1109, grad_fn=<SelectBackward0>)

In [12]:
# We can do this for all of them like this
sm_pred[range(y_train.shape[0]), y_train]

tensor([-2.1109, -2.2759, -2.2591,  ..., -2.1821, -2.2817, -2.0554],
       grad_fn=<IndexBackward0>)

In [13]:
# So our cross entropy loss function looks like this
def nll(inp, target):
    return -inp[range(target.shape[0]), target].mean()


loss = nll(sm_pred, y_train)
loss

tensor(2.3124, grad_fn=<NegBackward0>)

In [14]:
# pytorch gives us both of these functions, NLL = negative log likelihood
F.nll_loss(F.log_softmax(pred, -1), y_train)

tensor(2.3124, grad_fn=<NllLossBackward0>)

In [15]:
# And a cross entropy function to do it all in one
F.cross_entropy(pred, y_train)

tensor(2.3124, grad_fn=<NllLossBackward0>)

## Training loop

In [16]:
loss_func = F.cross_entropy
batch_size = 64

# First we would run a minibatch
mini_batch = x_train[0:batch_size]
mini_batch_y = y_train[0:batch_size]

preds = model(mini_batch)
preds[0], preds.shape

(tensor([ 0.1150, -0.0761, -0.1229, -0.2367,  0.1981,  0.2174, -0.0872,  0.0388,
          0.1480, -0.0405], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [17]:
# Then we would calculate the loss
loss_func(preds, mini_batch_y)

tensor(2.3258, grad_fn=<NllLossBackward0>)

In [18]:
# For each of our predictions what did we predict, we need the highest number from each set of preds and get its index
preds.argmax(dim=1)

tensor([5, 5, 5, 5, 8, 8, 8, 8, 4, 5, 8, 5, 8, 5, 5, 8, 5, 5, 5, 5, 7, 5, 5, 5,
        5, 8, 5, 8, 8, 7, 8, 5, 5, 5, 5, 5, 8, 5, 5, 8, 4, 5, 7, 8, 8, 4, 8, 5,
        5, 5, 5, 8, 8, 5, 5, 5, 8, 4, 8, 5, 5, 5, 5, 8])

In [19]:
# We can see which we got correct
preds.argmax(dim=1) == mini_batch_y

tensor([ True, False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False,  True, False, False, False, False,
        False, False,  True, False, False, False,  True,  True, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False])

In [20]:
# |export


# We can use it to calculate accuracy, this isnt needed for the NN but helps us understand whats going on
def accuracy(out, targets):
    return (out.argmax(dim=1) == targets).float().mean()

In [21]:
accuracy(preds, mini_batch_y)

tensor(0.1094)

In [22]:
# Now we want to do all of the batches for a bunch of epochs, updating the weights each time
import torch

lr = 0.5
epochs = 5

for epoch in range(epochs):
    for batch_start in range(0, n_examples, batch_size):
        # Get the slice
        sl = slice(batch_start, min(n_examples, batch_start + batch_size))

        batch_inp = x_train[sl]
        batch_targets = y_train[sl]

        # Run the model
        preds = model(batch_inp)
        loss = loss_func(preds, batch_targets)

        if batch_start == 0:
            print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

        loss.backward()

        # Update the weights
        with torch.no_grad():
            for layer in model.layers:
                if hasattr(layer, "weight"):
                    layer.weight -= layer.weight.grad * lr
                    layer.bias -= layer.bias.grad * lr
                    layer.weight.grad.zero_()
                    layer.bias.grad.zero_()

loss: 2.3258378505706787 acc:  0.109375
loss: 0.1734699010848999 acc:  0.9375
loss: 0.1648125946521759 acc:  0.953125
loss: 0.1431054323911667 acc:  0.953125
loss: 0.159428671002388 acc:  0.953125


## Parameters and optim

### Parameters

In [23]:
m1 = nn.Module()
m1.foo = nn.Linear(3, 4)

# Notice that the Module can automatically track all of the parameters in the
# layer that is assigned to it, how does that work?
m1, list(m1.named_children()), list(m1.parameters())

(Module(
   (foo): Linear(in_features=3, out_features=4, bias=True)
 ),
 [('foo', Linear(in_features=3, out_features=4, bias=True))],
 [Parameter containing:
  tensor([[-0.0402,  0.2272, -0.5387],
          [ 0.0365,  0.4104,  0.3436],
          [ 0.4707,  0.5491, -0.3349],
          [-0.5585, -0.5419,  0.1093]], requires_grad=True),
  Parameter containing:
  tensor([-0.1976,  0.5090,  0.3565,  0.4564], requires_grad=True)])

In [24]:
class MLP(nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.l2(self.relu(self.l1(x)))


# We normally use Module by inheriting from it and assigning it a bynch of layers

model = MLP(n_pixels, n_hidden, 10)
model, model.l1, list(model.named_children()), list(map(lambda x: x.shape, model.parameters()))

(MLP(
   (l1): Linear(in_features=784, out_features=50, bias=True)
   (l2): Linear(in_features=50, out_features=10, bias=True)
   (relu): ReLU()
 ),
 Linear(in_features=784, out_features=50, bias=True),
 [('l1', Linear(in_features=784, out_features=50, bias=True)),
  ('l2', Linear(in_features=50, out_features=10, bias=True)),
  ('relu', ReLU())],
 [torch.Size([50, 784]),
  torch.Size([50]),
  torch.Size([10, 50]),
  torch.Size([10])])

In [25]:
# So already we can rewrite our loop knowing we can just get all of the params from the model


def fit():
    for epoch in range(epochs):
        for batch_start in range(0, n_examples, batch_size):
            # Get the slice
            sl = slice(batch_start, min(n_examples, batch_start + batch_size))

            batch_inp = x_train[sl]
            batch_targets = y_train[sl]

            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            if batch_start == 0:
                print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

            loss.backward()

            # Update the weights
            with torch.no_grad():
                for p in model.parameters():
                    p -= p.grad * lr
                model.zero_grad()


fit()

loss: 2.299609899520874 acc:  0.09375
loss: 0.1553519368171692 acc:  0.953125
loss: 0.13272824883460999 acc:  0.953125
loss: 0.10512181371450424 acc:  0.96875
loss: 0.09392286092042923 acc:  0.96875


In [26]:
# Here's how we would implement this if nn.Module didn't exist
class MyModule:
    def __init__(self, n_in, n_hidden, n_out):
        self._modules = {}
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_out)

    # We hook into setattr and update _modules when some sets something new
    def __setattr__(self, key, value):
        if not key.startswith("_"):
            self._modules[key] = value
        super().__setattr__(key, value)

    # Print modules
    def __repr__(self):
        return f"{self._modules}"

    # Generate the parameters of every module
    def parameters(self):
        for mod in self.modules.values():
            for p in mod.parameters():
                yield p


myM = MyModule(n_pixels, n_hidden, 10)
myM

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

### Registering modules

We previously stored all our layers in a .layers member. How would we do that with nn.module. You can manually add layers using `add_module`.

In [27]:
from functools import reduce


class Model(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = layers

        for idx, layer in enumerate(self.layers):
            self.add_module(f"layer_{idx}", layer)

    def __call__(self, x):
        return reduce(lambda val, layer: layer(val), self.layers, x)


layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = Model(layers)

model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

#### nn.ModuleList

`nn.ModuleList` does this.

In [28]:
class SequentialModel(nn.Module):
    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x


model = SequentialModel(layers)
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [29]:
fit()

loss: 2.341370105743408 acc:  0.015625
loss: 0.11918908357620239 acc:  0.96875
loss: 0.14029745757579803 acc:  0.96875
loss: 0.11141327768564224 acc:  0.953125
loss: 0.09101451188325882 acc:  0.953125


#### nn.Sequential

`nn.Sequential` already exists and does the lot for us.

In [30]:
layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = nn.Sequential(*layers)
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [31]:
fit()

loss: 2.29732346534729 acc:  0.140625
loss: 0.1326911747455597 acc:  0.96875
loss: 0.12029773741960526 acc:  0.953125
loss: 0.11310149729251862 acc:  0.96875
loss: 0.1144457533955574 acc:  0.96875


### Optimizer

In [32]:
# We could put our optimisation steps into a class


class Optimizer:
    def __init__(self, params, lr=0.5):
        self.params = list(params)
        self.lr = lr

    def step(self):
        with torch.no_grad():
            for p in self.params:
                p -= p.grad * self.lr

    def zero_grad(self):
        for p in self.params:
            p.grad.data.zero_()

In [33]:
layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = nn.Sequential(*layers)
opt = Optimizer(model.parameters())


# And now our training loop will look like this
def fit():
    for epoch in range(epochs):
        for batch_start in range(0, n_examples, batch_size):
            # Get the slice
            sl = slice(batch_start, min(n_examples, batch_start + batch_size))

            batch_inp = x_train[sl]
            batch_targets = y_train[sl]

            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            if batch_start == 0:
                print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()


fit()

loss: 2.326862096786499 acc:  0.140625
loss: 0.1396191567182541 acc:  0.953125
loss: 0.13144434988498688 acc:  0.953125
loss: 0.0914166048169136 acc:  0.984375
loss: 0.06785164028406143 acc:  0.984375


In [34]:
# pytorch gives use this as well
from torch import optim


def get_model():
    layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
    model = nn.Sequential(*layers)
    opt = optim.SGD(model.parameters(), lr=lr)

    return model, opt


model, opt = get_model()


def fit():
    for epoch in range(epochs):
        for batch_start in range(0, n_examples, batch_size):
            # Get the slice
            sl = slice(batch_start, min(n_examples, batch_start + batch_size))

            batch_inp = x_train[sl]
            batch_targets = y_train[sl]

            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            if batch_start == 0:
                print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()


fit()

loss: 2.3191866874694824 acc:  0.109375
loss: 0.24167746305465698 acc:  0.90625
loss: 0.1405819058418274 acc:  0.953125
loss: 0.10254921019077301 acc:  0.96875
loss: 0.07964211702346802 acc:  0.96875


## Dataset and DataLoader

What if we want to more easily iterate through minimatches and x/y.

In [35]:
# |export


class Dataset:
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        return self.x[i], self.y[i]

In [36]:
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

len(train_ds), len(valid_ds)

(50000, 10000)

In [37]:
# __getitem__ works with tensors
train_ds[0:5], train_ds[0:5][0].shape, train_ds[0:5][1].shape

((tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  tensor([5, 0, 4, 1, 9])),
 torch.Size([5, 784]),
 torch.Size([5]))

In [38]:
# We can now simplify the slicing


def fit():
    for epoch in range(epochs):
        for batch_start in range(0, n_examples, batch_size):
            # Get the slice
            sl = slice(batch_start, min(n_examples, batch_start + batch_size))

            batch_inp, batch_targets = train_ds[batch_start : min(n_examples, batch_start + batch_size)]

            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            if batch_start == 0:
                print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()


model, opt = get_model()
fit()

loss: 2.2971205711364746 acc:  0.140625
loss: 0.14574120938777924 acc:  0.953125
loss: 0.17029577493667603 acc:  0.953125
loss: 0.15515412390232086 acc:  0.96875
loss: 0.15817898511886597 acc:  0.96875


In [39]:
# A DataLoader is an iterator that will help us with looping over the minibatches


class DataLoader:
    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        for batch_start in range(0, len(self.dataset), batch_size):
            yield self.dataset[batch_start : batch_start + self.batch_size]


train_dl = DataLoader(train_ds, batch_size)
valid_dl = DataLoader(valid_ds, batch_size)

In [40]:
# Now we can remove the slicing form our loop
def fit():
    for epoch in range(epochs):
        for batch_inp, batch_targets in train_dl:
            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()
        print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())


model, opt = get_model()
fit()

loss: 0.13620899617671967 acc:  0.9375
loss: 0.11623474955558777 acc:  1.0
loss: 0.08376095443964005 acc:  1.0
loss: 0.04591038078069687 acc:  1.0
loss: 0.018032046034932137 acc:  1.0


## Random sampling

What if we want our training set to be in a random order that differs every iteration (but keep the validations set the ssame)?

In [41]:
import random


# Iterates over indices that  may or may not be shuffled
class Sampler:
    def __init__(self, dataset, shuffle=False):
        self.N = len(dataset)
        self.shuffle = shuffle

    def __iter__(self):
        # List indices for the dataset
        indices = list(range(self.N))

        if self.shuffle:
            random.shuffle(indices)

        return iter(indices)


list(Sampler(train_ds))[:5], list(Sampler(train_ds, True))[:5]

([0, 1, 2, 3, 4], [47150, 9132, 33323, 34083, 44106])

In [42]:
import fastcore.all as fc


# Gets batches of the indices given by a sampler
class BatchSampler:
    def __init__(self, sampler, batch_size, drop_last=False):
        fc.store_attr()  # Stores all inputs as members with same name

    def __iter__(self):
        for chunk in fc.chunked(iter(self.sampler), self.batch_size, drop_last=self.drop_last):
            yield chunk


# Batches of 2, randomised
list(BatchSampler(Sampler(train_ds, True), 2))[:5]

[[42329, 1416], [11506, 12809], [11167, 45051], [47512, 42783], [12792, 21287]]

In [43]:
# We can update the DataLoader to use a BatchSampler rather than being told a batch size


# We need a collation function to stack all of the Xs and all of the Ys together into tensors
def collate(data):
    data_x, data_y = zip(*data)
    return torch.stack(data_x), torch.stack(data_y)


class DataLoader:
    def __init__(self, dataset, batch_sampler, collate_fn=collate):
        fc.store_attr()

    def __iter__(self):
        yield from (self.collate_fn(self.dataset[i] for i in b) for b in self.batch_sampler)


train_samp = BatchSampler(Sampler(train_ds, shuffle=True), batch_size)
valid_samp = BatchSampler(Sampler(valid_ds, shuffle=False), batch_size)

train_dl = DataLoader(train_ds, batch_sampler=train_samp)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp)

batch = next(iter(train_dl))
batch, batch[0].shape, batch[1].shape

((tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  tensor([4, 2, 2, 8, 0, 1, 4, 3, 7, 9, 8, 6, 6, 7, 4, 3, 0, 8, 3, 1, 4, 2, 2, 4,
          0, 5, 6, 2, 1, 4, 1, 2, 4, 4, 3, 3, 3, 2, 9, 7, 2, 1, 0, 8, 1, 8, 1, 1,
          3, 9, 4, 4, 3, 7, 3, 2, 1, 7, 0, 3, 0, 1, 1, 5])),
 torch.Size([64, 784]),
 torch.Size([64]))

In [44]:
model, opt = get_model()
fit()

loss: 0.12259667366743088 acc:  0.9375
loss: 0.020891010761260986 acc:  1.0
loss: 0.19957318902015686 acc:  0.9375
loss: 0.03951726853847504 acc:  1.0
loss: 0.08813390880823135 acc:  1.0


### Multiprocessing DataLoader

What if we want to run this in parallel to speed things up.

In [45]:
# We want to be able to process something a bit like this but in parallel
for o in map(train_ds.__getitem__, ([3, 6], [8, 1])):
    print(o)

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 1]))
(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 0]))


In [46]:
# Pytorch gives us all of these and supports multiprocessing
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler

train_samp = BatchSampler(RandomSampler(train_ds), batch_size, drop_last=False)
valid_samp = BatchSampler(SequentialSampler(valid_ds), batch_size, drop_last=False)

train_dl = DataLoader(train_ds, batch_sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp, collate_fn=collate)

model, opt = get_model()
fit()

loss: 0.3553428649902344 acc:  0.875
loss: 0.1931878924369812 acc:  0.9375
loss: 0.08816784620285034 acc:  0.9375
loss: 0.2127539962530136 acc:  0.9375
loss: 0.021999867632985115 acc:  1.0


In [47]:
# We can also pass a batch sampler as a sampler as we are able to index multiple things at once with no collate
# pytorch kust autogens a BatchSampler for us
train_dl = DataLoader(train_ds, sampler=train_samp)
valid_dl = DataLoader(valid_ds, sampler=valid_samp)

# As random sampling is so common, we can also just pass shuffle flags, and that dataset
train_dl = DataLoader(train_ds, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, shuffle=False)

model, opt = get_model()
fit()

loss: 2.6751914024353027 acc:  0.0
loss: 1.8326674699783325 acc:  1.0
loss: 2.142854928970337 acc:  0.0
loss: 3.6747984886169434 acc:  0.0
loss: 3.0272738933563232 acc:  0.0


## Validation

We've not done anything with the validation set yet. Lets update fit to take in a validation set. We'll print the validation loss for each epoch.

In [48]:
# |export


def fit(epochs, model, loss_func, opt, train_dl, valid_dl):
    for epoch in range(epochs):
        # Some layers behave differently during training and validation so we have to tell them
        model.train()
        for batch_inp, batch_targets in train_dl:
            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()

        # Now run the validation set
        model.eval()
        with torch.no_grad():
            total_loss = 0.0
            total_acc = 0.0
            count = 0

            for batch_inp, batch_targets in train_dl:
                preds = model(batch_inp)

                count += len(batch_inp)
                total_loss += loss_func(preds, batch_targets).item() * len(batch_inp)
                total_acc += accuracy(preds, batch_targets).item() * len(batch_inp)

        total_loss /= count
        total_acc /= count
        print(f"epoch: {epoch}, loss: {total_loss}, acc: {total_acc}")

    return total_loss, total_acc


def get_dls(train_ds, valid_ds, batch_size, **kwargs):
    return (
        DataLoader(train_ds, batch_size=batch_size, shuffle=True, **kwargs),
        DataLoader(valid_ds, batch_size=batch_size * 2, **kwargs),
    )

In [49]:
train_dl, valid_dl = get_dls(train_ds, valid_ds, 64)
model, opt = get_model()
loss, acc = fit(5, model, F.cross_entropy, opt, train_dl, valid_dl)

epoch: 0, loss: 0.27256460088729856, acc: 0.9146
epoch: 1, loss: 0.11975131210446358, acc: 0.9633
epoch: 2, loss: 0.10393305077791214, acc: 0.96722
epoch: 3, loss: 0.061809434617087246, acc: 0.98158
epoch: 4, loss: 0.06857460128337145, acc: 0.9781
