# Minibatch training

## Data

In [1]:
# Download the MNIST dataset
import gzip
import pickle
import matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
from pathlib import Path
from urllib.request import urlretrieve

# Raw download link for the pickled, gzipped MNIST archive
MNIST_URL = "https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true"

# Local cache directory for the archive
data_path = Path("data/mnist")
# parents=True so this also works when the top-level "data" directory is missing
data_path.mkdir(parents=True, exist_ok=True)
gz_path = data_path / "mnist.pkl.gz"

# Show images in grayscale by default
mpl.rcParams["image.cmap"] = "gray"

# Only download when the archive is not already cached
if not gz_path.exists():
    urlretrieve(MNIST_URL, gz_path)

# File contains a tuple of tuples for the x and y, train and validation data
# (the third entry of the outer tuple is discarded here)
# Images are 28x28
# NOTE: pickle.load can execute arbitrary code, so only load archives from a trusted source
with gzip.open(gz_path, "rb") as file:
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(file, encoding="latin-1")

# Put into tensors
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

## Model

In [2]:
# Let's define some constants that describe the data
# x_train has one row per example and one column per pixel
n_examples, n_pixels = x_train.shape
# Largest label + 1; note this stays a 0-d tensor rather than a Python int
possible_values = y_train.max() + 1

# Number of units (activations) in the hidden layer
n_hidden = 50

n_examples, n_pixels, possible_values

(50000, 784, tensor(10))

In [3]:
from torch import nn
import torch.nn.functional as F


class Model(nn.Module):
    """Small fully connected network: Linear -> ReLU -> Linear.

    The layers are kept in a plain Python list on ``self.layers``; code that
    trains this model walks that list itself to reach the weights.

    Args:
        n_in: number of input features.
        n_hidden: number of hidden units.
        n_out: number of outputs.
    """

    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.layers = [nn.Linear(n_in, n_hidden), nn.ReLU(), nn.Linear(n_hidden, n_out)]

    def forward(self, inp):
        """Feed ``inp`` through each layer in order and return the final output."""
        # Defined as forward() instead of overriding __call__, so that
        # nn.Module.__call__ still runs and forward hooks keep working.
        res = inp
        for layer in self.layers:
            res = layer(res)

        return res

In [4]:
# Build the model and run a forward pass over the whole training set
model = Model(n_pixels, n_hidden, 10)
pred = model(x_train)
# One row of outputs per example, one column per class
pred.shape

torch.Size([50000, 10])

## Cross entropy loss

We need to improve our loss function as MSE doesn't make sense: the distance between an incorrect prediction and the target doesn't indicate how good/bad the prediction is. For example, if the target is 2, then 3 isn't a better guess than 9.

We are outputting a pred for each possible number. As only one is possible at a time, our targets are a one-hot encoded matrix. If our targets sum to 1, then it makes sense that our preds do too, so we calculate the softmax of each pred. Then for our loss function we compare the softmaxes to the one-hot encoded targets. Because the targets are one-hot encoded, every term except the one for the true class is multiplied by zero, so we can ignore the other classes and just take the log of the softmax at the target index (negated, so that lower is better).

In [5]:
def log_softmax(x):
    """Naive log-softmax over the last dimension: log(exp(x) / sum(exp(x)))."""
    exps = x.exp()
    totals = x.exp().sum(-1, keepdim=True)
    return (exps / totals).log()


# Display the log-softmax of the predictions
log_softmax(pred)

tensor([[-2.1541, -2.4164, -2.2377,  ..., -2.2984, -2.3171, -2.4884],
        [-2.2712, -2.2574, -2.1670,  ..., -2.2426, -2.4145, -2.4205],
        [-2.1585, -2.3501, -2.1798,  ..., -2.3351, -2.3880, -2.3787],
        ...,
        [-2.1969, -2.3378, -2.2763,  ..., -2.2704, -2.3432, -2.4344],
        [-2.1629, -2.3427, -2.1967,  ..., -2.2335, -2.3517, -2.4363],
        [-2.2375, -2.3132, -2.1770,  ..., -2.2412, -2.3364, -2.4697]],
       grad_fn=<LogBackward0>)

In [6]:
# As log(a/b) = log(a) - log(b), we can simplify things to:


def log_softmax(x):
    """Log-softmax over the last dimension, written as x - log(sum(exp(x)))."""
    log_total = x.exp().sum(-1, keepdim=True).log()
    return x - log_total


# Same values as the previous version, computed via the subtraction form
log_softmax(pred)

tensor([[-2.1541, -2.4164, -2.2377,  ..., -2.2984, -2.3171, -2.4884],
        [-2.2712, -2.2574, -2.1670,  ..., -2.2426, -2.4145, -2.4205],
        [-2.1585, -2.3501, -2.1798,  ..., -2.3351, -2.3880, -2.3787],
        ...,
        [-2.1969, -2.3378, -2.2763,  ..., -2.2704, -2.3432, -2.4344],
        [-2.1629, -2.3427, -2.1967,  ..., -2.2335, -2.3517, -2.4363],
        [-2.2375, -2.3132, -2.1770,  ..., -2.2412, -2.3364, -2.4697]],
       grad_fn=<SubBackward0>)

In [7]:
# Its possible for the sum of the exponentials of big activations to overflow
# pytorch uses some tricks to solve this for use in one function
def log_softmax(x):
    return x - x.logsumexp(-1, keepdim=True)


# Keep the log-softmax of the predictions for the loss cells below
sm_pred = log_softmax(pred)
# NOTE(review): this displays the raw `pred`, not `sm_pred` — confirm that is intended
pred

tensor([[ 0.1083, -0.1540,  0.0246,  ..., -0.0360, -0.0547, -0.2261],
        [ 0.0443,  0.0581,  0.1486,  ...,  0.0729, -0.0989, -0.1049],
        [ 0.1757, -0.0158,  0.1544,  ..., -0.0008, -0.0538, -0.0445],
        ...,
        [ 0.1097, -0.0312,  0.0303,  ...,  0.0362, -0.0366, -0.1278],
        [ 0.1225, -0.0573,  0.0887,  ...,  0.0519, -0.0663, -0.1509],
        [ 0.0573, -0.0185,  0.1177,  ...,  0.0535, -0.0417, -0.1750]],
       grad_fn=<AddmmBackward0>)

In [8]:
# How do we index into the preds we want?
# The y values tell us the targets, which are also the column indices
y_train[:5]

tensor([5, 0, 4, 1, 9])

In [9]:
# So for each i we want row i and col y_train[i], e.g. for the pred at row 0:
sm_pred[0, y_train[0]]

tensor(-2.4615, grad_fn=<SelectBackward0>)

In [10]:
# We can do this for all rows at once by pairing every row index with its target column
sm_pred[range(y_train.shape[0]), y_train]

tensor([-2.4615, -2.2712, -2.2086,  ..., -2.3432, -2.2861, -2.3364],
       grad_fn=<IndexBackward0>)

In [11]:
# So our cross entropy loss function looks like this
def nll(inp, target):
    """Negative log likelihood: mean of -inp[i, target[i]] over all rows i."""
    rows = range(target.shape[0])
    picked = inp[rows, target]
    return -picked.mean()


# Loss of the untrained model over the whole training set
loss = nll(sm_pred, y_train)
loss

tensor(2.3122, grad_fn=<NegBackward0>)

In [12]:
# PyTorch gives us both of these functions; NLL = negative log likelihood
F.nll_loss(F.log_softmax(pred, -1), y_train)

tensor(2.3122, grad_fn=<NllLossBackward0>)

In [13]:
# And a cross entropy function that does both steps in one call
F.cross_entropy(pred, y_train)

tensor(2.3122, grad_fn=<NllLossBackward0>)

## Training loop

In [14]:
# Loss function and minibatch size used by the training loops below
loss_func = F.cross_entropy
batch_size = 64

# First we would run a minibatch: take the first batch_size examples and their targets
mini_batch = x_train[0:batch_size]
mini_batch_y = y_train[0:batch_size]

preds = model(mini_batch)
# Outputs for the first example, and the shape of the whole batch of outputs
preds[0], preds.shape

(tensor([ 0.1083, -0.1540,  0.0246,  0.0735,  0.1174, -0.1992, -0.1280, -0.0360,
         -0.0547, -0.2261], grad_fn=<SelectBackward0>),
 torch.Size([64, 10]))

In [15]:
# Then we would calculate the loss for that minibatch
loss_func(preds, mini_batch_y)

tensor(2.3164, grad_fn=<NllLossBackward0>)

In [16]:
# What did we predict for each example? Take the index of the highest value in each row of preds
preds.argmax(dim=1)

tensor([4, 2, 0, 0, 2, 3, 2, 2, 3, 4, 0, 3, 0, 2, 2, 0, 0, 4, 0, 3, 0, 0, 3, 0,
        3, 0, 3, 0, 0, 4, 4, 4, 2, 3, 4, 4, 2, 2, 3, 3, 2, 0, 3, 0, 3, 2, 0, 4,
        3, 4, 3, 0, 2, 4, 2, 2, 0, 3, 0, 4, 2, 3, 2, 7])

In [17]:
# We can see which ones we got correct by comparing against the targets
preds.argmax(dim=1) == mini_batch_y

tensor([False, False, False, False, False, False, False, False, False,  True,
        False, False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False,  True, False, False, False, False, False,
         True,  True, False,  True, False, False,  True, False, False, False,
        False, False, False, False])

In [18]:
# We can use it to calculate accuracy, this isnt needed for the NN but helps us understand whats going on
def accuracy(out, targets):
    """Fraction of rows in `out` whose argmax along dim 1 equals the target."""
    predicted = out.argmax(dim=1)
    hits = predicted == targets
    return hits.float().mean()


# Accuracy of the untrained model on the first minibatch
accuracy(preds, mini_batch_y)

tensor(0.1094)

In [19]:
# Now we want to do all of the batches for a bunch of epochs, updating the weights each time
import torch

# Learning rate and number of passes over the training set
lr = 0.5
epochs = 5

for epoch in range(epochs):
    for batch_start in range(0, n_examples, batch_size):
        # Get the slice (clamped so the last batch does not run past the data)
        sl = slice(batch_start, min(n_examples, batch_start + batch_size))

        batch_inp = x_train[sl]
        batch_targets = y_train[sl]

        # Run the model
        preds = model(batch_inp)
        loss = loss_func(preds, batch_targets)

        # Report once per epoch, on the first batch only
        if batch_start == 0:
            print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

        # Compute the gradients of the loss
        loss.backward()

        # Update the weights
        with torch.no_grad():
            for layer in model.layers:
                # Only layers that have a weight attribute are updated
                if hasattr(layer, "weight"):
                    layer.weight -= layer.weight.grad * lr
                    layer.bias -= layer.bias.grad * lr
                    # Reset the gradients so they don't accumulate into the next batch
                    layer.weight.grad.zero_()
                    layer.bias.grad.zero_()

loss: 2.3164234161376953 acc:  0.109375
loss: 0.14103679358959198 acc:  0.953125
loss: 0.1338021159172058 acc:  0.953125
loss: 0.09388265013694763 acc:  0.96875
loss: 0.06046620383858681 acc:  0.96875


## Parameters and optim

### Parameters

In [20]:
# A bare Module with a single Linear layer assigned as an attribute
m1 = nn.Module()
m1.foo = nn.Linear(3, 4)

# Notice that the Module can automatically track all of the parameters in the
# layer that is assigned to it. How does that work?
m1, list(m1.named_children()), list(m1.parameters())

(Module(
   (foo): Linear(in_features=3, out_features=4, bias=True)
 ),
 [('foo', Linear(in_features=3, out_features=4, bias=True))],
 [Parameter containing:
  tensor([[-0.2638, -0.4812, -0.5154],
          [-0.0276, -0.3032,  0.1277],
          [ 0.5214, -0.3499,  0.0212],
          [ 0.2150,  0.5617,  0.2342]], requires_grad=True),
  Parameter containing:
  tensor([ 0.0399, -0.1333,  0.3603,  0.4867], requires_grad=True)])

In [21]:
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Each layer is assigned as an attribute, so nn.Module registers it as a child.
    """

    def __init__(self, n_in, n_hidden, n_out):
        super().__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_out)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the second linear layer for input `x`."""
        hidden = self.l1(x)
        activated = self.relu(hidden)
        return self.l2(activated)


# We normally use Module by inheriting from it and assigning it a bunch of layers

model = MLP(n_pixels, n_hidden, 10)
# Show the model, one of its layers, its registered children and the shape of every parameter
model, model.l1, list(model.named_children()), list(map(lambda x: x.shape, model.parameters()))

(MLP(
   (l1): Linear(in_features=784, out_features=50, bias=True)
   (l2): Linear(in_features=50, out_features=10, bias=True)
   (relu): ReLU()
 ),
 Linear(in_features=784, out_features=50, bias=True),
 [('l1', Linear(in_features=784, out_features=50, bias=True)),
  ('l2', Linear(in_features=50, out_features=10, bias=True)),
  ('relu', ReLU())],
 [torch.Size([50, 784]),
  torch.Size([50]),
  torch.Size([10, 50]),
  torch.Size([10])])

In [22]:
# So already we can rewrite our loop knowing we can just get all of the params from the model


def fit():
    """Train the global `model` with manual SGD over `epochs` passes of the training set."""
    for _ in range(epochs):
        for lo in range(0, n_examples, batch_size):
            # Clamp the upper bound so the final batch stops at the end of the data
            hi = min(n_examples, lo + batch_size)
            xb, yb = x_train[lo:hi], y_train[lo:hi]

            # Forward pass and loss
            out = model(xb)
            batch_loss = loss_func(out, yb)

            # Report once per epoch, on the first batch
            if lo == 0:
                print("loss:", batch_loss.item(), "acc: ", accuracy(out, yb).item())

            batch_loss.backward()

            # Step every parameter against its gradient, then clear the gradients
            with torch.no_grad():
                for param in model.parameters():
                    param -= param.grad * lr
                model.zero_grad()


# Train the current global model
fit()

loss: 2.313939332962036 acc:  0.140625
loss: 0.12737786769866943 acc:  0.96875
loss: 0.17124103009700775 acc:  0.9375
loss: 0.10446500033140182 acc:  0.953125
loss: 0.08941523730754852 acc:  0.953125


In [23]:
# Here's how we would implement this if nn.Module didn't exist
class MyModule:
    """Minimal stand-in for nn.Module that tracks the layers assigned to it.

    Args:
        n_in: number of input features of the first linear layer.
        n_hidden: number of hidden units.
        n_out: number of outputs of the second linear layer.
    """

    def __init__(self, n_in, n_hidden, n_out):
        # Must exist before any public attribute is set, because __setattr__ writes to it
        self._modules = {}
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_out)

    # We hook into setattr and update _modules when someone sets something new
    def __setattr__(self, key, value):
        # Names starting with "_" (such as _modules itself) are not registered
        if not key.startswith("_"):
            self._modules[key] = value
        super().__setattr__(key, value)

    # Print modules
    def __repr__(self):
        return f"{self._modules}"

    # Generate the parameters of every module
    def parameters(self):
        # Read the registry from `_modules`; there is no `modules` attribute on this class
        for mod in self._modules.values():
            yield from mod.parameters()


# Build one and display it; the repr is the dict of registered modules
myM = MyModule(n_pixels, n_hidden, 10)
myM

{'l1': Linear(in_features=784, out_features=50, bias=True), 'l2': Linear(in_features=50, out_features=10, bias=True)}

### Registering modules

We previously stored all our layers in a `.layers` member. How would we do that with `nn.Module`? You can manually add layers using `add_module`.

In [24]:
from functools import reduce


class Model(nn.Module):
    """Runs a list of layers in sequence, registering each one with nn.Module.

    Args:
        layers: iterable of modules, applied in order; kept as-is on ``self.layers``.
    """

    def __init__(self, layers):
        super().__init__()
        self.layers = layers

        # Register every layer under the name layer_<index> so that
        # parameters() and the repr can see them
        for idx, layer in enumerate(self.layers):
            self.add_module(f"layer_{idx}", layer)

    def forward(self, x):
        """Feed ``x`` through every layer in order and return the result."""
        # Defined as forward() instead of overriding __call__, so that
        # nn.Module.__call__ still runs and forward hooks keep working.
        return reduce(lambda val, layer: layer(val), self.layers, x)


# The same three-layer architecture as before, passed in as a list
layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = Model(layers)

model

Model(
  (layer_0): Linear(in_features=784, out_features=50, bias=True)
  (layer_1): ReLU()
  (layer_2): Linear(in_features=50, out_features=10, bias=True)
)

#### nn.ModuleList

`nn.ModuleList` does this.

In [25]:
class SequentialModel(nn.Module):
    """Applies the given layers one after another; nn.ModuleList holds them."""

    def __init__(self, layers):
        super().__init__()
        self.layers = nn.ModuleList(layers)

    def forward(self, x):
        """Return `x` after it has passed through every layer in order."""
        out = x
        for module in self.layers:
            out = module(out)
        return out


# Wrap the `layers` list from the previous cell
model = SequentialModel(layers)
model

SequentialModel(
  (layers): ModuleList(
    (0): Linear(in_features=784, out_features=50, bias=True)
    (1): ReLU()
    (2): Linear(in_features=50, out_features=10, bias=True)
  )
)

In [26]:
# fit() reads the global `model`, so this trains the SequentialModel
fit()

loss: 2.2983474731445312 acc:  0.125
loss: 0.162628173828125 acc:  0.9375
loss: 0.09918573498725891 acc:  0.96875
loss: 0.08054450154304504 acc:  0.96875
loss: 0.05816757306456566 acc:  0.96875


#### nn.Sequential

`nn.Sequential` already exists and does the lot for us.

In [27]:
# Fresh layers, so this model starts from newly initialised weights
layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = nn.Sequential(*layers)
model

Sequential(
  (0): Linear(in_features=784, out_features=50, bias=True)
  (1): ReLU()
  (2): Linear(in_features=50, out_features=10, bias=True)
)

In [28]:
# fit() reads the global `model`, so this trains the nn.Sequential model
fit()

loss: 2.303898811340332 acc:  0.0625
loss: 0.11739269644021988 acc:  0.96875
loss: 0.07968293130397797 acc:  0.96875
loss: 0.08661330491304398 acc:  0.96875
loss: 0.06353913247585297 acc:  0.984375


### Optimizer

In [29]:
# We could put our optimisation steps into a class


class Optimizer:
    """Plain SGD: moves each parameter against its gradient, scaled by ``lr``.

    Args:
        params: iterable of tensors to optimise; materialised into a list so it
            can be walked on every step.
        lr: learning rate.
    """

    def __init__(self, params, lr=0.5):
        self.params = list(params)
        self.lr = lr

    def step(self):
        """Apply one in-place update to every parameter that has a gradient."""
        with torch.no_grad():
            for p in self.params:
                # Parameters that took no part in the last backward pass have no gradient
                if p.grad is None:
                    continue
                p -= p.grad * self.lr

    def zero_grad(self):
        """Reset existing gradients to zero so the next backward pass starts clean."""
        with torch.no_grad():
            for p in self.params:
                if p.grad is not None:
                    p.grad.zero_()

In [30]:
# Fresh model, paired with our hand-written Optimizer
layers = [nn.Linear(n_pixels, n_hidden), nn.ReLU(), nn.Linear(n_hidden, 10)]
model = nn.Sequential(*layers)
opt = Optimizer(model.parameters())


# And now our training loop will look like this
def fit():
    """Train the global `model`, delegating the weight update to the global `opt`."""
    for _ in range(epochs):
        for start in range(0, n_examples, batch_size):
            # Clamp the end of the window so the last batch stops at the end of the data
            stop = min(n_examples, start + batch_size)
            window = slice(start, stop)
            xb = x_train[window]
            yb = y_train[window]

            # Forward pass and loss
            out = model(xb)
            batch_loss = loss_func(out, yb)

            # Report once per epoch, on the first batch
            if start == 0:
                print("loss:", batch_loss.item(), "acc: ", accuracy(out, yb).item())

            batch_loss.backward()

            # Update the weights, then clear the gradients for the next batch
            opt.step()
            opt.zero_grad()


# Train with the hand-written Optimizer
fit()

loss: 2.2822203636169434 acc:  0.15625
loss: 0.1406371295452118 acc:  0.9375
loss: 0.11584998667240143 acc:  0.953125
loss: 0.15190747380256653 acc:  0.953125
loss: 0.17144279181957245 acc:  0.96875


In [31]:
# PyTorch gives us this as well
from torch import optim


def get_model():
    """Build a fresh Linear -> ReLU -> Linear network and an SGD optimizer for it.

    Reads the globals `n_pixels`, `n_hidden` and `lr`.

    Returns:
        A (model, optimizer) tuple.
    """
    net = nn.Sequential(
        nn.Linear(n_pixels, n_hidden),
        nn.ReLU(),
        nn.Linear(n_hidden, 10),
    )
    sgd = optim.SGD(net.parameters(), lr=lr)
    return net, sgd


# Rebind the globals that fit() reads
model, opt = get_model()


def fit():
    """Train the global `model` for `epochs` passes, stepping the global `opt` per minibatch."""
    for _ in range(epochs):
        for lo in range(0, n_examples, batch_size):
            # Clamp the upper bound so the final batch stops at the end of the data
            hi = min(n_examples, lo + batch_size)
            xb, yb = x_train[lo:hi], y_train[lo:hi]

            # Forward pass and loss
            out = model(xb)
            batch_loss = loss_func(out, yb)

            # Report once per epoch, on the first batch
            first_batch = lo == 0
            if first_batch:
                print("loss:", batch_loss.item(), "acc: ", accuracy(out, yb).item())

            batch_loss.backward()

            # Update the weights, then clear the gradients for the next batch
            opt.step()
            opt.zero_grad()


# Train with optim.SGD
fit()

loss: 2.292577028274536 acc:  0.109375
loss: 0.1324172019958496 acc:  0.953125
loss: 0.08873302489519119 acc:  0.96875
loss: 0.058555759489536285 acc:  0.984375
loss: 0.032626863569021225 acc:  1.0


## Dataset and DataLoader

What if we want to more easily iterate through minibatches of x and y?

In [32]:
class Dataset:
    """Pairs inputs with targets so that both can be indexed together."""

    def __init__(self, x, y):
        self.x, self.y = x, y

    def __len__(self):
        # The length is taken from the inputs only
        return len(self.x)

    def __getitem__(self, i):
        # The same index is applied to both containers
        inputs = self.x[i]
        targets = self.y[i]
        return inputs, targets


# One dataset for training and one for validation
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

len(train_ds), len(valid_ds)

(50000, 10000)

In [33]:
# __getitem__ also accepts slices, because the index is passed straight through to the tensors
train_ds[0:5], train_ds[0:5][0].shape, train_ds[0:5][1].shape

((tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  tensor([5, 0, 4, 1, 9])),
 torch.Size([5, 784]),
 torch.Size([5]))

In [34]:
# We can now simplify the slicing


def fit():
    """Train the global `model`, reading each minibatch as one slice of `train_ds`."""
    for epoch in range(epochs):
        for batch_start in range(0, n_examples, batch_size):
            # One slice of the dataset returns both the inputs and the targets
            # (the end is clamped so the last batch stops at the end of the data)
            batch_inp, batch_targets = train_ds[batch_start : min(n_examples, batch_start + batch_size)]

            # Run the model
            preds = model(batch_inp)
            loss = loss_func(preds, batch_targets)

            # Report once per epoch, on the first batch
            if batch_start == 0:
                print("loss:", loss.item(), "acc: ", accuracy(preds, batch_targets).item())

            loss.backward()

            # Update the weights
            opt.step()
            opt.zero_grad()


# Start from a fresh model and optimizer, then train
model, opt = get_model()
fit()

loss: 2.299999237060547 acc:  0.078125
loss: 0.12770475447177887 acc:  0.9375
loss: 0.15836617350578308 acc:  0.921875
loss: 0.08538192510604858 acc:  0.96875
loss: 0.03836856409907341 acc:  0.984375


In [35]:
# A DataLoader is an iterator that will help us with looping over the minibatches


class DataLoader:
    """Iterates over a dataset in consecutive minibatches.

    Args:
        dataset: object supporting len() and slice indexing.
        batch_size: number of items per batch; the last batch may be smaller.
    """

    def __init__(self, dataset, batch_size):
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        # Stride by this loader's own batch size, not the notebook-level global
        for batch_start in range(0, len(self.dataset), self.batch_size):
            yield self.dataset[batch_start : batch_start + self.batch_size]


# One loader per dataset, both using the global batch size
train_dl = DataLoader(train_ds, batch_size)
valid_dl = DataLoader(valid_ds, batch_size)

In [36]:
# Now we can remove the slicing from our loop
def fit():
    """Train the global `model` on the batches yielded by the global `train_dl`."""
    for _ in range(epochs):
        for xb, yb in train_dl:
            # Forward pass and loss
            out = model(xb)
            batch_loss = loss_func(out, yb)

            batch_loss.backward()

            # Update the weights, then clear the gradients for the next batch
            opt.step()
            opt.zero_grad()
        # Report the metrics of the last batch of the epoch
        print("loss:", batch_loss.item(), "acc: ", accuracy(out, yb).item())


# Start from a fresh model and optimizer, then train from train_dl
model, opt = get_model()
fit()

loss: 0.33352193236351013 acc:  0.9375
loss: 0.0953538790345192 acc:  0.9375
loss: 0.037897951900959015 acc:  1.0
loss: 0.030555281788110733 acc:  1.0
loss: 0.01890832744538784 acc:  1.0


## Random sampling

What if we want our training set to be in a random order that differs every iteration (but keep the validation set the same)?

In [37]:
import random


# Iterates over indices that may or may not be shuffled
class Sampler:
    """Yields every index of a dataset, optionally in random order."""

    def __init__(self, dataset, shuffle=False):
        self.N, self.shuffle = len(dataset), shuffle

    def __iter__(self):
        # Build a fresh index list on every pass, so each pass gets its own order
        order = [*range(self.N)]

        if self.shuffle:
            random.shuffle(order)

        return iter(order)


# First five indices without shuffling, and with shuffling
list(Sampler(train_ds))[:5], list(Sampler(train_ds, True))[:5]

([0, 1, 2, 3, 4], [33876, 20379, 41070, 19168, 38762])

In [38]:
import fastcore.all as fc


# Gets batches of the indices given by a sampler
class BatchSampler:
    """Groups the indices produced by `sampler` into chunks of `batch_size`."""

    def __init__(self, sampler, batch_size, drop_last=False):
        fc.store_attr()  # Stores all inputs as members with same name

    def __iter__(self):
        # Start a new pass over the sampler each time and hand out its chunks one by one
        yield from fc.chunked(iter(self.sampler), self.batch_size, drop_last=self.drop_last)


# First five batches of 2 shuffled indices
list(BatchSampler(Sampler(train_ds, True), 2))[:5]

[[3318, 31199], [8674, 46640], [35082, 21457], [2137, 48900], [38909, 27076]]

In [39]:
# We can update the DataLoader to use a BatchSampler rather than being told a batch size


# We need a collation function to stack all of the Xs and all of the Ys together into tensors
def collate(data):
    """Turn an iterable of (x, y) pairs into a pair of stacked tensors (xs, ys)."""
    # Transpose the pairs into one tuple of inputs and one tuple of targets
    xs, ys = zip(*data)
    stacked_x = torch.stack(xs)
    stacked_y = torch.stack(ys)
    return stacked_x, stacked_y


class DataLoader:
    """Yields one collated batch for every batch of indices from `batch_sampler`."""

    def __init__(self, dataset, batch_sampler, collate_fn=collate):
        fc.store_attr()

    def __iter__(self):
        for idxs in self.batch_sampler:
            # Look the items up lazily and let collate_fn combine them into a batch
            yield self.collate_fn(self.dataset[i] for i in idxs)


# Shuffle the training indices; keep the validation indices in order
train_samp = BatchSampler(Sampler(train_ds, shuffle=True), batch_size)
valid_samp = BatchSampler(Sampler(valid_ds, shuffle=False), batch_size)

train_dl = DataLoader(train_ds, batch_sampler=train_samp)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp)

# Pull a single batch to check its contents and shapes
batch = next(iter(train_dl))
batch, batch[0].shape, batch[1].shape

((tensor([[0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          ...,
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.],
          [0., 0., 0.,  ..., 0., 0., 0.]]),
  tensor([2, 9, 9, 5, 1, 8, 6, 1, 6, 0, 6, 7, 6, 7, 0, 9, 6, 7, 3, 0, 8, 8, 0, 6,
          2, 4, 8, 4, 4, 9, 4, 8, 2, 9, 0, 7, 9, 9, 0, 1, 2, 3, 7, 9, 6, 0, 5, 3,
          5, 8, 7, 6, 5, 2, 0, 1, 1, 9, 2, 4, 3, 5, 1, 3])),
 torch.Size([64, 784]),
 torch.Size([64]))

In [40]:
# Fresh model and optimizer; fit() now reads the shuffled train_dl
model, opt = get_model()
fit()

loss: 0.23163193464279175 acc:  0.9375
loss: 0.021201523020863533 acc:  1.0
loss: 0.08637157082557678 acc:  1.0
loss: 0.007034916430711746 acc:  1.0
loss: 0.3597562313079834 acc:  0.8125


### Multiprocessing DataLoader

What if we want to run this in parallel to speed things up?

In [41]:
# We want to be able to process something a bit like this, but in parallel.
# Each list of indices is looked up as one batch.
for o in map(train_ds.__getitem__, ([3, 6], [8, 1])):
    print(o)

(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 1]))
(tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), tensor([1, 0]))


In [52]:
# Pytorch gives us all of these and supports multiprocessing
from torch.utils.data import DataLoader, SequentialSampler, RandomSampler, BatchSampler

# Same setup as before, built from the PyTorch classes imported above:
# random order for training, sequential order for validation
train_samp = BatchSampler(RandomSampler(train_ds), batch_size, drop_last=False)
valid_samp = BatchSampler(SequentialSampler(valid_ds), batch_size, drop_last=False)

# Our own collate function is still used to stack the items of each batch
train_dl = DataLoader(train_ds, batch_sampler=train_samp, collate_fn=collate)
valid_dl = DataLoader(valid_ds, batch_sampler=valid_samp, collate_fn=collate)

model, opt = get_model()
fit()

loss: 0.175296351313591 acc:  0.9375
loss: 0.05440215766429901 acc:  1.0
loss: 0.13612455129623413 acc:  0.9375
loss: 0.04176424816250801 acc:  1.0
loss: 0.0018899767892435193 acc:  1.0


In [57]:
# We can also pass a batch sampler as the plain sampler, because our dataset can index
# multiple things at once and so needs no collate function.
# batch_size=None turns off PyTorch's automatic batching; without it every batch would
# be wrapped in an extra leading dimension of size 1.
train_dl = DataLoader(train_ds, sampler=train_samp, batch_size=None)
valid_dl = DataLoader(valid_ds, sampler=valid_samp, batch_size=None)

# As random sampling is so common, we can also just pass the batch size and shuffle flags
# along with the dataset, and PyTorch generates the samplers and the BatchSampler for us.
# batch_size must be given explicitly: it defaults to 1.
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

model, opt = get_model()
fit()