# Minibatch training

In [1]:
%install-location $cwd/swift-install
%install '.package(path: "$cwd/FastaiNotebook_02a_why_sqrt5")' FastaiNotebook_02a_why_sqrt5

Installing packages:
	.package(path: "/home/sgugger/git/fastai_dev/swift/FastaiNotebook_02a_why_sqrt5")
		FastaiNotebook_02a_why_sqrt5
With SwiftPM flags: []
Working in: /tmp/tmp1p3s2oam/swift-install
/home/sgugger/swift/usr/bin/swift-build: /home/sgugger/anaconda3/lib/libcurl.so.4: no version information available (required by /home/sgugger/swift/usr/lib/swift/linux/libFoundationNetworking.so)
[1/2] Compiling jupyterInstalledPackages jupyterInstalledPackages.swift
[2/3] Merging module jupyterInstalledPackages
Initializing Swift...
Installation complete!


In [2]:
//export
import Path
import TensorFlow

In [3]:
import FastaiNotebook_02a_why_sqrt5

Our labels will be integers from now on, so to go with our `TF` abbreviation, we introduce `TI`.

In [4]:
// export
/// Shorthand for a tensor of `Int32` values, used for integer labels.
public typealias TI = Tensor<Int32>

### Data

We gather the MNIST data like in the previous notebooks.

In [5]:
var (xTrain,yTrain,xValid,yValid) = loadMNIST(path: Path.home/".fastai"/"data"/"mnist_tst", flat: true)

In [6]:
let trainMean = xTrain.mean()
let trainStd  = xTrain.std()

In [7]:
// Both splits are normalized with the statistics of the training set (trainMean, trainStd).
xTrain = normalize(xTrain, mean: trainMean, std: trainStd)
xValid = normalize(xValid, mean: trainMean, std: trainStd)

In [8]:
// n and m are the sizes of the first two dimensions of xTrain; c is the largest label value plus one.
let (n,m) = (xTrain.shape[0],xTrain.shape[1])
let c = yTrain.max().scalarized()+1
print(n,m,c)

60000 784 10


We also define a simple model using our `FADense` layers.

In [9]:
let nHid = 50

In [10]:
/// A two-layer model built from `FADense` layers.
///
/// `layer1` is created with a `relu` activation; `layer2` is created without an
/// explicit activation argument.
public struct MyModel: Layer {
    public var layer1: FADense<Float>
    public var layer2: FADense<Float>

    /// Builds both layers.
    /// - Parameters:
    ///   - nIn: First size argument of `layer1`.
    ///   - nHid: Second size argument of `layer1` and first size argument of `layer2`.
    ///   - nOut: Second size argument of `layer2`.
    public init(nIn: Int, nHid: Int, nOut: Int) {
        self.layer1 = FADense(nIn, nHid, activation: relu)
        self.layer2 = FADense(nHid, nOut)
    }

    /// Applies `layer1` to `input`, then `layer2` to the result.
    @differentiable
    public func callAsFunction(_ input: TF) -> TF {
        let hidden = layer1(input)
        return layer2(hidden)
    }
}

In [11]:
var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))

In [12]:
let pred = model(xTrain)

### Cross entropy loss

Before we can train our model, we need a loss function. We saw how to write `logSoftMax` from scratch in PyTorch, but let's do it once in Swift too.

In [13]:
/// Direct log-softmax: exponentiate, divide by the sum along axis -1, then take the log.
func logSoftmax<Scalar: TensorFlowFloatingPoint>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> {
    let exponentials = exp(activations)
    let normalizer = exponentials.sum(alongAxes: -1)
    return log(exponentials / normalizer)
}

In [14]:
let smPred = logSoftmax(pred)

In [15]:
yTrain[0..<3]

[5, 0, 4]


In [16]:
(smPred[0][5],smPred[1][0],smPred[2][4])

▿ 3 elements
  - .0 : -2.3993702
  - .1 : -1.1051472
  - .2 : -4.1959057


There is no fancy indexing yet so we have to use gather to get the indices we want out of our softmaxed predictions.

In [17]:
/// Negative log-likelihood: the negated mean of `input[i, target[i]]` over every position `i` of `target`.
///
/// - Parameters:
///   - input: Values to pick from, addressed by (row, target index) pairs.
///   - target: One index per row of `input`.
/// - Returns: The negated mean of the gathered values.
func nll<Scalar: TensorFlowFloatingPoint>(_ input: Tensor<Scalar>, _ target: TI) -> Tensor<Scalar> {
    // Row numbers 0 ..< target.shape[0], one per entry of `target`.
    let rowCount: Int32 = numericCast(target.shape[0])
    let rows: TI = _Raw.range(start: Tensor(0), limit: Tensor(rowCount), delta: Tensor(1))
    // Pair every row number with its target index so gatherNd can address single elements.
    let coordinates = _Raw.concat(
        concatDim: Tensor(1),
        [rows.expandingShape(at: 1), target.expandingShape(at: 1)])
    let picked = _Raw.gatherNd(params: input, indices: coordinates)
    return -picked.mean()
}

In [18]:
nll(smPred, yTrain)

2.47692


In [19]:
time(repeating: 100){ let _ = nll(smPred, yTrain) }

average: 0.9569873800000002 ms,   min: 0.807825 ms,   max: 1.523853 ms


Simplify `logSoftmax` with log formulas.

In [20]:
/// Log-softmax written as `x - log(sum(exp(x)))`, with the sum taken along axis -1.
func logSoftmax<Scalar: TensorFlowFloatingPoint>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> {
    let logNormalizer = log(exp(activations).sum(alongAxes: -1))
    return activations - logNormalizer
}

In [21]:
let smPred = logSoftmax(pred)

In [22]:
nll(smPred, yTrain)

2.47692


We now use the LogSumExp trick.

In [23]:
smPred.max(alongAxes: -1).shape

▿ [60000, 1]
  ▿ dimensions : 2 elements
    - 0 : 60000
    - 1 : 1


In [24]:
/// Computes `log(sum(exp(x)))` along axis -1, subtracting the maximum along that axis
/// before exponentiating and adding it back afterwards.
func logSumExp<Scalar: TensorFlowFloatingPoint>(_ x: Tensor<Scalar>) -> Tensor<Scalar> {
    let peak = x.max(alongAxes: -1)
    let shifted = x - peak
    let summed = exp(shifted).sum(alongAxes: -1)
    return peak + log(summed)
}

In [25]:
/// Log-softmax expressed through `logSumExp`: `activations - logSumExp(activations)`.
func logSoftmax<Scalar: TensorFlowFloatingPoint>(_ activations: Tensor<Scalar>) -> Tensor<Scalar> {
    let logNormalizer = logSumExp(activations)
    return activations - logNormalizer
}

In [26]:
let smPred = logSoftmax(pred)

In [27]:
nll(smPred, yTrain)

2.47692


In S4TF nll loss is combined with softmax in:

In [28]:
let loss = softmaxCrossEntropy(logits: pred, labels: yTrain)
loss

2.47692


In [29]:
time(repeating: 100){ _ = nll(logSoftmax(pred), yTrain)}

average: 1.0426967300000003 ms,   min: 0.909199 ms,   max: 1.732276 ms


In [30]:
time(repeating: 100){ _ = softmaxCrossEntropy(logits: pred, labels: yTrain)}

average: 0.3443732900000001 ms,   min: 0.287463 ms,   max: 0.496555 ms


## Basic training loop

Basically the training loop repeats over the following steps:
- get the output of the model on a batch of inputs
- compare the output to the labels we have and compute a loss
- calculate the gradients of the loss with respect to every parameter of the model
- update said parameters with those gradients to make them a little bit better

In [31]:
// export
/// Fraction of positions where the argmax of `output` along axis 1 equals `target`.
///
/// - Parameters:
///   - output: Scores; the index of the largest value along axis 1 is the prediction.
///   - target: Expected indices.
/// - Returns: The mean of the element-wise matches, as a `TF`.
public func accuracy(_ output: TF, _ target: TI) -> TF {
    let predicted = output.argmax(squeezingAxis: 1)
    let hits = TF(predicted .== target)
    return hits.mean()
}

We have a raw model for now, so it should be as good as random: 10% accuracy.

In [32]:
print(accuracy(pred, yTrain))

0.10123333


So let's begin with a minibatch.

In [33]:
let bs=64                     // batch size
let xb = xTrain[0..<bs]       // a mini-batch from x
let preds = model(xb)         // predictions
print(preds[0], preds.shape)

[  1.4583064,  0.83652353,   1.0724789, -0.41211492,  -1.1187531,  0.70039546,   0.2566451,
  -0.6969434, -0.48053584,   1.9810728] [64, 10]


Then we can compute a loss

In [34]:
let yb = yTrain[0..<bs]
let loss = softmaxCrossEntropy(logits: preds, labels: yb)

In [35]:
print(accuracy(preds, yb))

0.125


In [36]:
let lr:Float = 0.5   // learning rate
let epochs = 1       // how many epochs to train for

Then we can get our loss and gradients.

Sometimes you'll see closures written this way (required if there is >1 statement in it).

In [37]:
// Computes the loss on the (xb, yb) mini-batch together with its gradient with respect to `model`.
let (loss, grads) = valueWithGradient(at: model) { model -> TF in
    let preds = model(xb)
    return softmaxCrossEntropy(logits: preds, labels: yb)
}

The full loop by hand would look like this:

In [38]:
// Hand-written SGD: walk the training set in consecutive mini-batches and update
// each parameter tensor explicitly.
for _ in 1 ... epochs {
    // Iterate over batch start offsets so the final, possibly shorter, batch is included.
    for startIdx in stride(from: 0, to: n, by: bs) {
        // Clamp the end so the last batch stops at the end of the data.
        let endIdx = min(startIdx + bs, n)
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        // Only the gradients are needed for the update; the loss value is discarded.
        let (_, grads) = valueWithGradient(at: model) {
            softmaxCrossEntropy(logits: $0(xb), labels: yb)
        }
        model.layer1.weight -= lr * grads.layer1.weight
        model.layer1.bias   -= lr * grads.layer1.bias
        model.layer2.weight -= lr * grads.layer2.weight
        model.layer2.bias   -= lr * grads.layer2.bias
    }
}

In [39]:
let preds = model(xValid)
accuracy(preds, yValid)

0.8961


`>80%` in one epoch, not too bad!

When we get the gradients of our model, we have another structure of the same type, and it's possible to perform basic arithmetic on those structures to make the update step super simple:

In [40]:
// Same loop as the hand-written one, but the whole update is a single step along the
// gradients scaled by -lr.
for _ in 1 ... epochs {
    // Iterate over batch start offsets so the final, possibly shorter, batch is included.
    for startIdx in stride(from: 0, to: n, by: bs) {
        // Clamp the end so the last batch stops at the end of the data.
        let endIdx = min(startIdx + bs, n)
        let xb = xTrain[startIdx..<endIdx]
        let yb = yTrain[startIdx..<endIdx]
        // Only the gradients are needed for the update; the loss value is discarded.
        let (_, grads) = valueWithGradient(at: model) {
            softmaxCrossEntropy(logits: $0(xb), labels: yb)
        }
        model.move(along: grads.scaled(by: -lr))
    }
}

Then we can use a S4TF optimizer to do the step for us (which doesn't win much just yet - but will be nice when we can use momentum, adam, etc). An optimizer takes a `Layer` object and some gradients, and will perform the update.

In [41]:
let optimizer = SGD(for: model, learningRate: lr)

Here's a handy function (thanks to Alexis Gallagher) to grab a batch of indices at a time.

In [42]:
//export
/// Splits `start ..< end` into consecutive half-open ranges of at most `bs` indices.
///
/// Every range holds `bs` indices except possibly the last, which holds whatever remains.
/// The sequence is empty when `end <= start`.
///
/// - Parameters:
///   - start: Lower bound of the first range.
///   - end: Exclusive upper bound of the last range.
///   - bs: Maximum number of indices per range; must be positive.
/// - Returns: A sequence that produces the ranges one at a time as it is iterated.
public func batchedRanges(start:Int, end:Int, bs:Int) -> UnfoldSequence<Range<Int>,Int>
{
  // A zero batch size would yield empty ranges forever, and a negative one would
  // form an invalid range, so reject both up front with a clear message.
  precondition(bs > 0, "batchedRanges requires a positive batch size, got \(bs)")
  return sequence(state: start) { (batchStart) -> Range<Int>? in
    let remaining = end - batchStart
    guard remaining > 0 else { return nil }
    let batchEnd = batchStart + min(bs, remaining)
    // Advance the state only after the current range has been formed.
    defer { batchStart = batchEnd }
    return batchStart ..< batchEnd
  }
}

In [43]:
// One pass per epoch over index ranges from batchedRanges, with the optimizer applying the update.
for _ in 1 ... epochs {
    for batchRange in batchedRanges(start: 0, end: n, bs: bs) {
        let inputs = xTrain[batchRange]
        let labels = yTrain[batchRange]
        let (_, grads) = valueWithGradient(at: model) {
            softmaxCrossEntropy(logits: $0(inputs), labels: labels)
        }
        optimizer.update(&model, along: grads)
    }
}

## Dataset

We can create a Swift `Dataset` from our arrays. It will automatically batch things for us:

In [44]:
// export
/// A pair of inputs (`xb`) and labels (`yb`) that is itself a `TensorGroup`.
///
/// Its tensor handles are the handles of `xb` followed by the handles of `yb`.
public struct DataBatch<Inputs: Differentiable & TensorGroup, Labels: TensorGroup>: TensorGroup {
    public var xb: Inputs
    public var yb: Labels

    /// Stores the given inputs and labels.
    public init(xb: Inputs, yb: Labels) {
        self.xb = xb
        self.yb = yb
    }

    /// Handles of `xb` followed by handles of `yb`.
    public var _tensorHandles: [_AnyTensorHandle] {
        var handles = xb._tensorHandles
        handles.append(contentsOf: yb._tensorHandles)
        return handles
    }

    /// Rebuilds a batch from a flat collection of handles: the first
    /// `Inputs._tensorHandleCount` handles form `xb`, the remainder form `yb`.
    public init<C: RandomAccessCollection>(_handles: C) where C.Element: _AnyTensorHandle {
        let inputCount = Int(Inputs._tensorHandleCount)
        let split = _handles.index(_handles.startIndex, offsetBy: inputCount)
        self.xb = Inputs(_handles: _handles[..<split])
        self.yb = Labels(_handles: _handles[split...])
    }
}

In [45]:
let trainDs = Dataset(elements:DataBatch(xb:xTrain, yb:yTrain)).batched(bs)

In [46]:
// Train over the batched dataset; each element carries its inputs as `xb` and labels as `yb`.
for _ in 1...epochs {
    for batch in trainDs {
        // Read the data from `batch`: a bare `xb`/`yb` would resolve to the fixed
        // mini-batch defined earlier, so every step would train on the same samples.
        let (_, grads) = valueWithGradient(at: model) {
            softmaxCrossEntropy(logits: $0(batch.xb), labels: batch.yb)
        }
        optimizer.update(&model, along: grads)
    }
}

This `Dataset` can also do the shuffle for us:

In [47]:
// Same as the plain dataset loop, but iterating over a shuffled view of the dataset.
for _ in 1...epochs {
    for batch in trainDs.shuffled(sampleCount: yTrain.shape[0], randomSeed: 42) {
        // Read the data from `batch`: a bare `xb`/`yb` would resolve to the fixed
        // mini-batch defined earlier, so every step would train on the same samples.
        let (_, grads) = valueWithGradient(at: model) {
            softmaxCrossEntropy(logits: $0(batch.xb), labels: batch.yb)
        }
        optimizer.update(&model, along: grads)
    }
}

### Training loop

With everything before, we can now write a generic training loop. It needs two generic types: the optimizer (`Opt`) and the labels (`Label`):

In [48]:
/// Runs one pass over `ds`, updating `model` with `opt` after every batch.
///
/// - Parameters:
///   - model: The model to train; updated in place.
///   - ds: Dataset of `DataBatch` elements whose `xb` is fed to the model and whose `yb` is passed to `lossFunc`.
///   - opt: The optimizer that applies each gradient step.
///   - lossFunc: Maps the model output and the labels to a loss tensor; differentiated with respect to the output only.
public func train<Opt: Optimizer, Label:TensorGroup>(
    _ model: inout Opt.Model,
    on ds: Dataset<DataBatch<Opt.Model.Input, Label>>,
    using opt: inout Opt,
    lossFunc: @escaping @differentiable (Opt.Model.Output, @noDerivative Label) -> Tensor<Opt.Scalar>
) where Opt.Model: Layer,
        Opt.Model.Input: TensorGroup,
        Opt.Scalar: TensorFlowFloatingPoint
{
    for batch in ds {
        let inputs = batch.xb
        let labels = batch.yb
        // The loss value itself is not used; only the gradients drive the update.
        let (_, gradients) = valueWithGradient(at: model) { candidate -> Tensor<Opt.Scalar> in
            lossFunc(candidate(inputs), labels)
        }
        opt.update(&model, along: gradients)
    }
}

In [49]:
var model = MyModel(nIn: m, nHid: nHid, nOut: Int(c))
var optimizer = SGD(for: model, learningRate: lr)

We can't directly use `softmaxCrossEntropy` because it has a reduction parameter, so we define a fastai version.

In [50]:
//export
/// Two-argument wrapper around `softmaxCrossEntropy(logits:labels:)`, differentiable
/// with respect to `logits` only.
///
/// - Parameters:
///   - logits: Model outputs forwarded as `logits`.
///   - labels: Integer labels forwarded as `labels`.
/// - Returns: The value returned by `softmaxCrossEntropy`.
@differentiable(wrt: logits)
public func crossEntropy(_ logits: TF, _ labels: TI) -> TF {
    softmaxCrossEntropy(logits: logits, labels: labels)
}

In [51]:
train(&model, on: trainDs, using: &optimizer, lossFunc: crossEntropy)

In [52]:
let preds = model(xValid)
accuracy(preds, yValid)

0.8904


### Export

In [53]:
import NotebookExport
let exporter = NotebookExport(Path.cwd/"03_minibatch_training.ipynb")
print(exporter.export(usingPrefix: "FastaiNotebook_"))

success
