# Implementation of Multilayer Perceptron from Scratch

:label:`sec_mlp_scratch`


Now that we have characterized 
multilayer perceptrons (MLPs) mathematically, 
let us try to implement one ourselves.

In [1]:
%use @file[../djl.json]
%use lets-plot
@file:DependsOn("org.apache.commons:commons-lang3:3.12.0")
import ai.djl.basicdataset.cv.classification.FashionMnist
import ai.djl.metric.Metrics

/**
 * Fixed-size bank of running float sums.
 *
 * @param n number of independent sums to keep
 */
class Accumulator(n: Int) {
    // A freshly allocated FloatArray is already zero-filled.
    val data = FloatArray(n)

    /* Adds args[i] to the i-th running sum, for every index of args */
    fun add(args: FloatArray) {
        args.forEachIndexed { slot, increment -> data[slot] += increment }
    }

    /* Sets every running sum back to zero */
    fun reset() = data.fill(0f)

    /* Returns the running sum stored at the given index */
    fun get(index: Int): Float = data[index]
}

/**
 * Reads the JVM system property [nm] as a `Long`.
 *
 * @param nm name of the system property to look up
 * @param n value returned when the property is not set
 * @return the parsed property value, or [n] if the property is absent
 * @throws NumberFormatException if the property is set but is not a valid long
 */
fun getLong(nm: String, n: Long): Long =
    System.getProperty(nm)?.toLong() ?: n

/**
 * Helper routines shared by the "from scratch" chapters: a linear model, a squared loss,
 * a mini-batch SGD updater and evaluation utilities, all operating on DJL `NDArray`s.
 */
object Training {

    /** Linear-regression forward pass: returns `X.dot(w).add(b)`. */
    fun linreg(X: NDArray, w: NDArray, b: NDArray): NDArray {
        return X.dot(w).add(b)
    }

    /**
     * Element-wise squared loss `(yHat - y)^2 / 2`.
     *
     * [y] is reshaped to the shape of [yHat] before the subtraction.
     */
    fun squaredLoss(yHat: NDArray, y: NDArray): NDArray {
        // Compute the residual once and reuse it for the square.
        val diff = yHat.sub(y.reshape(yHat.getShape()))
        return diff.mul(diff).div(2)
    }

    /**
     * Mini-batch stochastic gradient descent step.
     *
     * Updates every array in [params] in place:
     * `param = param - param.gradient * lr / batchSize`.
     */
    fun sgd(params: NDList, lr: Float, batchSize: Int) {
        for (param in params) {
            // subi mutates param in place, so the caller's NDList sees the update.
            param.subi(param.getGradient().mul(lr).div(batchSize))
        }
    }

    /**
     * Same update as the three-argument [sgd], but each gradient is first attached to
     * [subManager]. The caller can later close [subManager] to release the arrays attached
     * to it, which matters most when training for many epochs on a lot of data.
     */
    fun sgd(params: NDList, lr: Float, batchSize: Int, subManager: NDManager) {
        for (param in params) {
            // Update param in place.
            // param = param - param.gradient * lr / batchSize
            val gradient = param.getGradient()
            gradient.attach(subManager)
            param.subi(gradient.mul(lr).div(batchSize))
        }
    }

    /**
     * Counts how many predictions in [yHat] match the labels [y].
     *
     * @return the number of matching entries as a `Float` (a count, not a ratio)
     */
    fun accuracy(yHat: NDArray, y: NDArray): Float {
        // When the size along axis 1 is greater than 1, yHat is reduced with argMax(1);
        // otherwise it is compared with the labels as-is.
        val predictions = if (yHat.getShape().size(1) > 1) yHat.argMax(1) else yHat
        // Compare as INT32 on both sides, then sum the matches.
        return predictions.toType(DataType.INT32, false)
            .eq(y.toType(DataType.INT32, false))
            .sum()
            .toType(DataType.FLOAT32, false)
            .getFloat()
    }

    /**
     * Fits [trainer] with `EasyTrain.fit` and copies the per-epoch evaluator metrics into
     * [evaluatorMetrics] under the keys `train_epoch_<name>` and `validate_epoch_<name>`.
     *
     * @return `metrics.mean("epoch")` of the metrics collected during the fit
     */
    fun trainingChapter6(
        trainIter: ArrayDataset,
        testIter: ArrayDataset,
        numEpochs: Int,
        trainer: Trainer,
        evaluatorMetrics: MutableMap<String, DoubleArray>
    ): Double {

        trainer.setMetrics(Metrics())

        EasyTrain.fit(trainer, numEpochs, trainIter, testIter)

        val metrics = trainer.getMetrics()

        // A plain loop is used on purpose: writing `forEach { evaluator -> { ... } }`
        // only creates an inner lambda that is never invoked, leaving the map empty.
        for (evaluator in trainer.getEvaluators()) {
            for (prefix in listOf("train_epoch_", "validate_epoch_")) {
                val key = prefix + evaluator.getName()
                evaluatorMetrics[key] = metrics.getMetric(key).stream()
                    .mapToDouble { x -> x.getValue() }
                    .toArray()
            }
        }

        return metrics.mean("epoch")
    }

    /* Softmax-regression-scratch */
    /**
     * Fraction of correctly classified examples of [net] over all batches of [dataIterator].
     * Every batch is closed after it has been consumed, even if evaluation throws.
     */
    fun evaluateAccuracy(net: UnaryOperator<NDArray>, dataIterator: Iterable<Batch>): Float {
        val metric = Accumulator(2) // numCorrectExamples, numExamples
        for (batch in dataIterator) {
            try {
                val X = batch.getData().head()
                val y = batch.getLabels().head()
                metric.add(floatArrayOf(accuracy(net.apply(X), y), y.size().toFloat()))
            } finally {
                batch.close()
            }
        }
        return metric.get(0) / metric.get(1)
    }
    /* End Softmax-regression-scratch */

    /* MLP */
    /**
     * Average of [loss] for [net] over all examples of [dataIterator]: the summed loss
     * divided by the total number of label entries.
     * Every batch is closed after it has been consumed, even if evaluation throws.
     */
    fun evaluateLoss(
        net: UnaryOperator<NDArray>,
        dataIterator: Iterable<Batch>,
        loss: BinaryOperator<NDArray>
    ): Float {
        val metric = Accumulator(2) // sumLoss, numExamples

        for (batch in dataIterator) {
            try {
                val X = batch.getData().head()
                val y = batch.getLabels().head()
                metric.add(
                    floatArrayOf(loss.apply(net.apply(X), y).sum().getFloat(), y.size().toFloat())
                )
            } finally {
                batch.close()
            }
        }
        return metric.get(0) / metric.get(1)
    }
    /* End MLP */
}

// %load ../utils/djl-imports
// %load ../utils/plot-utils
// %load ../utils/DataPoints.java
// %load ../utils/Training.java
// %load ../utils/Accumulator.java

In [2]:
import ai.djl.basicdataset.cv.classification.*
import org.apache.commons.lang3.ArrayUtils

To compare against our previous results
achieved with (linear) softmax regression
(:numref:`sec_softmax_scratch`),
we will continue to work with
the Fashion-MNIST image classification dataset 
(:numref:`sec_fashion_mnist`).

In [3]:
// Mini-batch size shared by the training and the test iterator.
val batchSize = 256

// Training split of Fashion-MNIST. The DATASET_LIMIT system property, when set,
// is passed to optLimit; otherwise Long.MAX_VALUE is used.
val trainIter = FashionMnist.builder()
    .optUsage(Dataset.Usage.TRAIN)
    .setSampling(batchSize, true)
    .optLimit(getLong("DATASET_LIMIT", Long.MAX_VALUE))
    .build()
    .apply { prepare() }

// Test split, configured the same way as the training split.
val testIter = FashionMnist.builder()
    .optUsage(Dataset.Usage.TEST)
    .setSampling(batchSize, true)
    .optLimit(getLong("DATASET_LIMIT", Long.MAX_VALUE))
    .build()
    .apply { prepare() }

## Initializing Model Parameters

Recall that Fashion-MNIST contains $10$ classes,
and that each image consists of a $28 \times 28 = 784$
grid of (black and white) pixel values.
Again, we will disregard the spatial structure
among the pixels (for now),
so we can think of this as simply a classification dataset
with $784$ input features and $10$ classes.
To begin, we will implement an MLP
with one hidden layer and $256$ hidden units.
Note that we can regard both of these quantities
as *hyperparameters* and ought in general
to set them based on performance on validation data.
Typically, we choose layer widths in powers of $2$,
which tend to be computationally efficient because
of how memory is allotted and addressed in hardware.

Again, we will represent our parameters with several `NDArray`s.
Note that *for every layer*, we must keep track of
one weight matrix and one bias vector.
As always, we call `setRequiresGradient(true)` to allocate memory
for the gradients (of the loss) with respect to these parameters.

In [4]:
// Layer sizes: 28 x 28 = 784 input pixels, 256 hidden units, 10 output classes.
val numInputs = 784L
val numOutputs = 10L
val numHiddens = 256L

val manager = NDManager.newBaseManager()

// Hidden layer: weights from randomNormal(0f, 0.01f, ...), bias starts at zero.
val W1 = manager.randomNormal(0f, 0.01f, Shape(numInputs, numHiddens), DataType.FLOAT32)
val b1 = manager.zeros(Shape(numHiddens))

// Output layer, initialised the same way.
val W2 = manager.randomNormal(0f, 0.01f, Shape(numHiddens, numOutputs), DataType.FLOAT32)
val b2 = manager.zeros(Shape(numOutputs))

val params = NDList(W1, b1, W2, b2)

// Request gradient tracking for every parameter.
params.forEach { param -> param.setRequiresGradient(true) }

## Activation Function

To make sure we know how everything works,
we will implement the ReLU activation ourselves
using the `maximum` function rather than 
invoking `Activation.relu` directly.

In [5]:
/** ReLU activation: element-wise maximum of [X] and zero. */
fun relu(X: NDArray): NDArray = X.maximum(0f)

## The model

Because we are disregarding spatial structure, 
we `reshape` each 2D image into 
a flat vector of length  `numInputs`.
Finally, we implement our model 
with just a few lines of code.

In [6]:
/**
 * Forward pass of the one-hidden-layer MLP.
 *
 * [X] is reshaped to `(-1, numInputs)`, passed through the hidden layer
 * (`W1`, `b1`, then [relu]) and finally through the output layer (`W2`, `b2`).
 */
fun net(X: NDArray): NDArray {
    val flattened = X.reshape(Shape(-1, numInputs))
    val hidden = relu(flattened.dot(W1).add(b1))
    return hidden.dot(W2).add(b2)
}

## The Loss Function

To ensure numerical stability,
and because we already implemented
the softmax function from scratch
(:numref:`sec_softmax_scratch`),
we leverage DJL's integrated function
for calculating the softmax and cross-entropy loss.
Recall our earlier discussion of these intricacies 
(:numref:`sec_mlp`).
We encourage the interested reader 
to examine the source code for `Loss.SoftmaxCrossEntropyLoss`
to deepen their knowledge of implementation details.

In [7]:
// Combined softmax + cross-entropy loss from DJL; the training loop below calls
// loss.evaluate(labels, predictions) on the raw outputs of net().
val loss = Loss.softmaxCrossEntropyLoss()

## Training

Fortunately, the training loop for MLPs
is exactly the same as for softmax regression.

We run the training just as we did in Chapter 3
(see :numref:`sec_softmax_scratch`),
setting the number of epochs to $10$ 
and the learning rate to $0.5$.

In [8]:
// Number of epochs: taken from the MAX_EPOCH system property, defaulting to 10.
val numEpochs = Integer.getInteger("MAX_EPOCH", 10)
// Learning rate handed to the SGD updater.
val lr = 0.5f

// Per-epoch history, filled in by the training loop and plotted afterwards.
val trainLoss: MutableList<Float> = mutableListOf()
val trainAccuracy: MutableList<Float> = mutableListOf()
val testAccuracy: MutableList<Float> = mutableListOf()
val epochCount: MutableList<Int> = mutableListOf()

In [9]:
// Running totals for the current epoch; both are reset after being recorded.
var epochLoss = 0f
var accuracyVal = 0f

for (epoch in 1..numEpochs) {
    print("Running epoch ${epoch}...... ")

    // ---- Training pass ----
    for (batch in trainIter.getData(manager)) {
        try {
            val X = batch.getData().head()
            val y = batch.getLabels().head()

            // The collector is closed in `finally` so it is released even if the
            // forward or backward pass throws.
            val gc = Engine.getInstance().newGradientCollector()
            try {
                val yHat = net(X) // net function call

                val lossValue = loss.evaluate(NDList(y), NDList(yHat))
                // Scale by batchSize here; Training.sgd divides by batchSize again.
                val l = lossValue.mul(batchSize)

                accuracyVal += Training.accuracy(yHat, y)
                epochLoss += l.sum().getFloat()

                gc.backward(l) // gradient calculation
            } finally {
                gc.close()
            }
        } finally {
            batch.close()
        }
        Training.sgd(params, lr, batchSize) // updater
    }

    // Totals are divided by the dataset size to obtain per-example averages.
    trainLoss.add(epochLoss / trainIter.size())
    trainAccuracy.add(accuracyVal / trainIter.size())

    epochLoss = 0f
    accuracyVal = 0f

    // ---- Evaluation pass on the test set ----
    for (batch in testIter.getData(manager)) {
        try {
            val X = batch.getData().head()
            val y = batch.getLabels().head()

            val yHat = net(X) // net function call
            accuracyVal += Training.accuracy(yHat, y)
        } finally {
            // Close test batches as well; otherwise their arrays pile up every epoch.
            batch.close()
        }
    }

    testAccuracy.add(accuracyVal / testIter.size())
    epochCount.add(epoch)
    accuracyVal = 0f
    println("Finished epoch ${epoch}")
}

println("Finished training!")

println("Finished training!");

Running epoch 1...... Finished epoch 1
Running epoch 2...... Finished epoch 2
Running epoch 3...... Finished epoch 3
Running epoch 4...... Finished epoch 4
Running epoch 5...... Finished epoch 5
Running epoch 6...... Finished epoch 6
Running epoch 7...... Finished epoch 7
Running epoch 8...... Finished epoch 8
Running epoch 9...... Finished epoch 9
Running epoch 10...... Finished epoch 10
Finished training!


In [10]:
// One legend label per recorded point, so that each series gets its own colour.
val trainLabel = Array(trainLoss.size) { "train loss" }
val accLabel = Array(trainAccuracy.size) { "train acc" }
val testLabel = Array(testAccuracy.size) { "test acc" }

// Long-format table: the three series are stacked one after another.
val data = mapOf(
    "epochCount" to epochCount + epochCount + epochCount,
    "loss" to trainLoss + trainAccuracy + testAccuracy,
    "lossLabel" to trainLabel + accLabel + testLabel
)

var plot = letsPlot(data) + geomLine {
    x = "epochCount"
    y = "loss"
    color = "lossLabel"
}
plot + ggsize(500, 500)

## Summary

We saw that implementing a simple MLP is easy, 
even when done manually.
That said, with a large number of layers, 
this can still get messy 
(e.g., naming and keeping track of our model's parameters).

## Exercises

1. Change the value of the hyperparameter `numHiddens` and see how this hyperparameter influences your results. Determine the best value of this hyperparameter, keeping all others constant.
1. Try adding an additional hidden layer to see how it affects the results.
1. How does changing the learning rate alter your results? Fixing the model architecture and other hyperparameters (including number of epochs), what learning rate gives you the best results? 
1. What is the best result you can get by optimizing over all the parameters (learning rate, iterations, number of hidden layers, number of hidden units per layer) jointly? 
1. Describe why it is much more challenging to deal with multiple hyperparameters. 
1. What is the smartest strategy you can think of for structuring a search over multiple hyperparameters?

