# Setup

Note: some of these packages may not be installed or registered yet; you may need to `Pkg.add`, `Pkg.clone`, or `Pkg.checkout` them before running this notebook.

In [1]:
# this re-exports Transformations, StochasticOptimization, Penalties, and ObjectiveFunctions
using Learn

# my version of ML iteration.  Hopefully will be replaced with what's currently in MLDataUtils dev branch
using StochasticOptimization.Iteration

import MLDataUtils: rescale!

# for loading the data
import MNIST

# for plotting
using StatPlots, MLPlots
# select the GR backend for Plots and set plot-wide defaults:
# no legend, and lines drawn at 50% opacity
gr(leg=false, linealpha=0.5)

Plots.GRBackend()

# Helper functions

In [2]:
# create a one-hot matrix given class labels
#
# `y` holds zero-based class labels (label `k` sets row `k+1`); each label is rounded to
# the nearest integer first, so Float64 labels such as 3.0 are accepted.
#
# `nclasses` fixes the number of rows of the result.  Pass it explicitly when several
# label vectors (e.g. a train and a test split) must produce matrices of the same
# height; a vector that happens to lack the highest label would otherwise come out
# shorter.  When omitted, the class count is inferred as `maximum(label) + 1`.
#
# returns a `nclasses x length(y)` Float64 matrix with exactly one 1.0 per column.
# throws an ArgumentError if `nclasses` is negative or a label falls outside
# `0:nclasses-1`.
#
# TODO: this should be added as a utility in MLDataUtils
function to_one_hot(y::AbstractVector, nclasses::Integer)
    nclasses >= 0 || throw(ArgumentError("nclasses must be non-negative, got $nclasses"))
    hot = zeros(Float64, nclasses, length(y))
    for (i, yi) in enumerate(y)
        row = round(Int, yi) + 1
        # fail with a clear message instead of an opaque BoundsError
        if !(1 <= row <= nclasses)
            throw(ArgumentError("label $yi is outside the valid range 0:$(nclasses - 1)"))
        end
        hot[row, i] = 1.0
    end
    return hot
end

# infer the number of classes from the largest label present in `y`
function to_one_hot(y::AbstractVector)
    # no labels means no classes can be inferred; `maximum` would throw on empty input
    isempty(y) && return zeros(Float64, 0, 0)
    # clamp at zero so an all-negative input reaches the label check above
    return to_one_hot(y, max(maximum(round(Int, yi) + 1 for yi in y), 0))
end

# evaluate `obj` on `totcount` observations picked at random from `testdata`
# returns a tuple: (loss summed over the picked observations, fraction of them
# that were classified correctly)
function my_test_loss(obj, testdata, totcount = 500)
    loss_sum = 0.0
    nhits = 0
    picked = rand(each_obs(testdata), totcount)
    for (input, target) in each_obs(picked)
        # the value returned by transform! is accumulated as this observation's loss
        loss_sum += transform!(obj, target, input)

        # logistic version:
        # ŷ = output_value(obj.transformation)[1]
        # correct = (ŷ > 0.5 && y > 0.5) || (ŷ <= 0.5 && y < 0.5)

        # softmax version: a hit when the largest output lines up with a
        # positive entry of the target
        outputs = output_value(obj.transformation)
        nhits += target[indmax(outputs)] > 0
    end
    return loss_sum, nhits / totcount
end

my_test_loss (generic function with 2 methods)

# Set up the dataset

In [3]:
# our data:
# each call returns an (inputs, labels) pair
x_train, y_train = MNIST.traindata()
x_test, y_test = MNIST.testdata()

# normalize the input data given μ/σ for the input training data
# note: scale both train and test sets using the train data
# (the statistics returned for the training inputs are reused for the test inputs, so
# nothing is estimated from the test set)
μ, σ = rescale!(x_train)
rescale!(x_test, μ, σ)

# convert y data to one-hot
# NOTE(review): to_one_hot is applied to each label vector independently and sizes its
# result from that vector's largest label, so both splits must contain the highest
# class for the two matrices to have the same number of rows
y_train, y_test = map(to_one_hot, (y_train, y_test))

# optional: limit to only 0/1 digits for easier training
# to_isone(y::AbstractVector) = (z = Array(eltype(y), 1, length(y)); map!(yi->float(yi==1.0), z, y))
# y_train, y_test = map(to_isone, (y_train, y_test))
# train = filterobs(i -> y_train[i] < 1.5, x_train, y_train)
# test = filterobs(i -> y_test[i] < 1.5, x_test, y_test)

# store as tuples to make it easier
# (`train` is what gets batched for learning; `test` is sampled by the tracer callback)
train = (x_train, y_train)
test = (x_test, y_test);

# Construct our model and objective function

In [4]:
# layer sizes: 784 inputs, two hidden layers of 100 units each, 10 outputs
nin, nh, nout = 784, [100,100], 10

# create a feedforward neural net with softplus activations and softmax output
# note the argument order: input size, output size, then the hidden layer sizes
t = nnet(nin, nout, nh, :softplus, :softmax)

# create an objective function with L2 penalty and an implicit cross entropy loss layer
# (1e-5 is presumably the penalty strength — confirm against Penalties.L2Penalty)
penalty = L2Penalty(1e-5)
obj = objective(t, penalty)

ObjectiveFunctions.RegularizedObjective{Transformations.Chain{Float64,Transformations.Params{SubArray{Float64,1,Array{Float64,1},Tuple{UnitRange{Int64}},true},Tuple{},Tuple{}}},ObjectiveFunctions.CrossEntropy{Float64},Penalties.L2Penalty{Float64}}(Chain{Float64}(
   Affine{784-->100}
   softplus{100}
   Affine{100-->100}
   softplus{100}
   Affine{100-->10}
   softmax{10}
) ,ObjectiveFunctions.CrossEntropy{Float64}(10,Transformations.InputNode{:+,Float64,1}(Transformations.Node[Transformations.OutputNode{Float64,1}(Transformations.Node[Transformations.InputNode{:+,Float64,1}(#= circular reference @-4 =#)],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],Dict{Transformations.OutputNode{Float64,1},Int64}()),Transformations.InputNode{:+,Float64,1}(Transformations.Node[],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0],Dict{Transformati

# Optional: set up plotting

In [5]:
# the parts of the plot
# chainplt visualizes the network `t`; lossplt and accuracyplt are traces that the
# tracer callback appends to; hmplt starts from random 28x28 placeholder data that the
# tracer callback overwrites
# NOTE(review): maxn=100 presumably caps how many nodes are drawn per layer — confirm
chainplt = ChainPlot(t, maxn=100)
lossplt = TracePlot(title="Test Loss", ylim=(0,Inf))
accuracyplt = TracePlot(title="Accuracy", ylim=(0.6,1))
hmplt = heatmap(rand(28,28), ratio=1)

# put together the full plot... a ChainPlot with loss, accuracy, and the heatmap
# layout: the chain plot on top, then a 1x3 row (loss, accuracy, heatmap) taking 20% of
# the height
plot(
    chainplt.plt,
    lossplt.plt,
    accuracyplt.plt,
    hmplt,
    size = (1200,800),
    layout=@layout([a; grid(1,3){0.2h}])
)

# animation toggle: when setting doanim to true, also uncomment the line below,
# because the callback uses `anim` whenever doanim is set
doanim = false
# anim = Animation()

# this is our custom callback, which will be called every 100 iterations
# note: we do the plotting here.
# it receives the objective and the iteration number; the plots and the network `t` are
# read from the enclosing (global) scope
tracer = IterFunction((obj, i) -> begin
    # sample points from the test set and compute/save the loss
    @show i
    # true when i is a multiple of 500 (including i == 0), so the test loss is
    # evaluated on only one in five invocations of this callback
    if mod1(i,500)==500
        # evaluate on 200 randomly chosen test observations
        totloss, accuracy = my_test_loss(obj, test, 200)
        @show totloss, accuracy
        push!(lossplt, i, totloss)
        push!(accuracyplt, i, accuracy)
    end

    # add transformation data
    update!(chainplt)

    # update the heatmap of the total outgoing weight from each pixel
    # (sums the first parameter view of the first layer along dimension 1 and lays the
    # result out as a 28x28 image)
    pixel_importance = reshape(sum(t[1].params.views[1],1), 28, 28)
    # pixel_importance = reshape(abs(input_grad(t)),28,28)  # another possible metric
    # overwrite the heatmap's data in place rather than building a new plot
    hmplt[1][1][:z].surf[:] = pixel_importance

    # handle animation frames/output
    if doanim
        # record a frame on each call before iteration 5000, then write the gif once
        lastframe = 5000
        if i < lastframe
            frame(anim)
        elseif i == lastframe
            gif(anim, fps=10)
        end
    end

    # display the plot
    gui()
end, every=100)

# trace once before we start learning to see initial values
# (calls the wrapped function directly with iteration number 0)
tracer.f(obj, 0)

i = 0
(totloss,accuracy) = (553.208203251883,0.085)


# Create a MetaLearner

In [6]:
# combine the parameter updater, the tracing callback, and the stopping rule into a
# single learner that is passed to learn!
learner = make_learner(
    # averages the gradient over minibatches, updating params using the Adam method
    # (1e-3 is the fixed learning rate)
    GradientLearner(1e-3, Adam()),

    # our custom iteration method
    tracer,

    # shorthand to add a MaxIter(10000)
    maxiter = 10000
)

StochasticOptimization.MetaLearner{Tuple{StochasticOptimization.GradientLearner{StochasticOptimization.FixedLR,StochasticOptimization.Adam{Float64},StochasticOptimization.GradientAverager},StochasticOptimization.IterFunction,StochasticOptimization.MaxIter}}((StochasticOptimization.GradientLearner{StochasticOptimization.FixedLR,StochasticOptimization.Adam{Float64},StochasticOptimization.GradientAverager}(StochasticOptimization.FixedLR(0.001),StochasticOptimization.Adam{Float64}(1.0e-8,0.9,0.999,#undef,#undef,#undef,#undef),StochasticOptimization.GradientAverager(#undef)),StochasticOptimization.IterFunction(#3,100),StochasticOptimization.MaxIter(10000)))

# Learn!

In [7]:
# do the learning... average over minibatches of size 5 for maxiter iterations
# NOTE(review): infinite_batches presumably never runs out of batches, so the run ends
# only through the learner's MaxIter — confirm
learn!(obj, learner, infinite_batches(train, size=5))

i = 100
i = 200
i = 300
i = 400
i = 500
(totloss,accuracy) = (61.21660254169188,0.885)
i = 600
i = 700
i = 800
i = 900
i = 1000
(totloss,accuracy) = (45.61347885553403,0.94)
i = 1100
i = 1200
i = 1300
i = 1400
i = 1500
(totloss,accuracy) = (67.33544983904031,0.91)
i = 1600
i = 1700
i = 1800
i = 1900
i = 2000
(totloss,accuracy) = (50.41961108075616,0.93)
i = 2100
i = 2200
i = 2300
i = 2400
i = 2500
(totloss,accuracy) = (41.661604603977565,0.93)
i = 2600
i = 2700
i = 2800
i = 2900
i = 3000
(totloss,accuracy) = (45.66251742528191,0.92)
i = 3100
i = 3200
i = 3300
i = 3400
i = 3500
(totloss,accuracy) = (65.63228608079748,0.905)
i = 3600
i = 3700
i = 3800
i = 3900
i = 4000
(totloss,accuracy) = (34.78429839828591,0.94)
i = 4100
i = 4200
i = 4300
i = 4400
i = 4500
(totloss,accuracy) = (49.777691320751586,0.955)
i = 4600
i = 4700
i = 4800
i = 4900
i = 5000
(totloss,accuracy) = (48.28631176577039,0.945)
i = 5100
i = 5200
i = 5300
i = 5400
i = 5500
(totloss,accuracy) = (92.717103022676,0.945)
i =

In [8]:
# save an image of the training output
# NOTE(review): this writes the current plot; Plots presumably appends ".png" to the
# given path — confirm the resulting file name
png("/tmp/tmp")