In [1]:
#Exploring multiclass neural network

using Flux

"""
    nn_multiclass_classification(X, Y, n_classes; numiters=40)

Train a two-layer dense network (one hidden relu layer of width `2 * size(X, 2)`)
with plain gradient descent and return the trained model.

# Arguments
- `X`: feature matrix, one sample per row; `size(X, 2)` is the input width.
- `Y`: target matrix, one row per sample, paired row-by-row with `X`
  (the notebook passes transposed one-hot labels).
- `n_classes`: width of the output layer.

# Keywords
- `numiters`: number of passes over the data (epochs).

# Returns
The trained `Chain`. A sample is classified as `argmax(model(x))`.
"""
function nn_multiclass_classification(X, Y, n_classes; numiters=40)
    d = size(X, 2)

    # Feel free to play with this model: add layers, change layer sizes,
    # change activations, etc.
    # NOTE(review): `logitcrossentropy` applies a softmax itself and is meant for
    # raw scores, while the sigmoid below squashes the outputs into (0, 1) first.
    # The activation is kept because the results recorded in this notebook were
    # produced with it; consider `identity` for the output layer.
    model = Chain(
        Dense(d, 2 * d, relu),
        Dense(2 * d, n_classes, sigmoid),
    )
    # The model outputs n_classes scores; the highest one is the classification.

    # One (features, target) pair per sample.
    data = zip(eachrow(X), eachrow(Y))

    # Qualified with `Flux.` so the function does not depend on a separate
    # `using Flux: logitcrossentropy` having been run first.
    loss(x, y) = Flux.logitcrossentropy(model(x), y)

    # Gradient descent optimiser with learning rate 0.5
    optimiser = Descent(0.5)

    # Callback: print the loss summed over all samples every 128 calls.
    # A `Ref` is used so the closure mutates a cell instead of rebinding a
    # captured variable.
    ctr = Ref(0)
    function callback()
        if ctr[] % 128 == 0
            println("Loss: $(sum(loss(x, y) for (x, y) in data))")
        end
        ctr[] += 1
        return nothing
    end

    println("Starting training.")
    Flux.@epochs numiters Flux.train!(
        loss, Flux.params(model), data, optimiser; cb=callback
    )

    return model
end

nn_multiclass_classification (generic function with 1 method)

In [2]:
# Form: y_hat = g^2(g^1(x))
# There are 2 layers.
# Layer 1: 16 inputs, 32 outputs, relu activation
# A1: 32×16, b1: 32×1
# Output layer: 32 inputs, 7 outputs, sigmoid activation
# A2: 7×32, b2: 7×1
# There are 544 + 231 = 775 scalar parameters.

In [2]:
using Random
# Seed the global RNG before the shuffle below.
Random.seed!(0)

# Read the dataset from zoo.json with the reader defined in readclassjson.jl.
include("readclassjson.jl")
data = readclassjson("zoo.json")
X, y = data["X"], data["y"]

# Append the labels as the last column, then shuffle the rows so that features
# and labels stay aligned.
df = hcat(X, y)[shuffle(1:end), :]

101×17 Matrix{Float64}:
 1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  …  0.0  0.0  4.0  1.0  0.0  1.0  1.0
 0.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0     0.0  0.0  6.0  0.0  0.0  0.0  6.0
 1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0     0.0  0.0  4.0  1.0  0.0  1.0  1.0
 0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0     0.0  1.0  0.0  1.0  0.0  1.0  4.0
 1.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0     0.0  0.0  4.0  1.0  0.0  1.0  1.0
 0.0  1.0  1.0  0.0  0.0  0.0  0.0  0.0  …  0.0  0.0  2.0  1.0  0.0  1.0  2.0
 1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0     0.0  0.0  2.0  1.0  0.0  0.0  1.0
 0.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0     0.0  0.0  4.0  0.0  0.0  0.0  7.0
 0.0  0.0  1.0  0.0  0.0  1.0  0.0  1.0     0.0  1.0  0.0  1.0  1.0  0.0  4.0
 0.0  1.0  1.0  0.0  1.0  0.0  0.0  0.0     0.0  0.0  2.0  1.0  1.0  0.0  2.0
 0.0  0.0  1.0  0.0  0.0  1.0  1.0  0.0  …  0.0  0.0  8.0  0.0  0.0  1.0  7.0
 0.0  1.0  1.0  0.0  1.0  1.0  0.0  0.0     0.0  0.0  2.0  1.0  0.0  0.0  2.0
 0.0  0.0  1.0  0.0  0.0  1.0  1.0  1.0 

In [4]:
# Rows 1-90 are the training split, rows 91-101 the evaluation split.
# Columns 1-16 hold the features, column 17 the class label.
train_rows, eval_rows = 1:90, 91:101
feature_cols, label_col = 1:16, 17

X_train = df[train_rows, feature_cols]
y_train = df[train_rows, label_col]

X_eval = df[eval_rows, feature_cols]
y_eval = df[eval_rows, label_col]

11-element Vector{Float64}:
 1.0
 6.0
 1.0
 6.0
 4.0
 1.0
 5.0
 1.0
 7.0
 1.0
 1.0

In [5]:
using Flux: onehotbatch
using LinearAlgebra
# One-hot encode the labels against classes 1:7, then transpose so that each
# row corresponds to one sample.
oh_y_train = onehotbatch(y_train, 1:7) |> transpose
oh_y_eval = onehotbatch(y_eval, 1:7) |> transpose

11×7 transpose(OneHotMatrix(::Vector{UInt32})) with eltype Bool:
 1  0  0  0  0  0  0
 0  0  0  0  0  1  0
 1  0  0  0  0  0  0
 0  0  0  0  0  1  0
 0  0  0  1  0  0  0
 1  0  0  0  0  0  0
 0  0  0  0  1  0  0
 1  0  0  0  0  0  0
 0  0  0  0  0  0  1
 1  0  0  0  0  0  0
 1  0  0  0  0  0  0

In [6]:
using Flux: logitcrossentropy
# Train the baseline network on the training split (7 classes, default 40 epochs).
model = nn_multiclass_classification(X_train, oh_y_train, 7; numiters=40)

Starting training.
Loss: 157.34704780662432
Loss: 131.95798971235558
Loss: 117.67018785595616
Loss: 116.64583801719067
Loss: 113.40637195228992
Loss: 111.62733317936026
Loss: 111.25739498842673
Loss: 111.36203003174218
Loss: 111.21900343589202
Loss: 112.16770168939797
Loss: 111.00145423187939
Loss: 110.32805390714424
Loss: 110.08663963510052
Loss: 109.99214987976954
Loss: 109.73728415643173
Loss: 109.9245545731285
Loss: 110.29523810242743
Loss: 109.74570828643144
Loss: 109.4251290667666
Loss: 109.80636662213217
Loss: 109.40812732142328
Loss: 109.50122072624521
Loss: 109.22094764315032
Loss: 109.35621126571516
Loss: 110.5378414308074
Loss: 109.91501869149582
Loss: 109.30002761012895
Loss: 109.14050597017932
Loss: 109.1337480950086


┌ Info: Epoch 1
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 2
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 3
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 4
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 5
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 6
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 7
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 8
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 9
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 10
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 11
└ @ Main C:\Users\surface\.julia

Chain(
  Dense(16 => 32, relu),                [90m# 544 parameters[39m
  Dense(32 => 7, σ),                    [90m# 231 parameters[39m
) [90m                  # Total: 4 arrays, [39m775 parameters, 3.277 KiB.

In [10]:
"""
    confusionMatrix(y_hat, y; n_classes=7)

Build an `n_classes × n_classes` confusion matrix of counts.

`y_hat` holds `n_classes` scores per sample, stored back to back in the same
order as `y`; the predicted class of a sample is the index of its highest score.
`y` holds the true class labels as numbers in `1:n_classes`.

Entry `[p, t]` counts the samples predicted as class `p` whose true class is `t`.

# Throws
- `DimensionMismatch` if `y_hat` does not hold exactly `n_classes` scores per label.
"""
function confusionMatrix(y_hat, y; n_classes=7)
    n_samples = size(y, 1)
    expected = n_classes * n_samples
    if length(y_hat) != expected
        throw(DimensionMismatch(
            "expected $expected scores ($n_classes per label), got $(length(y_hat))",
        ))
    end

    cm = zeros(n_classes, n_classes)
    for i in 1:n_samples
        # Scores of sample i occupy one contiguous chunk of n_classes entries.
        scores = @view y_hat[(n_classes * (i - 1) + 1):(n_classes * i)]
        cm[argmax(scores), Int(y[i])] += 1
    end
    return cm
end

"""
    accuracy(cm, y)

Return the fraction of correctly classified samples: the sum of the diagonal of
the confusion matrix `cm` divided by the number of labels, `size(y, 1)`.
"""
function accuracy(cm, y)
    # Starting from zero(eltype(cm)) keeps the accumulator's type fixed and lets
    # an empty matrix sum to zero.
    n_correct = sum(i -> cm[i, i], axes(cm, 1); init=zero(eltype(cm)))
    return n_correct / size(y, 1)
end

"""
    predictall(model, U)

Apply `model` to every row of `U` and concatenate the outputs with `vcat`, in
row order.
"""
function predictall(model, U)
    outputs = [model(x) for x in eachrow(U)]
    # `reduce(vcat, ...)` avoids splatting one argument per row; `vcat()` keeps
    # the empty result the splatted call produced when `U` has no rows.
    return isempty(outputs) ? vcat() : reduce(vcat, outputs)
end

# Confusion matrices of the baseline model on the training and evaluation splits.
cm_train = confusionMatrix(predictall(model, X_train), y_train)
cm_eval = confusionMatrix(predictall(model, X_eval), y_eval)

foreach(display, (cm_train, cm_eval))

# Accuracy on each split, training first.
for (cm, labels) in ((cm_train, y_train), (cm_eval, y_eval))
    println(accuracy(cm, labels))
end

7×7 Matrix{Float64}:
 35.0   0.0  0.0   0.0  1.0  0.0  0.0
  0.0  20.0  1.0   0.0  0.0  0.0  0.0
  0.0   0.0  4.0   0.0  1.0  0.0  0.0
  0.0   0.0  0.0  12.0  0.0  0.0  0.0
  0.0   0.0  0.0   0.0  0.0  0.0  0.0
  0.0   0.0  0.0   0.0  0.0  0.0  0.0
  0.0   0.0  0.0   0.0  1.0  6.0  9.0

7×7 Matrix{Float64}:
 6.0  0.0  0.0  0.0  1.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  1.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  0.0  0.0
 0.0  0.0  0.0  0.0  0.0  2.0  1.0

0.8888888888888888
0.7272727272727273


In [11]:
"""
    nn_multiclass_classification_more(X, Y, n_classes; numiters=40)

Train a three-layer dense network (two hidden relu layers of width
`2 * size(X, 2)`) with plain gradient descent and return the trained model.

# Arguments
- `X`: feature matrix, one sample per row; `size(X, 2)` is the input width.
- `Y`: target matrix, one row per sample, paired row-by-row with `X`
  (the notebook passes transposed one-hot labels).
- `n_classes`: width of the output layer.

# Keywords
- `numiters`: number of passes over the data (epochs).

# Returns
The trained `Chain`. A sample is classified as `argmax(model(x))`.
"""
function nn_multiclass_classification_more(X, Y, n_classes; numiters=40)
    d = size(X, 2)

    # Feel free to play with this model: add layers, change layer sizes,
    # change activations, etc.
    # NOTE(review): `logitcrossentropy` applies a softmax itself and is meant for
    # raw scores, while the sigmoid below squashes the outputs into (0, 1) first.
    # The activation is kept because the results recorded in this notebook were
    # produced with it; consider `identity` for the output layer.
    model = Chain(
        Dense(d, 2 * d, relu),
        Dense(2 * d, 2 * d, relu),
        Dense(2 * d, n_classes, sigmoid),
    )
    # The model outputs n_classes scores; the highest one is the classification.

    # One (features, target) pair per sample.
    data = zip(eachrow(X), eachrow(Y))

    # Qualified with `Flux.` so the function does not depend on a separate
    # `using Flux: logitcrossentropy` having been run first.
    loss(x, y) = Flux.logitcrossentropy(model(x), y)

    # Gradient descent optimiser with learning rate 0.5
    optimiser = Descent(0.5)

    # Callback: print the loss summed over all samples every 128 calls.
    # A `Ref` is used so the closure mutates a cell instead of rebinding a
    # captured variable.
    ctr = Ref(0)
    function callback()
        if ctr[] % 128 == 0
            println("Loss: $(sum(loss(x, y) for (x, y) in data))")
        end
        ctr[] += 1
        return nothing
    end

    println("Starting training.")
    Flux.@epochs numiters Flux.train!(
        loss, Flux.params(model), data, optimiser; cb=callback
    )

    return model
end

nn_multiclass_classification_more (generic function with 1 method)

In [12]:
# Train the deeper network on the training split (7 classes, default 40 epochs).
model_2 = nn_multiclass_classification_more(X_train, oh_y_train, 7; numiters=40)

Starting training.
Loss: 162.0292399847118
Loss: 143.07743889834362
Loss: 150.70040312492304
Loss: 144.90206905109062
Loss: 137.18864256833055
Loss: 122.65846649747076
Loss: 132.74232749815883
Loss: 119.25879254511878
Loss: 118.91526613088192
Loss: 119.1094269726816
Loss: 118.94332360627754
Loss: 118.31977822896822
Loss: 118.92100836999421
Loss: 115.24382687183659
Loss: 118.1918881884068
Loss: 118.18301707707121
Loss: 118.17568720758449
Loss: 118.17230253049365
Loss: 118.16940328212686
Loss: 118.16844068731885
Loss: 118.16594125105667
Loss: 118.16416670997579
Loss: 118.16311671133131
Loss: 118.1606110032803
Loss: 118.1596203205116
Loss: 118.15883988861056
Loss: 118.15120963715523
Loss: 118.15137404127078
Loss: 118.1630017456022


┌ Info: Epoch 1
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 2
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 3
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 4
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 5
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 6
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 7
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 8
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 9
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 10
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 11
└ @ Main C:\Users\surface\.julia

Chain(
  Dense(16 => 32, relu),                [90m# 544 parameters[39m
  Dense(32 => 32, relu),                [90m# 1_056 parameters[39m
  Dense(32 => 7, σ),                    [90m# 231 parameters[39m
) [90m                  # Total: 6 arrays, [39m1_831 parameters, 7.527 KiB.

In [13]:
# Confusion matrices of the deeper model on the training and evaluation splits.
cm_train_2 = confusionMatrix(predictall(model_2, X_train), y_train)
cm_eval_2 = confusionMatrix(predictall(model_2, X_eval), y_eval)

# Accuracy on each split, training first.
for (cm, labels) in ((cm_train_2, y_train), (cm_eval_2, y_eval))
    println(accuracy(cm, labels))
end

0.8555555555555555
0.7272727272727273


In [19]:
# I added a hidden layer to the neural network: Dense(2*d, 2*d, relu).
# The training accuracy gets worse (0.889 -> 0.856), but the test accuracy
# stays the same (0.727).
# This might be because we have too many parameters and not enough data,
# so the model cannot learn as well.

In [14]:
"""
    nn_multiclass_classification_less(X, Y, n_classes; numiters=40)

Train a single dense layer (no hidden layer) with plain gradient descent and
return the trained model.

# Arguments
- `X`: feature matrix, one sample per row; `size(X, 2)` is the input width.
- `Y`: target matrix, one row per sample, paired row-by-row with `X`
  (the notebook passes transposed one-hot labels).
- `n_classes`: width of the output layer.

# Keywords
- `numiters`: number of passes over the data (epochs).

# Returns
The trained `Chain`. A sample is classified as `argmax(model(x))`.
"""
function nn_multiclass_classification_less(X, Y, n_classes; numiters=40)
    d = size(X, 2)

    # Feel free to play with this model: add layers, change layer sizes,
    # change activations, etc.
    # NOTE(review): `logitcrossentropy` applies a softmax itself and is meant for
    # raw scores, while the sigmoid below squashes the outputs into (0, 1) first.
    # The activation is kept because the results recorded in this notebook were
    # produced with it; consider `identity` for the output layer.
    model = Chain(
        Dense(d, n_classes, sigmoid),
    )
    # The model outputs n_classes scores; the highest one is the classification.

    # One (features, target) pair per sample.
    data = zip(eachrow(X), eachrow(Y))

    # Qualified with `Flux.` so the function does not depend on a separate
    # `using Flux: logitcrossentropy` having been run first.
    loss(x, y) = Flux.logitcrossentropy(model(x), y)

    # Gradient descent optimiser with learning rate 0.5
    optimiser = Descent(0.5)

    # Callback: print the loss summed over all samples every 128 calls.
    # A `Ref` is used so the closure mutates a cell instead of rebinding a
    # captured variable.
    ctr = Ref(0)
    function callback()
        if ctr[] % 128 == 0
            println("Loss: $(sum(loss(x, y) for (x, y) in data))")
        end
        ctr[] += 1
        return nothing
    end

    println("Starting training.")
    Flux.@epochs numiters Flux.train!(
        loss, Flux.params(model), data, optimiser; cb=callback
    )

    return model
end

nn_multiclass_classification_less (generic function with 1 method)

In [15]:
# Train the single-layer network on the training split (7 classes, default 40 epochs).
model_3 = nn_multiclass_classification_less(X_train, oh_y_train, 7; numiters=40)

Starting training.
Loss: 168.99467729630896
Loss: 129.9828274417174
Loss: 123.94971038792046
Loss: 121.35492404357876
Loss: 121.07311396076857
Loss: 119.42729731934124
Loss: 118.4225577849057
Loss: 118.36988161533715
Loss: 117.64579090361329
Loss: 117.80650562338886
Loss: 117.15258458267427
Loss: 116.66122687439197
Loss: 116.63354367403883
Loss: 116.34522575589921
Loss: 116.33707216037855
Loss: 115.99467273393634
Loss: 116.34059431065242
Loss: 115.82786614479862
Loss: 115.62444630517587
Loss: 115.59731614546223
Loss: 115.57808958593718
Loss: 115.69612621721606
Loss: 115.3033128538939
Loss: 115.56367257356777
Loss: 115.31996061399127
Loss: 115.31957185320793
Loss: 115.24072256896909
Loss: 115.13502005566284
Loss: 115.21690500634419


┌ Info: Epoch 1
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 2
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 3
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 4
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 5
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 6
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 7
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 8
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 9
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 10
└ @ Main C:\Users\surface\.julia\packages\Flux\6Q5r4\src\optimise\train.jl:154
┌ Info: Epoch 11
└ @ Main C:\Users\surface\.julia

Chain(
  Dense(16 => 7, σ),                    [90m# 119 parameters[39m
) 

In [16]:
# Confusion matrices of the single-layer model on the training and evaluation splits.
cm_train_3 = confusionMatrix(predictall(model_3, X_train), y_train)
cm_eval_3 = confusionMatrix(predictall(model_3, X_eval), y_eval)

# Accuracy on each split, training first.
for (cm, labels) in ((cm_train_3, y_train), (cm_eval_3, y_eval))
    println(accuracy(cm, labels))
end

0.8777777777777778
0.7272727272727273


In [18]:
# I removed a layer from the neural network: Dense(d, 2*d, relu).
# I rewired the remaining layer to the input: Dense(d, n_classes, sigmoid).
# The training accuracy gets slightly worse (0.889 -> 0.878), but the test
# accuracy stays the same (0.727).
# This might be because we have too few layers and parameters,
# so the model cannot learn the complex features.