In [105]:
using PaddedViews
using LinearAlgebra
using TimerOutputs
using BenchmarkTools
using Profile
to = TimerOutput();
tt = TimerOutput();


In [106]:
abstract type Node end
abstract type Operator <: Node end

mutable struct Variable{N} <: Node
    name::String
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    v₁::Array{Float64, N}
    v₂::Array{Float64, N}
    Variable(N, output; name = "?") = new{N}(name, output, zeros(size(output)), zeros(size(output)), zeros(size(output)))
end

mutable struct NodeOperator{F, N} <: Operator
    name::String
    inputs::Vector{Node}
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    NodeOperator(fun, inputs...; name = "?", shape=(1,1,1)) = new{typeof(fun), length(shape)}(name, [inputs...], zeros(shape), zeros(shape))
end

mutable struct DenseOperator{F, N} <: Operator
    name::String
    inputs::Vector{Node}
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    DenseOperator(fun, inputs...; name = "?", shape=(1,1,1)) = new{typeof(fun), length(shape)}(name, [inputs...], zeros(shape), zeros(shape))
end

mutable struct CrossEntropyOperator{F, N} <: Operator
    name::String
    inputs::Vector{Node}
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    CrossEntropyOperator(fun, inputs...; name = "?", shape=(1,1,1)) = new{typeof(fun), length(shape)}(name, [inputs...], zeros(shape), zeros(shape))
end

mutable struct SoftMaxOperator{F, N} <: Operator
    name::String
    inputs::Vector{Node}
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    J::Array{Float64, 2}
    SoftMaxOperator(fun, inputs...; name = "?", shape=(1,1,1)) = new{typeof(fun), length(shape)}(name, [inputs...], zeros(shape), zeros(shape),zeros(shape,shape))
end

mutable struct RNNOperator{F, N} <: Operator
    name::String
    h::Array{Float64,2}
    inputs::Vector{Node}
    output::Array{Float64, N}
    gradient::Array{Float64, N}
    gradienth::Array{Float64, 2}
    gradientW::Array{Float64, 2}
    gradientU::Array{Float64, 2}
    gradientB::Array{Float64, 1}
    RNNOperator(fun, h, inputs...; name = "?", shape=(1,1,1)) = new{typeof(fun), length(shape)}(name,h, [inputs...], zeros(shape), zeros(shape),zeros(size(h)[1],layerNumber),zeros(size(inputs[2].output)),zeros(size(inputs[3].output)),zeros(size(inputs[4].output)))
end

In [107]:

function visit(node::Node, visited::Set, order::Vector)
    if node ∉ visited
        push!(visited, node)
        push!(order, node)
    end
end

function visit(node::Operator, visited::Set, order::Vector)
    if node ∉ visited
        for input in node.inputs
            visit(input, visited, order)
        end
        push!(visited, node)
        push!(order, node)
    end
end


function create_graph(root::Node)
    visited = Set{Node}()
    order = Vector{Node}()
    visit(root, visited, order)
    return order
end

create_graph (generic function with 1 method)

In [108]:
import Base: show, summary
show(io::IO, x::NodeOperator{F}) where {F} = print(io, "op ", "(", F, ")");
show(io::IO, x::RNNOperator{F}) where {F} = print(io, "op ", "(", F, ")");
show(io::IO, x::SoftMaxOperator{F}) where {F} = print(io, "op ", "(", F, ")");
show(io::IO, x::CrossEntropyOperator{F}) where {F} = print(io, "op ", "(", F, ")");
show(io::IO, x::DenseOperator{F}) where {F} = print(io, "op ", "(", F, ")");
show(io::IO, x::Variable) = begin
    print(io, "var ", x.name);
    print(io, "\n ┣━ ^ "); summary(io, x.output)
    print(io, "\n ┗━ ∇ ");  summary(io, x.gradient)
end

show (generic function with 605 methods)

In [109]:
zero_gradient!(node::Node) = fill!(node.gradient, 0)
function zero_gradient!(order::Vector{Node})
    for node in order
        zero_gradient!(node)
    end
end

zero_gradient! (generic function with 2 methods)

In [110]:
struct RNNParams
    W::Variable{2}
    U::Variable{2}
    b::Variable{1}
end

struct DenseParams
    weights::Variable{2}
    bias::Variable{1}
end
struct NetworkParams
    rnn::RNNParams
    dense::DenseParams
end

In [111]:
recurent_layer(x::Node, h, W::Node,U::Node, b::Node, shape) = RNNOperator(recurent_layer,h,name="rnn",shape=shape,x,W,U,b)
@inbounds @views @timeit to "recurent forward" forward(node::RNNOperator{typeof(recurent_layer)}, x, W, U, b) = let
   fill!(node.h[:,1],0)
   mul!(node.gradientB,W, node.h[:,1])
   mul!(node.gradient, U,  x[1:recsize])
      node.h[:,1] .= tanh.(node.gradientB.+node.gradient.+b)   
       for t in range(2,layerNumber)
        mul!(node.gradientB,W, node.h[:,t-1])
       mul!(node.gradient, U,  x[(t.-1).*(recsize).+1:(t).*(recsize)])
       node.h[:,t] .= tanh.(node.gradientB.+node.gradient.+b)   
    end
    return node.h[:,layerNumber]
end

@inbounds @views @timeit to "recurent backward" backward(node::RNNOperator{typeof(recurent_layer)}, x::Array{Float64,1}, W::Array{Float64,2}, U::Array{Float64,2}, b::Array{Float64,1}, g::Array{Float64,1}) = let
   
   node.gradienth[:,layerNumber] .= node.gradient
   t = layerNumber-1
    while t >=1
     mul!(node.gradienth[:,t],W', node.gradienth[:,t.+1])
     node.gradienth[:,t] .= node.gradienth[:,t] .* (1 .-  node.h[:,t+1].^2)
    t-=1
 end
 t = layerNumber
  while t>=1
    @views xc = x[(t-1).*(recsize).+1:(t).*(recsize)]
    node.gradientB .=  (1 .- node.h[:,t].^2) .* node.gradienth[:,t];
     mul!(node.gradientU,node.gradientB , xc')
      node.inputs[3].gradient .+=  node.gradientU
    if t !=1
        mul!(node.gradientW , node.gradientB , node.h[:,t-1]')
        node.inputs[2].gradient .+= node.gradientW
    end
    t-=1
 end
    node.inputs[4].gradient .= node.gradientB
end 




backward (generic function with 4 methods)

In [112]:
dense_layer(x::Node, w::Node, b::Node, shape) = DenseOperator(dense_layer, name="dense", shape=shape, x, w, b)

@views @timeit to "dense forward" forward(node::DenseOperator{typeof(dense_layer)}, x, w, b) = let
    mul!(node.output,w,x)
    node.output .+= b
    return node.output
end

@inbounds @views @timeit to "dense backward" backward(node::DenseOperator{typeof(dense_layer)}, x, w, b, g) = let
    mul!(node.inputs[1].gradient , w' , g)
    mul!(node.inputs[2].gradient , g , x')
    node.inputs[3].gradient .= g
end

backward (generic function with 4 methods)

In [113]:
softmax(x::Node, shape) = SoftMaxOperator(softmax, name="softmax", shape=shape, x)

@views @timeit to "softmax forward" forward(node::SoftMaxOperator{typeof(softmax)}, x::Vector{Float64}) = let
     node.gradient .= exp.(x)
     return node.gradient ./ sum(node.gradient::Vector{Float64})
end

@inbounds @views @timeit to "softmax backward" backward(node::SoftMaxOperator{typeof(softmax)}, x::Vector{Float64}, g) = let
   y = node.output
   mul!(node.J, .-y, y')
   node.J[1:11:100] .+= y #prev diagind(node.J)
   mul!(node.inputs[1].gradient , node.J , g)
end

backward (generic function with 4 methods)

In [114]:
cross_entropy_loss(ŷ::Node, y::Node, shape) = CrossEntropyOperator(cross_entropy_loss, name="cross_entropy_loss", shape=shape,  ŷ, y)

@timeit to "cross_entropy_loss forward" forward(::CrossEntropyOperator{typeof(cross_entropy_loss)}, ŷ, y) = let
    return sum((ŷ.-y) .^ 2 ./ 10)
end

@timeit to "cross_entropy_loss backward" backward(node::CrossEntropyOperator{typeof(cross_entropy_loss)}, ŷ, y, g) = let
    node.inputs[1].gradient .= (ŷ.-y)./5
    node.inputs[2].gradient .=[0.0]
end
     

backward (generic function with 4 methods)

In [115]:
#zero_gradient!(node::Node) = fill!(node.gradient, 0)

compute!(node::Variable) = nothing
#compute!(node::Operator) = node.output .= forward(node, [input.output for input in node.inputs]...)
@views compute!(node::RNNOperator{typeof(recurent_layer)}) =  node.output .= forward(node, node.inputs[1].output,node.inputs[2].output,node.inputs[3].output,node.inputs[4].output)
@views compute!(node::SoftMaxOperator{typeof(softmax)}) = node.output .= forward(node, node.inputs[1].output)
@views compute!(node::DenseOperator{typeof(dense_layer)}) =  node.output .= forward(node, node.inputs[1].output,node.inputs[2].output,node.inputs[3].output)
@views compute!(node::CrossEntropyOperator{typeof(cross_entropy_loss)}) =  node.output .= forward(node, node.inputs[1].output,node.inputs[2].output)

@inbounds function forward!(order::Vector{Node})::Float64
    for node in order
        compute!(node)
    end
    
    return last(order).output[1]
end   

forward! (generic function with 1 method)

In [116]:
update!(node::Node, gradient) =  node.gradient .+= gradient


function backward!(order::Vector{Node})
   last(order).gradient = [1.0]
    
     @inbounds for node in Iterators.reverse(order)
        backward!(node)
    end
end

backward!(node::Variable) = nothing

function backward!(node::RNNOperator{typeof(recurent_layer)})
     backward(node, node.inputs[1].output,node.inputs[2].output,node.inputs[3].output,node.inputs[4].output, node.gradient)
end
function backward!(node::SoftMaxOperator{typeof(softmax)})
     backward(node, node.inputs[1].output, node.gradient)
end
function backward!(node::DenseOperator{typeof(dense_layer)})
     backward(node, node.inputs[1].output,node.inputs[2].output,node.inputs[3].output, node.gradient)
end
function backward!(node::CrossEntropyOperator{typeof(cross_entropy_loss)})
     backward(node, node.inputs[1].output,node.inputs[2].output, node.gradient)
end
#function backward!(node::Operator)
#    backward(node, [input.output for input in node.inputs]..., node.gradient)
#end


backward! (generic function with 6 methods)

In [117]:
function create_network(x::Variable{1}, y::Variable{1}, params::NetworkParams)
    x₁ = recurent_layer(x,zeros(64,layerNumber),params.rnn.W,params.rnn.U,params.rnn.b,(64))
    ŷ  = dense_layer(x₁, params.dense.weights, params.dense.bias,(10))
    loss = cross_entropy_loss(ŷ, y, (1))
    return create_graph(loss)
end

create_network (generic function with 1 method)

In [118]:
function he_weights_init(prev, shape...)
    std = sqrt(2.0./prev)
    weights = rand(Float64, shape) .*2 .-1
    return weights .* std
end

he_weights_init (generic function with 1 method)

In [119]:

mutable struct Adam
    α::Float64
    ε::Float64
    m₁::Float64
    m₂::Float64
    k::Int64
    
    Adam(α=0.001, m₁=0.9, m₂=0.999, ε=1e-8) = new(α, ε, m₁, m₂, 1)
end

In [120]:
function update_weights!(graph::Vector{Node}, M::Adam)
     for node in graph
        if (typeof(node) == Variable{1} || typeof(node) == Variable{2}) && (node.name !="x"&&node.name!="y")
            update_weights_N!(node, M)
        end
    end
    M.k += 1
end



function update_weights_N!(node::Variable{T}, M::Adam) where T
    g =  node.gradient
    v₁ =  node.v₁
    v₂ =  node.v₂
    m₁, m₂, k, α, ε = M.m₁, M.m₂, M.k, M.α, M.ε
    v₁ .= @. m₁ * v₁ .+ (1.0 .- m₁) .* g
    v₂ .= @.  m₂ * v₂ .+ (1.0 .-  m₂) .* (g .* g)
     mt₁ = m₁.^k
    mt₂ = m₂.^k
    α₁ = α .* sqrt.(1 - mt₂)/(1-mt₁)
    node.output .-= @. α₁*v₁ ./ (sqrt.(v₂) + ε)
end

update_weights_N! (generic function with 1 method)

In [121]:
@inbounds @views function validate(x, y, graph, x_data,y_data)::Float64
    correct = 0
    length = size(y_data)[2]
    for i in range(1,length)
        x.output .= x_data[:,i]
        y.output .= y_data[:,i]
        forward!(graph)
        pred = argmax(graph[8].output)
        if 1 == y_data[:,i][pred]
               correct += 1
        end
    end
    
    acc_val = correct./size(y_data)[2]
    
    return acc_val
end

validate (generic function with 1 method)

In [122]:
const recsize = 14*14

recurent = RNNParams(
    Variable(2, he_weights_init(64, 64, 64), name="W1"),
    Variable(2, he_weights_init(64, 64, recsize), name="U1"),
    Variable(1, zeros(64), name="b1")
)
dense = DenseParams(
    Variable(2, he_weights_init(64, 10, 64), name="w2"),
    Variable(1, zeros(10), name="b2")
)
networkparams = NetworkParams(recurent,dense)

NetworkParams(RNNParams(var W1
 ┣━ ^ 64×64 Matrix{Float64}
 ┗━ ∇ 64×64 Matrix{Float64}, var U1
 ┣━ ^ 64×196 Matrix{Float64}
 ┗━ ∇ 64×196 Matrix{Float64}, var b1
 ┣━ ^ 64-element Vector{Float64}
 ┗━ ∇ 64-element Vector{Float64}), DenseParams(var w2
 ┣━ ^ 10×64 Matrix{Float64}
 ┗━ ∇ 10×64 Matrix{Float64}, var b2
 ┣━ ^ 10-element Vector{Float64}
 ┗━ ∇ 10-element Vector{Float64}))

In [123]:
# Opracowane na podstawie https://minpy.readthedocs.io/en/latest/tutorial/rnn_mnist.html
using MLDatasets, Flux
train_data = MLDatasets.MNIST(split=:train)
test_data  = MLDatasets.MNIST(split=:test)

function loader(data)
    x1dim = reshape(data.features, 28 * 28, :) # reshape 28×28 pixels into a vector of pixels
    yhot  = Flux.onehotbatch(data.targets, 0:9) # make a 10×60000 OneHotMatrix
    (x1dim, yhot)
end
(x_data,y_data) = loader(train_data)
(x_test,y_test) = loader(test_data)
x::Variable{1} = Variable(1, x_data[:,1], name="x")
y::Variable{1} = Variable(1, y_data[:,1], name="y")
const layerNumber = floor(Int,size(x_data[:,1])[1]/recsize)
net = create_network(x,y,networkparams)

10-element Vector{Node}:
 var x
 ┣━ ^ 784-element Vector{Float64}
 ┗━ ∇ 784-element Vector{Float64}
 var W1
 ┣━ ^ 64×64 Matrix{Float64}
 ┗━ ∇ 64×64 Matrix{Float64}
 var U1
 ┣━ ^ 64×196 Matrix{Float64}
 ┗━ ∇ 64×196 Matrix{Float64}
 var b1
 ┣━ ^ 64-element Vector{Float64}
 ┗━ ∇ 64-element Vector{Float64}
 op (typeof(recurent_layer))
 var w2
 ┣━ ^ 10×64 Matrix{Float64}
 ┗━ ∇ 10×64 Matrix{Float64}
 var b2
 ┣━ ^ 10-element Vector{Float64}
 ┗━ ∇ 10-element Vector{Float64}
 op (typeof(dense_layer))
 var y
 ┣━ ^ 10-element Vector{Float64}
 ┗━ ∇ 10-element Vector{Float64}
 op (typeof(cross_entropy_loss))

In [124]:
function train_rnn(x_data,y_data,net,x,y)
loss::Float64 = 0.0
pred::UInt8 = 0
epochs = 5
batch_size = 100
losses = zeros(epochs)
acc = zeros(epochs)
test_acc = zeros(epochs)
correct = 0 
adam = Adam()
for epoch in 1:epochs
    loss = 0
    correct = 0
    length = size(y_data)[2]
    @time for i in range(1,length)
         x.output .= @view x_data[:,i]
         y.output .= @view y_data[:,i]
        @timeit to "forward" loss += forward!(net)
        prob::DenseOperator{typeof(dense_layer)} = net[8]
        pred = argmax(prob.output)
        if 1 == y_data[:,i][pred]
            correct += 1
        end
        @timeit to "backward" backward!(net)
        if i % batch_size == 0
        @timeit to "update weights" update_weights!(net, adam)
        zero_gradient!(net)
        end;
    end
    losses[epoch] = loss/length
    acc[epoch] = correct/length
    @timeit to "validate" test_acc[epoch] = validate(x,y,net,x_test,y_test)
    println("Epoch: ", epoch, "\tAverage loss: ", round(losses[epoch], digits=3), "\tAverage acc: ", round(acc[epoch],digits=3),"\tAverage test acc: ",round(test_acc[epoch],digits=3))
end
show(to)
#show(tt)
reset_timer!(to);
end

train_rnn (generic function with 1 method)

In [125]:
 train_rnn(x_data,y_data,net,x,y)

  6.801079 seconds (4.16 M allocations: 155.722 MiB, 0.22% gc time, 15.88% compilation time)
Epoch: 1	Average loss: 0.045	Average acc: 0.813	Average test acc: 0.889
  5.432552 seconds (2.82 M allocations: 64.114 MiB)
Epoch: 2	Average loss: 0.028	Average acc: 0.901	Average test acc: 0.912
  5.472692 seconds (2.82 M allocations: 64.114 MiB)
Epoch: 3	Average loss: 0.024	Average acc: 0.918	Average test acc: 0.918
  5.284816 seconds (2.82 M allocations: 64.114 MiB, 0.54% gc time)
Epoch: 4	Average loss: 0.023	Average acc: 0.922	Average test acc: 0.92
  5.268782 seconds (2.82 M allocations: 64.114 MiB)
Epoch: 5	Average loss: 0.021	Average acc: 0.927	Average test acc: 0.926
[0m[1m ────────────────────────────────────────────────────────────────────────────────[22m
[0m[1m                               [22m         Time                    Allocations      
                               ───────────────────────   ────────────────────────
       Tot / % measured:            31.5s /  90.6%   

[0m[1m ────────────────────────────────────────────────────────────────────[22m
[0m[1m                   [22m         Time                    Allocations      
                   ───────────────────────   ────────────────────────
 Tot / % measured:      343ms /   0.0%           20.3MiB /   0.0%    

 Section   ncalls     time    %tot     avg     alloc    %tot      avg
 ────────────────────────────────────────────────────────────────────
[0m[1m ────────────────────────────────────────────────────────────────────[22m