In [1]:
using Knet
using CSV
using Random
import Base: length

In [2]:
BATCHSIZE = 16
VOCABFILE = "bert-base-uncased-vocab.txt"
NUM_CLASSES = 2

2

In [3]:
token2int = Dict()
f = open(VOCABFILE) do file
    lines = readlines(file)
    for (i,line) in enumerate(lines)
        token2int[line] = i
    end
end
int2token = Dict(value => key for (key, value) in token2int)
VOCABSIZE = length(token2int)

30522

In [None]:
include("preprocess.jl")

In [5]:
# Try any text here
asd = convert_to_int_array("senseless", token2int)
[int2token[i] for i in asd]

2-element Array{String,1}:
 "sense" 
 "##less"

In [6]:
# Our minibatcher
#=
mutable struct SST
    words
    labels
    batchsize
    ninstances
    shuffled
end

function SST(words, labels; batchsize=16, shuffled=false)
    ninstances = length(words)
    return SST(words, labels, batchsize, ninstances, shuffled)
end

function length(d::SST)
    d, r = divrem(d.ninstances, d.batchsize)
    return r == 0 ? d : d+1
end

function Base.iterate(d::SST, state=ifelse(d.shuffled, randperm(d.ninstances), 1:d.ninstances))
    # START ANSWER
    state === nothing && return nothing
    if length(state) > d.batchsize
        new_state = state[d.batchsize+1:end]
        words = d.words[state[1:d.batchsize]]
        labels = d.labels[state[1:d.batchsize]]
    else
        new_state = nothing
        words = d.words[state]
        labels = d.labels[state]
    end
    sorted_words = sort(map(x -> length(x), words))
    max_len = length(words)
    batchsizes = zeros(Int64, sorted_words[end]) # init with max seq length
    for i in sorted_words
        batchsizes[1:i] .+= 1
    end
    # END ANSWER
    return ((words, labels, batchsizes), new_state)
end
=#

In [7]:
#dtrn = SST(read_and_process("mytrain.tsv", token2int)...; batchsize=BATCHSIZE, shuffled=true)
#ddev = SST(read_and_process("dev.tsv", token2int)...; batchsize=BATCHSIZE)
#dtst = SST(read_and_process("mytest.tsv", token2int)...; batchsize=BATCHSIZE)
dtrn = minibatch(read_and_process("mytrain.tsv", token2int)..., BATCHSIZE; shuffle=true) # Mytrain is a subset of train set
ddev = minibatch(read_and_process("dev.tsv", token2int)..., BATCHSIZE)
dtst = minibatch(read_and_process("mytest.tsv", token2int)..., BATCHSIZE) # Mytest is a subset of train set

Knet.Data{Tuple{Array{Array{Int64,1},1},Array{Int8,1}}}(Array{Int64,1}[[3626, 2006, 2024, 27817, 4039, 1, 1, 1, 1, 1  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [2098, 2425, 2038, 8563, 29625, 19764, 6835, 6364, 2191, 2986  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1] … [6570, 3239, 1011, 10042, 2595, 1011, 15704, 1011, 3083, 29625  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1] [1997, 11068, 1998, 1997, 2148, 8848, 6703, 1011, 5379, 2040  …  1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], Int8[2 1 … 1 2], 16, 2000, false, 1985, 1:2000, false, (2000,), (2000,), Array{Array{Int64,1},1}, Array{Int8,1})

In [9]:
summary.(first(dtrn))

("16-element Array{Array{Int64,1},1}", "16-element Array{Int8,1}")

## Model

In [10]:
abstract type Layer end

struct Linear <: Layer
    w
    b
    pdrop
    func
end

function Linear(xsize::Int, ysize::Int; atype=KnetArray{Float32}, winit=xavier, binit=zeros, func=identity, pdrop=0.0)
    Linear(Param(atype(winit(ysize,xsize))), Param(atype(binit(ysize))), pdrop, func)
end

function (l::Linear)(x)
    l.func.(dropout(l.w * mat(x), l.pdrop)) .+ l.b
end

struct Embed <: Layer
    w
end

Embed(vocabsize::Int,embed::Int) = Embed(param(embed,vocabsize))

function (e::Embed)(x)
    e.w[:,x]  # (B,T)->(X,B,T)->rnn->(H,B,T)
end

In [11]:
struct Chain <: Layer
    layers
end

function Chain(all_layers::Array{Layer,1})
    layers = Layer[]
    for layer in all_layers
        push!(layers, layer)
    end
    return Chain(tuple(layers...))
end

function (c::Chain)(x)
    for layer in c.layers
        x = layer(x)
    end
    return x
end

In [12]:
struct Baseline
    pdrop
    rnn::RNN
    embed::Embed
    layer::Layer
end

function Baseline(rnn_hidden::Int, embedsize::Int, vocabsize::Int, extra_layer::Layer; pdrop=0.0,  o...)
    embed = Embed(vocabsize,embedsize)
    rnn = RNN(embedsize,rnn_hidden;h=0,c=0,o...)
    return Baseline(pdrop, rnn, embed, extra_layer)
end

#=
function (b::Baseline)(x, batchsizes)
    x = b.embed.(x)
    x = b.rnn(x, batchSizes=batchsizes)
    x = b.layer(dropout(x[:,:,end], b.pdrop))
    return x
end

function (b::Baseline)(x, y::Array{Int8,1}, batchsizes)
    nll(b(x, batchsizes), y)
end

function (b::Baseline)(d::SST)
    lvals = []
    for (x, y, batchsizes) in d
        push!(lvals, b(x, y, batchsizes))
    end
    Knet.mean(lvals)
end
=#

function (b::Baseline)(x)
    x = permutedims(hcat(x...))
    x = b.embed(x)
    x = b.rnn(x)
    x = b.layer(dropout(x[:,:,end], b.pdrop))
    return x
end

function (b::Baseline)(x, y::Array{Int8,1})
    nll(b(x), y)
end

function (b::Baseline)(d::Knet.Data)
    lvals = []
    for (x, y) in d
        push!(lvals, b(x, y))
    end
    Knet.mean(lvals)
end


In [13]:
function accuracy2(model, d)
    true_count = 0
    all_count = 0
    for (x, y) in d
        true_count += sum(y' .== map(x -> x[1], argmax(Array{Float32}(model(x)),dims=1)))
        all_count += length(y)
    end
    return true_count/all_count
end

accuracy2 (generic function with 1 method)

In [13]:
model = Baseline(256, 128, VOCABSIZE, Chain([Linear(256,512, func=relu), Linear(512,NUM_CLASSES)]))
trnloss = [model(dtrn)]
devloss = [model(ddev)]
best_acc = 0.0
for epoch in 1:100
    println("Epoch : ", epoch)
    progress!(adam(model, dtrn))
    push!(trnloss, model(dtrn))
    push!(devloss, model(ddev))
    acc = accuracy(model, ddev)
    println("Accuracy : ", acc)
    if acc > best_acc
        best_acc = acc
        println("Saving...")
        Knet.save("model.jld2", "model", model)
    end
end

Epoch : 1
6.86e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:34/00:34, 120.59i/s]
Accuracy : 0.5092592592592593
Saving...
Epoch : 2
6.97e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.70i/s]
Accuracy : 0.5092592592592593
Epoch : 3
7.10e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.42i/s]
Accuracy : 0.5092592592592593
Epoch : 4
6.58e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 152.18i/s]
Accuracy : 0.5092592592592593
Epoch : 5
6.75e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.80i/s]
Accuracy : 0.5092592592592593
Epoch : 6
6.85e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.10i/s]
Accuracy : 0.5092592592592593
Epoch : 7
7.14e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.84i/s]
Accuracy : 0.5092592592592593
Epoch : 8
6.60e-01  100.00%┣██████████████████████████▉┫ 4084/4084 [00:27/00:27, 151.67i/s]
Accuracy : 0.5092592592592593
Epoch : 9
6.99

In [14]:
accuracy2(model, dtst)

0.9135