# Символьная языковая модель на основе RNN
(c) Deniz Yuret, 2019. Based on http://karpathy.github.io/2015/05/21/rnn-effectiveness.

* Задачи: научиться определять и обучать символьную языковую модель и генерировать из неё текст; разбивать текст на мини-пакеты блоков; сохранять состояние RNN между обновлениями; обучить с помощью одной и той же модели генератор текстов Шекспира и «программиста» на Julia.
* Новые функции:
[converge](http://denizyuret.github.io/Knet.jl/latest/reference/#Knet.converge)

In [1]:
# Set display width, load packages, import symbols
ENV["COLUMNS"]=72
using Pkg; haskey(Pkg.installed(),"Knet") || Pkg.add("Knet")
using Statistics: mean
using Base.Iterators: cycle
using Knet: Knet, AutoGrad, Data, param, param0, mat, RNN, dropout, value, nll, adam, minibatch, progress!, converge

## Определяем модель

In [2]:
# Embedding layer: w holds one embedding column per vocabulary entry.
struct Embed
    w
end

# Build the layer with a weight parameter created by param(embed, vocab).
function Embed(vocab::Int, embed::Int)
    return Embed(param(embed, vocab))
end

# Look up the columns of w selected by the ids in x:
# (B,T)->(X,B,T)->rnn->(H,B,T)
function (e::Embed)(x)
    return e.w[:, x]
end

In [3]:
# Affine layer with a weight matrix w and a bias vector b.
struct Linear
    w
    b
end

# Weights come from param(output, input), the bias from param0(output).
function Linear(input::Int, output::Int)
    weights = param(output, input)
    bias = param0(output)
    return Linear(weights, bias)
end

# Flatten the input with mat(x, dims=1), then apply w and b:
# (H,B,T)->(H,B*T)->(V,B*T)
function (l::Linear)(x)
    flat = mat(x, dims=1)
    return l.w * flat .+ l.b
end

In [4]:
# Let's define a chain of layers
struct Chain
    layers
    function Chain(layers...)
        return new(layers)
    end
end

# Forward pass: feed x through the layers in order and return the last output
# (x itself when the chain has no layers).
(c::Chain)(x) = foldl((h, layer) -> layer(h), c.layers; init=x)
# Loss for one minibatch: nll of the targets y under the chain's output for x.
function (c::Chain)(x, y)
    scores = c(x)
    return nll(scores, y)
end

# Loss for a dataset: mean of the minibatch losses over all (x, y) pairs in d.
function (c::Chain)(d::Data)
    return mean(c(inputs, targets) for (inputs, targets) in d)
end

In [5]:
# Character language model: Embed -> RNN -> Linear.
# The h=0, c=0 arguments make the RNN keep persistent state between iterations.
# Extra keyword arguments are forwarded to the RNN constructor.
function CharLM(vocab::Int, embed::Int, hidden::Int; o...)
    embedding = Embed(vocab, embed)
    recurrent = RNN(embed, hidden; h=0, c=0, o...)
    output = Linear(hidden, vocab)
    return Chain(embedding, recurrent, output)
end

CharLM (generic function with 1 method)

## Тренировка и тестирование

In [6]:
# For running experiments.
# Asks on stdin whether to train from scratch. A reply starting with 'y' trains
# `model` on the global `dtrn` (monitoring perplexity on the global `dtst`) and
# saves model and chars to `file`; any other reply (including an empty one)
# loads them from `file`, downloading it first if it is not present.
# Returns the (model, chars) pair.
function trainresults(file,model,chars)
    print("Train from scratch? ")
    # startswith handles an empty reply, where indexing readline()[1] would throw a BoundsError
    if startswith(readline(), "y")
        a = adam(model,cycle(dtrn))                   # adam updates over the endlessly repeated training batches
        b = (exp(model(dtst)) for _ in every(100,a))  # test perplexity, evaluated once per 100 updates
        c = converge(b, alpha=0.1)                    # presumably ends the stream once perplexity stops improving; see Knet.converge
        progress!(c, alpha=1)                         # run the whole pipeline with a progress display
        Knet.save(file,"model",model,"chars",chars)
    else
        isfile(file) || download("http://people.csail.mit.edu/deniz/models/tutorial/$file",file)
        model,chars = Knet.load(file,"model","chars")
    end
    Knet.gc() # To save gpu memory
    return model,chars
end

# Lazily yield every n-th element of itr (the n-th, 2n-th, ...).
function every(n, itr)
    hits = Iterators.filter(pair -> first(pair) % n == 0, enumerate(itr))
    return (last(pair) for pair in hits)
end;

In [7]:
# For generating text from trained models.
# Resets the model state, then repeatedly feeds the last sampled character id
# (starting from id 1) to `model`, samples the next id from the returned scores
# and prints the corresponding entry of `chars`; n characters are printed,
# followed by a newline.
function generate(model,chars,n)
    # Draw an index with probability proportional to exp.(y)
    function sample(y)
        scores = Array(y)
        # Subtracting the maximum leaves the distribution unchanged but keeps exp
        # from overflowing to Inf or underflowing every entry to 0
        p = exp.(scores .- maximum(scores))
        r = rand()*sum(p)
        for j in eachindex(p)
            (r -= p[j]) < 0 && return j
        end
        # Rounding can leave r >= 0 after the loop; fall back to the last index
        # instead of returning nothing, which would make chars[x] fail
        return length(p)
    end
    x = 1
    reset!(model)
    for i=1:n
        y = model([x])
        x = sample(y)
        print(chars[x])
    end
    println()
end

# Clear the persistent state of every RNN layer in the chain by setting h and c to 0.
function reset!(m::Chain)
    for layer in m.layers
        if layer isa RNN
            layer.h = 0
            layer.c = 0
        end
    end
    return nothing
end;

## Полное собрание сочинений Уильяма Шекспира

In [8]:
# Data layout
BATCHSIZE = 256     # rows per minibatch (used by mb)
SEQLENGTH = 100     # characters per minibatch row (used by mb)
VOCABSIZE = 84      # vocabulary size passed to CharLM
# Model
RNNTYPE = :lstm     # passed to CharLM as rnnType
INPUTSIZE = 168     # embedding size passed to CharLM
HIDDENSIZE = 334    # hidden size passed to CharLM
NUMLAYERS = 1;      # passed to CharLM as numLayers

In [9]:
# Load 'The Complete Works of William Shakespeare'
include(Knet.dir("data", "gutenberg.jl"))
trn, tst, shakechars = shakespeare()
summary.((trn, tst, shakechars))

("4934845-element Array{UInt8,1}", "526731-element Array{UInt8,1}", "84-element Array{Char,1}")

In [10]:
# Print a sample: map a slice of character ids back to characters
println(join(shakechars[trn[1020:1210]]))


    Cheated of feature by dissembling nature,
    Deform'd, unfinish'd, sent before my time
    Into this breathing world scarce half made up,
    And that so lamely and unfashionable
 


In [11]:
# Minibatch data
function mb(a)
    ncols = div(length(a), BATCHSIZE)
    # Reshape the full data to (B, N) with contiguous rows
    rows = transpose(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:end-1]
    targets = rows[:, 2:end]  # the inputs shifted by one position
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B, T) blocks
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(192, 20)

In [12]:
map(summary, first(dtrn))  # each x and y has size (BATCHSIZE, SEQLENGTH)

("256×100 Array{UInt8,2}", "256×100 Array{UInt8,2}")

In [13]:
# Sample progress-bar line (presumably recorded from a from-scratch training run):
# 3.30e+00  ┣   /       /       /       /       /    ┫ 122 [04:46, 2.35s/i]
Knet.gc()
shakemodel = CharLM(
    VOCABSIZE, INPUTSIZE, HIDDENSIZE;
    rnnType=RNNTYPE, numLayers=NUMLAYERS
)
# Train from scratch or load the saved model, depending on the reply given on stdin
shakemodel, shakechars = trainresults("shakespeare113.jld2", shakemodel, shakechars);

Train from scratch? stdin> n


In [14]:
exp(shakemodel(dtst))  # Test perplexity: exp of the mean minibatch nll loss over dtst (= 3.30)

3.2993853f0

In [15]:
# Sample and print 1000 characters from the Shakespeare model
generate(shakemodel,shakechars,1000)

Floudg,  
  Kent. My lord, of more, youth away, his gracious
     forgot rules over a gentlewisold, how thou reads,
     The business, Romeo, Hastings, the field
  GENTLOW, MACBETH, with the TRIER


Flourish coults
  DUKE, SILVIA, and, Monten

Enter CLOWN and CLARENCE of OARSMA

  FLUELLEN. He is good, let your sooving themselves shin excelsions
    banished you are not acquainting the now. Yet she comes
    that doth make defeat of mine are that he makes an oath,
    there is a connnivation dospish from these hands, upon the
    coverworth as certain physice.
  PISTOL. T' never soft as come dead.
    Well, are the god sitting odds foo my business?'
    The bloody king, this fast show rank and runk.
  PERDIATA. Well, let's awly ladies.'
  WIDOW. Take your hands, you are poison nor the valiant man
    proph on matches sent out 'Hang.
  CELIA. They are the new-my woman's character; I am both about the
    friend of the duncatar; I, a drum in the searchine!  
  HO


## Программист на Julia

In [16]:
# Data layout
BATCHSIZE = 64      # rows per minibatch (used by mb)
SEQLENGTH = 64      # characters per minibatch row (used by mb)
VOCABSIZE = 128     # vocabulary size; rarer characters are mapped to the last id
# Model
RNNTYPE = :lstm     # passed to CharLM as rnnType
INPUTSIZE = 512     # embedding size passed to CharLM
HIDDENSIZE = 512    # hidden size passed to CharLM
NUMLAYERS = 2;      # passed to CharLM as numLayers

In [17]:
# Read the source code of the julia base library

# Concatenate the contents of every file whose name ends in ".jl" under dir,
# visiting directories in walkdir order.
function readjlsources(dir)
    buf = IOBuffer()  # avoids the quadratic cost of repeated string concatenation
    for (root, dirs, files) in walkdir(dir)
        for f in files
            # endswith is safe for names shorter than three characters and for
            # non-ASCII names, where slicing f[end-2:end] can throw
            endswith(f, ".jl") || continue
            write(buf, read(joinpath(root, f)))
        end
    end
    return String(take!(buf))
end

base = joinpath(Sys.BINDIR, Base.DATAROOTDIR, "julia")
text = readjlsources(base)
length(text)

9168446

In [18]:
# Find the unique characters, sort them by frequency, assign integer ids.
charcnt = Dict{Char,Int}()
for ch in text
    charcnt[ch] = get(charcnt, ch, 0) + 1
end
juliachars = sort(collect(keys(charcnt)), by=(ch -> charcnt[ch]), rev=true)
# id of a character = its rank in the frequency-sorted list
charid = Dict{Char,Int}(ch => id for (id, ch) in enumerate(juliachars))
hcat(juliachars, [charcnt[ch] for ch in juliachars])

3642×2 Array{Any,2}:
 ' '   1981523
 'e'    550283
 't'    479801
 'n'    344556
 'r'    339428
 'i'    330713
 's'    327190
 'a'    317875
 'o'    277083
 '\n'   266644
 'l'    204455
 ','    201044
 ')'    194869
 ⋮            
 'ה'         1
 '🍢'         1
 '𝗾'         1
 '𝔔'         1
 'É'         1
 '𝓟'         1
 '𝚿'         1
 '𝕨'         1
 'ɛ'         1
 'Χ'         1
 '🕙'         1
 'ℚ'         1

In [19]:
# Keep only the VOCABSIZE most frequent characters, split into train and test.
# Every id above VOCABSIZE is clamped to VOCABSIZE.
data = [min(charid[ch], VOCABSIZE) for ch in text]
ntst = 2^19
tst, trn = data[1:ntst], data[ntst+1:end]
length.((data, trn, tst))

(9168446, 8644158, 524288)

In [20]:
# Print a sample: 1001 consecutive characters from a random position in trn
r = rand(1:(length(trn)-1000))
println(join(juliachars[trn[r:r+1000]]))

{}, Union{}) === Const(true) # any result is ok
    @test subtype_tfunc(Union{}, Type{typeof(Union{})}) === Const(true) # any result is ok
    @test subtype_tfunc(Union{}, Const(typeof(Union{}))) === Const(true) # any result is ok
    @test subtype_tfunc(typeof(Union{}), Const(typeof(Union{}))) === Const(true) # Union{} <: typeof(Union{})
    @test subtype_tfunc(typeof(Union{}), Const(Int)) === Const(true) # Union{} <: Int
    @test subtype_tfunc(typeof(Union{}), Const(Union{})) === Const(true) # Union{} <: Union{}
    @test subtype_tfunc(typeof(Union{}), Type{typeof(Union{})}) === Const(true) # Union{} <: Union{}
    @test subtype_tfunc(typeof(Union{}), Type{typeof(Union{})}) === Const(true) # Union{} <: typeof(Union{})
    @test subtype_tfunc(typeof(Union{}), Type{Union{}}) === Const(true) # Union{} <: Union{}
    @test subtype_tfunc(Type{Union{}}, typeof(Union{})) === Const(true) # Union{} <: Union{}
    @test subtype_tfunc(Type{Union{}}, Const(typeof(Union{}))) === Const(true) # Un

In [21]:
# Minibatch data
function mb(a)
    ncols = div(length(a), BATCHSIZE)
    # Reshape the full data to (B, N) with contiguous rows
    rows = transpose(reshape(a[1:ncols*BATCHSIZE], ncols, BATCHSIZE))
    inputs = rows[:, 1:end-1]
    targets = rows[:, 2:end]  # the inputs shifted by one position
    return minibatch(inputs, targets, SEQLENGTH)  # split into (B, T) blocks
end
dtrn, dtst = map(mb, (trn, tst))
map(length, (dtrn, dtst))

(2110, 127)

In [22]:
map(summary, first(dtrn))  # each x and y has size (BATCHSIZE, SEQLENGTH)

("64×64 Array{Int64,2}", "64×64 Array{Int64,2}")

In [23]:
# Sample progress-bar line (presumably recorded from a from-scratch training run):
# 3.25e+00  ┣       /       /       /       /       /┫ 126 [05:43, 2.72s/i]
juliamodel = CharLM(
    VOCABSIZE, INPUTSIZE, HIDDENSIZE;
    rnnType=RNNTYPE, numLayers=NUMLAYERS
)
# Train from scratch or load the saved model, depending on the reply given on stdin
juliamodel, juliachars = trainresults("juliacharlm113.jld2", juliamodel, juliachars);

Train from scratch? stdin> n


In [24]:
exp(juliamodel(dtst))  # Test perplexity: exp of the mean minibatch nll loss over dtst (= 3.27)

3.27486f0

In [25]:
# Sample and print 1000 characters from the Julia source-code model
generate(juliamodel,juliachars,1000)

                 # optional
    _ = Expr(expMreadcos, Expr(:meta, :stderr), :n, :default, ex, context[o.e.ex, ex.args[1] -typeinfo + Int])

    isprint((v"), GotoNode(e))
end

for (fname, getfield) do t
        print(io, ":")
        new()
    end
end

if option
    quote
        bounds end
    end
    @sprintf("Other prompt", ex.field, UV_REQ) == pop!(bb_start_off+1, i)
    write(io, take!(builder_path))
end

Base.:Table(io::IOContext) = write(io, position(s))

function const_rerror(pre::GlobalRef)
    ret = proty(d)
    if !rel_key && length(blk)
        return htstarted_keys(terminal(u, p))
    end
    write(io, ("\\\\\" => "\n\n\n\n") ? "<username>\n>\n"
    p = empty(dir+stdout)
    n = MD(count_ok_new_data(L) : n_power
    while push!(blks[$ur], altbuf)
    end
    function prec_uninitual(p, keep='\n')
        print(io, "1 2")
    else
        p = blk + p0
        out = Mair(1)
    elseif occursin(".cmd", keep=ks) != 0
        res = write(io, c)
    end
    while take!(word)
    