In [64]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Funnynet - 8 Character Model

Special thanks to taivop for providing the [dataset](https://github.com/taivop/joke-dataset).

This notebook is heavily inspired by [fastai NLP work](https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb).

In this notebook we build a model which considers the previous eight characters at a time to predict the next.

In [65]:
import pdb
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import math, random
import preprocessing as pp

# These libraries require some setup, try the pip install git+https.github.com/... trick
from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [94]:
#idx, char_indices, indices_char, chars, vocab_size = pp.save_data_to_pickle("data/800000char.pickle", 800000)
idx, char_indices, indices_char, chars, vocab_size = pp.save_data_to_pickle("data/800000char.pickle", 800000)
embeddings_sz = 42
n_hidden = 256


cs = 3
c1_data = c2_data = c3_data = c4_data = []
for i in range(0, len(idx)-cs, cs):
    c1_data.append(idx[i])
    c2_data.append(idx[i+1])
    c3_data.append(idx[i+2])
    c4_data.append(idx[i+3])
x1 = np.stack(c1_data)
x2 = np.stack(c2_data)
x3 = np.stack(c3_data)
y  = np.stack(c4_data)
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

### Let's create a bigger RNN!

In [67]:
rnn_len=8

In [68]:
# char_in_jokes = [[idx[i+j] for i in range(rnn_len)] for j in range(len(idx)-rnn_len)]

char_input = []
for j in range(len(idx)-rnn_len):
    tmp = []
    for i in range(rnn_len):
        tmp.append(idx[i+j])
    char_input.append(tmp)

In [69]:
char_output = []
for j in range(len(idx)-rnn_len):
    char_output.append(idx[j+rnn_len])

In [70]:
len(char_input)

800040

In [71]:
xs = np.stack(char_input, axis=0)

In [72]:
xs.shape

(800040, 8)

In [73]:
y = np.stack(char_output)
y.shape

(800040,)

In [74]:
xs[:rnn_len,:rnn_len]

array([[ 3, 44,  3, 75, 68, 87, 72,  3],
       [44,  3, 75, 68, 87, 72,  3, 75],
       [ 3, 75, 68, 87, 72,  3, 75, 82],
       [75, 68, 87, 72,  3, 75, 82, 90],
       [68, 87, 72,  3, 75, 82, 90,  3],
       [87, 72,  3, 75, 82, 90,  3, 92],
       [72,  3, 75, 82, 90,  3, 92, 82],
       [ 3, 75, 82, 90,  3, 92, 82, 88]])

In [75]:
len(y[:rnn_len])

8

In [76]:
val_idx = get_cv_idxs(len(idx)-rnn_len-1)
# val_idx.shape
model_data = ColumnarModelData.from_arrays('.', val_idx, xs, y, bs=512)

In [77]:
class CharLoopModel(nn.Module):
    def __init__(self, vocab_size, embeddings_sz):
        super().__init__()
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.l_in = nn.Linear(embeddings_sz, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *rnn_len):
#         pdb.set_trace()
        bs = rnn_len[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in rnn_len:
            inp = F.relu(self.l_in(self.e(c)))
            h = F.tanh(self.l_hidden(h+inp))
            
        return F.log_softmax(self.l_out(h), dim=-1)

In [78]:
model = CharLoopModel(vocab_size, embeddings_sz).cuda()
opt = optim.Adam(model.parameters(), 1e-2)

In [80]:
fit(model, model_data, n_epochs=1, opt=opt, crit=F.nll_loss)

epoch      trn_loss   val_loss                                
    0      2.047855   2.082238  



[array([2.08224])]

In [81]:
set_lrs(opt, 0.001)

In [83]:
fit(model, model_data, n_epochs=1, opt=opt, crit=F.nll_loss)

epoch      trn_loss   val_loss                                
    0      1.796313   1.803331  



[array([1.80333])]

In [84]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = model(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [85]:
get_next('for thos')

'e'

In [86]:
get_next(' a blond')

'e'

In [87]:
get_next('into a b')

'e'

In [88]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [89]:
get_next_n('into a b', 40)

'into a been a been a been a been a been a been a'

In [90]:
class CharLoopConcatModel(nn.Module):
    def __init__(self, vocab_size, embeddings_sz):
        super().__init__()
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.l_in = nn.Linear(embeddings_sz+n_hidden, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *rnn_len):
        bs = rnn_len[0].size(0)
        h = V(torch.zeros(bs, n_hidden).cuda())
        for c in rnn_len:
            inp = torch.cat((h, self.e(c)), 1)
            inp = F.relu(self.l_in(inp))
            h = F.tanh(self.l_hidden(inp))
        
        return F.log_softmax(self.l_out(h), dim=-1)

In [91]:
m = CharLoopConcatModel(vocab_size, embeddings_sz).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [95]:
it = iter(md.trn_dl)
*xs,yt = next(it)
t = m(*V(xs))

In [96]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                    
    0      0.000296   0.00027   



[array([0.00027])]

In [97]:
set_lrs(opt, 1e-4)

In [98]:
fit(m, md, 1, opt, F.nll_loss)

epoch      trn_loss   val_loss                                    
    0      0.000185   0.000179  



[array([0.00018])]

In [99]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [100]:
get_next('wome')

'w'

In [101]:
get_next('beca')

'b'

In [102]:
get_next('char')

'c'

### Model with Multiple Outputs

In [103]:
class CharRnn(nn.Module):
    def __init__(self, vocab_size, embeddings_sz):
        super().__init__()
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.rnn = nn.RNN(embeddings_sz, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *rnn_len):
        #print("rnn_len: "+str(rnn_len))
        bs = rnn_len[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(rnn_len))
        outp,h = self.rnn(inp, h)
        
        return F.log_softmax(self.l_out(outp[-1]), dim=-1)

In [104]:
m = CharRnn(vocab_size, embeddings_sz).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [105]:
it = iter(md.trn_dl)
*xs,yt = next(it)

In [106]:
t = m.e(V(torch.stack(xs)))
t.size()

torch.Size([3, 512, 42])

In [107]:
ht = V(torch.zeros(1, 512,n_hidden))
outp, hn = m.rnn(t, ht)
outp.size(), hn.size()

(torch.Size([3, 512, 256]), torch.Size([1, 512, 256]))

In [108]:
t = m(*V(xs)); t.size()

torch.Size([512, 125])

In [109]:
fit(m, md, 4, opt, F.nll_loss)

epoch      trn_loss   val_loss                                    
    0      0.000131   0.00029   
    1      3.3e-05    6.7e-05                                     
    2      1.8e-05    2.1e-05                                     
    3      2e-06      6e-06                                       



[array([0.00001])]

In [110]:
set_lrs(opt, 1e-4)

In [111]:
fit(m, md, 2, opt, F.nll_loss)

epoch      trn_loss   val_loss                                    
    0      1e-06      4e-06     
    1      1e-06      4e-06                                       



[array([0.])]

In [112]:
def get_next(inp):
    idxs = T(np.array([char_indices[c] for c in inp]))
    p = m(*VV(idxs))
    i = np.argmax(to_np(p))
    return chars[i]

In [113]:
get_next('for thos')

't'

In [114]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [115]:
get_next_n('for thos', 40)

'for thostot ttttttt ttttttt ttttttt ttttttt tttt'

In [116]:
c_in_dat = [[idx[i+j] for i in range(rnn_len)] for j in range(0, len(idx)-rnn_len-1, rnn_len)]

In [117]:
c_out_dat = [[idx[i+j] for i in range(rnn_len)] for j in range(1, len(idx)-rnn_len, rnn_len)]

In [118]:
xs = np.stack(c_in_dat)
xs.shape

(100005, 8)

In [119]:
xs[:rnn_len,:rnn_len]

array([[ 3, 44,  3, 75, 68, 87, 72,  3],
       [75, 82, 90,  3, 92, 82, 88,  3],
       [70, 68, 81, 87,  3, 72, 89, 72],
       [81,  3, 86, 68, 92,  3, 69, 79],
       [68, 70, 78,  3, 83, 68, 76, 81],
       [87,  3, 68, 81, 92, 80, 82, 85],
       [72,  3, 49, 82, 90,  3, 44,  3],
       [75, 68, 89, 72,  3, 87, 82,  3]])

In [120]:
ys = np.stack(c_out_dat)
ys.shape

(100005, 8)

In [121]:
ys[:rnn_len,:rnn_len]

array([[44,  3, 75, 68, 87, 72,  3, 75],
       [82, 90,  3, 92, 82, 88,  3, 70],
       [68, 81, 87,  3, 72, 89, 72, 81],
       [ 3, 86, 68, 92,  3, 69, 79, 68],
       [70, 78,  3, 83, 68, 76, 81, 87],
       [ 3, 68, 81, 92, 80, 82, 85, 72],
       [ 3, 49, 82, 90,  3, 44,  3, 75],
       [68, 89, 72,  3, 87, 82,  3, 86]])

In [122]:
val_idx = get_cv_idxs(len(xs)-rnn_len-1)

In [123]:
md = ColumnarModelData.from_arrays('.', val_idx, xs, ys, bs=512)

In [124]:
class CharSeqRnn(nn.Module):
    def __init__(self, vocab_size, embeddings_sz):
        super().__init__()
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.rnn = nn.RNN(embeddings_sz, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, *rnn_len):
        bs = rnn_len[0].size(0)
        h = V(torch.zeros(1, bs, n_hidden))
        inp = self.e(torch.stack(rnn_len))
        outp,h = self.rnn(inp, h)
        return F.log_softmax(self.l_out(outp), dim=-1)

In [125]:
m = CharSeqRnn(vocab_size, embeddings_sz).cuda()
opt = optim.Adam(m.parameters(), 1e-3)

In [126]:
it = iter(md.trn_dl)
*xst,yt = next(it)
#t = m(*V(xs))

In [127]:
def nll_loss_seq(inp, targ):
    sl,bs,nh = inp.size()
    targ = targ.transpose(0,1).contiguous().view(-1)
    return F.nll_loss(inp.view(-1,nh), targ)

In [128]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.543844   2.389568  
    1      2.227583   2.18239                               
    2      2.099198   2.085688                              
    3      2.024093   2.020647                              



[array([2.02065])]

In [129]:
set_lrs(opt, 1e-4)

In [130]:
fit(m, md, 1, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      1.984913   2.007045  



[array([2.00704])]

In [131]:
m = CharSeqRnn(vocab_size, embeddings_sz).cuda()
opt = optim.Adam(m.parameters(), 1e-2)

In [132]:
m.rnn.weight_hh_l0.data.copy_(torch.eye(n_hidden))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.cuda.FloatTensor of size 256x256 (GPU 0)]

In [133]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      2.270331   2.167208  
    1      2.072203   2.076485                              
    2      2.005873   2.01107                               
    3      1.967324   1.986756                              



[array([1.98676])]

In [134]:
set_lrs(opt, 1e-3)

In [135]:
fit(m, md, 4, opt, nll_loss_seq)

epoch      trn_loss   val_loss                              
    0      1.891812   1.919288  
    1      1.879213   1.913314                              
    2      1.87249    1.910392                              
    3      1.872175   1.905287                              



[array([1.90529])]