In [44]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Funnynet-Three character model

Special thanks to taivop for providing the [dataset](https://github.com/taivop/joke-dataset).

This notebook is heavily inspired by [fastai NLP work](https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb).

This is the initial experiment in which we build a model which considers the previous three characters at a time to predict the next.

In [45]:
import pdb
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import math, random
import preprocessing

Run the first line in the cell below to create the data set for this notebook the first time.

After, run the second line in the cell below to reload the data for this notebook. 

In [46]:
#idx, char_indices, indices_char, chars, vocab_size = preprocessing.save_data_to_pickle("data/800000char.pickle", 800000)
idx, char_indices, indices_char, chars, vocab_size = preprocessing.load_data("data/800000char.pickle")

In [47]:
cs = 3
c1_data = c2_data = c3_data = c4_data = []
for i in range(0, len(idx)-cs, cs):
    c1_data.append(idx[i])
    c2_data.append(idx[i+1])
    c3_data.append(idx[i+2])
    c4_data.append(idx[i+3])

### Create the input and outputs of our RNN

In [48]:
x1 = np.stack(c1_data)
x2 = np.stack(c2_data)
x3 = np.stack(c3_data)
y  = np.stack(c4_data)

### Create and train model

In [49]:
n_hidden = 256
embeddings_sz = 42 # size of embeddings matrix

In [50]:
# These libraries require some setup, try the pip install git+https.github.com/... trick
from fastai.io import *
from fastai.conv_learner import *

from fastai.column_data import *

In [51]:
class ThreeCharRNN(nn.Module):
    def __init__(self, vocab_size, embeddings_sz):
        super().__init__()
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.l_in = nn.Linear(embeddings_sz, n_hidden)
        self.l_hidden = nn.Linear(n_hidden, n_hidden)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        
    def forward(self, c1, c2, c3):
        in1 = F.relu(self.l_in(self.e(c1))) # Why relu?
        in2 = F.relu(self.l_in(self.e(c2)))
        in3 = F.relu(self.l_in(self.e(c3)))
        
        h = V(torch.zeros(in1.size()).cuda())
        h = F.tanh(self.l_hidden(h+in1))
        h = F.tanh(self.l_hidden(h+in2))
        h = F.tanh(self.l_hidden(h+in3))
        
        return F.log_softmax(self.l_out(h))

In [52]:
md = ColumnarModelData.from_arrays('.', [-1], np.stack([x1,x2,x3], axis=1), y, bs=512)

In [53]:
model = ThreeCharRNN(vocab_size, embeddings_sz).cuda()

In [54]:
train_iterator = iter(md.trn_dl)
*xs,yt = next(train_iterator)
t = model(*V(xs))

In [55]:
optimizer = optim.Adam(model.parameters(), lr=1e-2)

In [57]:
fit(model, md, n_epochs=1, opt=optimizer, crit=F.nll_loss) # The negative log likelihood loss

epoch      trn_loss   val_loss                                    
    0      0.00392    0.000284  



[array([0.00028])]

In [58]:
def get_next(input):
   running_indicies = []
   indicies = []
   for char in input:
       running_indicies.append(char_indices[char])
   for i in range(10):
       indicies = np.array(running_indicies[-3:])
       indicies = T(indicies)
       prediction = model(*VV(indicies))
       pred_idx = np.argmax(to_np(prediction))
       running_indicies.append(pred_idx)
   result_chars = []
   for index in running_indicies:
       result_chars.append(chars[index])
   return result_chars

In [59]:
get_next('y. ')

['y', '.', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']

In [60]:
get_next('the')

['t', 'h', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e']

In [61]:
get_next('blo')

['b', 'l', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o', 'o']

It appears in most (all?) cases it just guesses the last character of the input repeatedly.