In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Funnynet

## A neural network that makes jokes

Special thanks to taivop for providing the [dataset](https://github.com/taivop/joke-dataset).

This notebook is heavily inspired by [fastai NLP work](https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb).

In [2]:
import pdb
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import math, random

In [3]:
# Root data directory (raw joke dumps plus the generated train/val folders).
PATH = Path('data')

# Special marker tokens; only EOJ is actually written into the corpus below.
BOS, FLD, EOJ = (
    'xbos',  # beginning-of-sentence tag
    'xfld',  # data field tag
    'xeoj',  # end of joke tag
)

In [5]:
# Snapshot the entries currently present in the data directory.
files = [entry for entry in PATH.iterdir()]
files

[PosixPath('data/models'),
 PosixPath('data/reddit_jokes.json'),
 PosixPath('data/val'),
 PosixPath('data/stupidstuff.json'),
 PosixPath('data/trn')]

In [7]:
# Pick out the two JSON dumps we care about from the directory listing.
# NOTE: if either file is missing, its variable is simply never assigned.
for fname in files:
    fname_str = str(fname)
    if "reddit_jokes.json" in fname_str:
        reddit_dataset = fname_str
    if "stupidstuff.json" in fname_str:
        stupid_dataset = fname_str

In [8]:
# Load both joke collections. The context managers close each file handle
# as soon as parsing finishes (the bare `json.load(open(...))` form leaves
# them open until garbage collection), and the explicit encoding keeps the
# result independent of the machine's locale.
with open(reddit_dataset, encoding='utf-8') as reddit_file:
    reddit_jokes = json.load(reddit_file)
with open(stupid_dataset, encoding='utf-8') as stupid_file:
    stupid_jokes = json.load(stupid_file)

In [9]:
len(reddit_jokes)

194553

In [10]:
reddit_jokes[0]

{'body': 'Now I have to say "Leroy can you please paint the fence?"',
 'id': '5tz52q',
 'score': 1,
 'title': 'I hate how you cant even say black paint anymore'}

In [11]:
# Merge each post's title and body into one string, separated by a single space.
title_plus_body = [' '.join((post['title'], post['body'])) for post in reddit_jokes]
title_plus_body[0]

'I hate how you cant even say black paint anymore Now I have to say "Leroy can you please paint the fence?"'

Horrible, but formatted correctly. 

# Setup, creating .txt files to train LSTM on

Partition the dataset (every fifth joke goes to the validation set, the rest to training) and write it to `data/trn/trn.txt` and `data/val/val.txt`.

In [15]:
# Make sure both split directories exist; an already-existing directory is fine.
for split_dir in (PATH/'val', PATH/'trn'):
    split_dir.mkdir(exist_ok=True)

In [14]:
# accepted_chars = "!\"\'“”‘’(),$*-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz \n"
# print(accepted_chars)

num_jokes = len(title_plus_body)

with open(PATH/'trn'/'trn.txt', 'wb') as trn, open(PATH/'val'/'val.txt', 'wb') as val:
    for i, raw_text in enumerate(title_plus_body):
        # Strip formatting: every non-ASCII character becomes a space.
        ascii_text = ''.join(' ' if ord(char) >= 128 else char for char in raw_text)
        # Terminate each joke with the end-of-joke marker, padded by spaces.
        joke = f'{ascii_text} {EOJ} '
        # Jokes 0, 5, 10, ... go to the validation file; all others to training.
        target = val if i % 5 == 0 else trn
        target.write(joke.encode('utf-8'))

In [16]:
list(PATH.iterdir())

[PosixPath('data/models'),
 PosixPath('data/reddit_jokes.json'),
 PosixPath('data/val'),
 PosixPath('data/stupidstuff.json'),
 PosixPath('data/trn')]

In [17]:
# TODO: Refactor to use pathlib
# NOTE: this rebinds PATH from the pathlib.Path defined near the top of the
# notebook to a plain string. Earlier cells that use `PATH/'...'` or
# `PATH.iterdir()` will fail if they are re-run after this cell.
# The trailing slash matters: later cells build paths by plain string
# concatenation (e.g. f'{PATH}models').
PATH='data/'

# Sub-directory names of the two splits, relative to PATH (trailing slash included).
TRN_PATH = 'trn/'
VAL_PATH = 'val/'
# Full split directories: 'data/trn/' and 'data/val/'.
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

[0m[01;34mmodels[0m/  reddit_jokes.json  stupidstuff.json  [01;34mtrn[0m/  [01;34mval[0m/


# Now that the data is partitioned into a train / val split, let's train the model

In [18]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *
from fastai import sgdr
# Model hyperparameters used by the cells below.
embeddings_sz = 42  # width of each character-embedding vector
n_hidden = 256      # size of the LSTM hidden state

In [19]:
# Character-level field: lowercase the text and split every string into single characters.
TEXT = data.Field(lower=True, tokenize=list)

bs = 64         # batch size handed to the data loader
bptt = 8        # presumably the back-prop-through-time window length; confirm against fastai
n_fac = 42      # same value as embeddings_sz; not referenced by the cells shown here
n_hidden = 256  # re-states the hidden size set in the previous cell

# The validation folder doubles as the test set.
FILES = {'train': TRN_PATH, 'validation': VAL_PATH, 'test': VAL_PATH}
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

# (batches per epoch, vocab size, number of training documents, characters in the training text)
len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

(77405, 73, 1, 39632371)

`md.nt` is the vocab size here. It's the same as `len(TEXT.vocab)` under the hood. (see `LanguageModelData` source).

In [40]:
??LanguageModelData

# LSTM

In [20]:
class CharSeqStatefulLSTM(nn.Module):
    """Stateful character-level LSTM language model.

    The hidden and cell states are kept on the instance (``self.h``) and
    carried over from one forward pass to the next, so consecutive batches
    continue from where the previous one stopped.

    Args:
        vocab_size: number of distinct tokens (size of the output layer).
        embeddings_sz: width of each token embedding vector.
        bs: batch size used to allocate the initial hidden state.
        nl: number of stacked LSTM layers.

    The hidden size is taken from the module-level ``n_hidden`` at
    construction time and stored as ``self.n_hidden``.
    """

    def __init__(self, vocab_size, embeddings_sz, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        # Capture the hidden size once. init_hidden() is called again whenever
        # the batch size changes, and it must keep producing states that match
        # the LSTM built here even if the global n_hidden is edited later.
        self.n_hidden = n_hidden
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.rnn = nn.LSTM(embeddings_sz, self.n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(self.n_hidden, vocab_size)
        self.init_hidden(bs)

    def forward(self, rnn_len):
        """Run one batch through the network.

        Args:
            rnn_len: token indices; the LSTM is not batch-first, so this is
                assumed to be shaped (seq_len, batch) - TODO confirm with the
                data loader.

        Returns:
            Log-probabilities over the vocabulary, flattened to
            (seq_len * batch, vocab_size).
        """
        bs = rnn_len[0].size(0)
        # Re-allocate the state when the incoming batch size differs from the
        # stored one (dimension 1 of the hidden tensor is the batch size).
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(rnn_len), self.h)
        # Keep the state values for the next call; repackage_var presumably
        # detaches them from the previous graph - confirm in fastai.
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)

    def init_hidden(self, bs):
        """Reset the (hidden, cell) state pair to zeros for batch size ``bs``."""
        self.h = (V(torch.zeros(self.nl, bs, self.n_hidden)),
                  V(torch.zeros(self.nl, bs, self.n_hidden)))

In [31]:
# Build a 2-layer model and move it to the GPU. The initial state is allocated
# for a batch of 512; forward() re-allocates it if the real batch size differs.
m = CharSeqStatefulLSTM(
    vocab_size=len(TEXT.vocab),
    embeddings_sz=embeddings_sz,
    bs=512,
    nl=2,
).cuda()
# Adam wrapped in fastai's LayerOptimizer; 1e-2 and 1e-5 are presumably the
# learning rate and weight decay - confirm against LayerOptimizer's signature.
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [38]:
# Directory for model checkpoints. PATH is the string 'data/' at this point,
# so the f-string resolves to 'data/models'.
# NOTE(review): `os` is never imported explicitly in this notebook - presumably
# it arrives through the fastai star-imports; confirm.
os.makedirs(f'{PATH}models', exist_ok=True)

In [39]:
# Initial training run: 2 epochs with the optimizer held by `lo`. NLL loss is
# the matching criterion because the model's forward() already returns
# log-softmax outputs.
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                  
    0      1.515767   1.495247  
    1      1.513658   1.484518                                  



[array([1.48452])]

In [47]:
def on_end(sched, cycle):
    """Save a checkpoint named after the annealing cycle that just finished."""
    return save_model(m, f'{PATH}models/cyc_{cycle}')

# Cosine annealing with restarts; each cycle is cycle_mult times longer than the last.
# NOTE(review): with cycle_mult=2 the cycles presumably span 1, 2, 4, ... epochs, so
# a 4-epoch run would stop one epoch into the third cycle - confirm.
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 4, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                  
    0      1.309037   1.280966  
    1      1.429574   1.416574                                  
    2      1.291161   1.263028                                  
    3      1.478619   1.472945                                  



[array([1.47295])]

In [48]:
def on_end(sched, cycle):
    """Save a checkpoint named after the annealing cycle that just finished."""
    return save_model(m, f'{PATH}models/cyc_{cycle}')

# Another 4 epochs of cosine annealing with restarts, continuing from the
# current weights of `m`. Checkpoints use the same cyc_<n> file names again.
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 4, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=4), HTML(value='')))

epoch      trn_loss   val_loss                                  
    0      1.30442    1.275652  
    1      1.416012   1.404496                                  
    2      1.285423   1.260299                                  
    3      1.458385   1.450095                                  



[array([1.45009])]

### Test

In [41]:
def get_next(inp):
    """Sample the character that follows the string ``inp`` using the model ``m``.

    The result is drawn at random from the model's predicted distribution, so
    repeated calls with the same input can return different characters.
    """
    token_ids = TEXT.numericalize(inp)
    log_probs = m(VV(token_ids.transpose(0,1)))
    # The last row holds the log-probabilities for the position after the final
    # input character; exp() converts them to probabilities for sampling.
    sampled = torch.multinomial(log_probs[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(sampled)[0]]

In [42]:
get_next('into a b')

'l'

In [43]:
get_next('a ma')

'n'

In [44]:
get_next('blon')

'd'

In [45]:
def get_next_n(inp, n):
    """Extend the seed string ``inp`` by ``n`` sampled characters.

    The model is fed a sliding window of the same length as the seed: after
    each sampled character the oldest one is dropped and the new one appended.

    Returns:
        The seed followed by the ``n`` generated characters.
    """
    window = inp
    generated = []
    for _ in range(n):
        nxt = get_next(window)
        generated.append(nxt)
        window = window[1:] + nxt
    return inp + ''.join(generated)

In [46]:
print(get_next_n('into a b', 400))

into a boy calls into the people prack and slippe xounchlock bridge xeoj why does the best only publes over himsned kick and says, "chuffur, treattent, the ass so the busched exciter were so everywhe always a fither brings him. the viden. "hey give me: "find!" xeoj i'll killed, are nater.  so, if the armuns to nrot ide there looks to he water has the little but it cream harded.after a ravol-while from the


In [54]:
print(get_next_n('blon', 200))

blong, it withother, is a sec xeoj de pilg xeoj dick xeoj and nigged?* xeoj how four and one now."**wife: you took somete; it anonist, and paba and, man: i remus keepular. i can caughtyfe***"yes" 9.he run


In [55]:
print(get_next_n('what do y',400))

what do you have a bad." xeoj  what would play him with my bodyle with my penis bottle of erectued. you sound xeoj  saunce xeoj  what chef nic?"'miss as the popher has unfully gun.  xeoj  this movie- xeoj  teacher riding youx for their awe... if i'll make it chanced. xeoj  you knowing] black. after name dicks xeoj  two teacher radio one.0." xeoj  i met your brother, tomorrow xeoj  girl we need to be fourin


In [58]:
# def get_next_n_jokes(prompt, n):
#     jokes = []
#     c = ''
#     for j in range(n):
#         inp = prompt
#         res = prompt
#         while res[-5:-1]!="xeoj":
#             c = get_next(inp)
#             res += c
#             inp = inp[1:]+c
#         jokes.append(res)
#     return jokes

In [56]:
# for joke in get_next_n_jokes("you hear about the university book store worker",4):
#     print(joke)
#     print("\n")

In [57]:
# for joke in get_next_n_jokes("i hate how you can't s",4):
#     print(joke)
#     print("\n")

In [59]:
# for joke in get_next_n_jokes('why women need legs',4):
#     print(joke)
#     print("\n")