In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Funnynet

## A neural network that makes jokes

Special thanks to taivop for providing the [dataset](https://github.com/taivop/joke-dataset).

This notebook is heavily inspired by [fastai NLP work](https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb).

In [2]:
import pdb
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import math, random

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
EOJ = 'xeoj'  # end of joke tag

PATH=Path('data')

In [4]:
files = list(PATH.iterdir())
print(files)

[PosixPath('data/models'), PosixPath('data/reddit_jokes.json'), PosixPath('data/val'), PosixPath('data/stupidstuff.json'), PosixPath('data/trn')]


In [7]:
stupid_dataset

'data/stupidstuff.json'

In [8]:
for fname in files:
    if "reddit_jokes.json" in str(fname):
        reddit_dataset = str(fname)
    if "stupidstuff.json" in str(fname):
        stupid_dataset = str(fname)

In [9]:
reddit_jokes = json.load(open(reddit_dataset))
stupid_jokes = json.load(open(stupid_dataset))

In [11]:
len(reddit_jokes)

194553

In [12]:
reddit_jokes[0]

{'body': 'Now I have to say "Leroy can you please paint the fence?"',
 'id': '5tz52q',
 'score': 1,
 'title': 'I hate how you cant even say black paint anymore'}

Let's discard all the jokes that have 0 score, as they aren't that helpful for training

In [13]:
rated_jokes = [joke for joke in reddit_jokes if joke['score'] > 0]

In [14]:
len(rated_jokes)

132992

In [18]:
title_plus_body = [joke['title'] + ' ' + joke['body'] for joke in rated_jokes]
title_plus_body[0]
# len(title_plus_body)

'I hate how you cant even say black paint anymore Now I have to say "Leroy can you please paint the fence?"'

Horrible, but formatted correctly. Now, let's combine all the jokes into one long string, using the `EOJ` tag.

In [19]:
accepted_chars = "!\"\'“”‘’(),$*-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyz \n"
# print(accepted_chars)

text = ''
for joke in title_plus_body:
    for char in joke:
        if char not in accepted_chars:
            break
    else:
        text += ' ' + joke + ' ' + EOJ + ' '
    if len(text) > 2E6:
        break

In [21]:
len(text)
print(text[1000:1100])

hit me, not that they were going to destroy the housing market 20 years later. xeoj  My boss said to


In [22]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
chars.insert(0, "\0")
print(chars)

total chars: 86
['\x00', '\n', ' ', '!', '"', '$', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '‘', '’', '“', '”']


In [23]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [24]:
idx = [char_indices[c] for c in text]

idx[:10]

[2, 35, 2, 63, 56, 75, 60, 2, 63, 70]

In [25]:
''.join(indices_char[i] for i in idx[:70])

' I hate how you cant even say black paint anymore Now I have to say "L'

# Setup, creating .txt files to train LSTM on

Partitioning the dataset and writing it to /data/trn/trn.txt and /data/val/val.txt

In [26]:
PATH

PosixPath('data')

In [27]:
(PATH/'val').mkdir(exist_ok=True)
(PATH/'trn').mkdir(exist_ok=True)

In [29]:
list(PATH.iterdir())

[PosixPath('data/models'),
 PosixPath('data/reddit_jokes.json'),
 PosixPath('data/val'),
 PosixPath('data/stupidstuff.json'),
 PosixPath('data/trn')]

In [32]:
with open(PATH/'trn'/'trn.txt', 'wb') as f:
    f.write(text[:(len(text)*4)//5].encode('utf-8'))
    
with open(PATH/'val'/'val.txt', 'wb') as f:
    f.write(text[:(len(text)*4)//5].encode('utf-8'))    

In [33]:
# TODO: Refactor to use pathlib
PATH='data/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

[0m[01;34mmodels[0m/  reddit_jokes.json  stupidstuff.json  [01;34mtrn[0m/  [01;34mval[0m/


# Now, we have data partitioned into a train / val split, let's train the model

In [35]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *
from fastai import sgdr
n_hidden = 256
embeddings_sz = 42 # size of embeddings matrix

In [36]:
TEXT = data.Field(lower=True, tokenize=list)
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

TEXT: <torchtext.data.field.Field object at 0x7f82ddd2de10>


(3097, 59, 1, 1586237)

# LSTM

In [37]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, embeddings_sz, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.rnn = nn.LSTM(embeddings_sz, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, rnn_len):
        bs = rnn_len[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(rnn_len), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [39]:
m = CharSeqStatefulLSTM(len(chars)+1,embeddings_sz, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [40]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [41]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.678402   1.633846  
    1      1.598647   1.569825                                



[array([1.56983])]

In [42]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.45979    1.407439  
    1      1.503157   1.466166                                
    2      1.401743   1.352008                                
    3      1.534357   1.491397                                
    4      1.470248   1.4311                                  
    5      1.395231   1.353864                                
    6      1.347714   1.300119                                
    7      1.515952   1.477014                                
    8      1.504134   1.46758                                 
    9      1.479751   1.436342                                
    10     1.44414    1.4045                                  
    11     1.409773   1.366815                                
    12     1.372315   1.325256                                
    13     1.33591    1.284613                                
    14     1.305532   1.260108                                



[array([1.26011])]

In [43]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.305987   1.255392  
    1      1.303189   1.252042                                
    2      1.303568   1.249549                                
    3      1.304508   1.248046                                
    4      1.298945   1.244094                                
    5      1.287011   1.241299                                
    6      1.29135    1.240419                                
    7      1.297407   1.240417                                
    8      1.289933   1.236835                                
    9      1.288126   1.233664                                
    10     1.282984   1.230674                                
    11     1.285248   1.228273                                
    12     1.28556    1.226379                                
    13     1.278316   1.225414                                
    14     1.279952   1.225072                                
    15     1.281672   

[array([1.17043])]

### Test

In [44]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [45]:
get_next('into a b')

'u'

In [46]:
get_next('a ma')

'r'

In [47]:
get_next('blon')

'e'

In [50]:
# def get_next_n(inp, n):
#     res = inp
#     for i in range(n):
#         c = get_next(inp)
#         res += c
#         inp = inp[1:]+c
#     return res

def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [53]:
print(get_next_n('into a b', 400))

into a bad punch jack himself. xeoj  the words xeoj  how do you will have a best stands. we will be time! playing to people to them. xeoj  jerk gonna havmen.  xeoj  where aftey poop in the farms. amazunce jesus struggles. he proceeded gathers and says, "but, survive?"stedictle. "doctor, "mr. pengle after a sure. the wife fuck in the ended point. xeoj  i said "dad, my wife built as well and a prize and whi


In [54]:
print(get_next_n('blon', 200))

blong, it withother, is a sec xeoj de pilg xeoj dick xeoj and nigged?* xeoj how four and one now."**wife: you took somete; it anonist, and paba and, man: i remus keepular. i can caughtyfe***"yes" 9.he run


In [55]:
print(get_next_n('what do y',400))

what do you have a bad." xeoj  what would play him with my bodyle with my penis bottle of erectued. you sound xeoj  saunce xeoj  what chef nic?"'miss as the popher has unfully gun.  xeoj  this movie- xeoj  teacher riding youx for their awe... if i'll make it chanced. xeoj  you knowing] black. after name dicks xeoj  two teacher radio one.0." xeoj  i met your brother, tomorrow xeoj  girl we need to be fourin


In [58]:
# def get_next_n_jokes(prompt, n):
#     jokes = []
#     c = ''
#     for j in range(n):
#         inp = prompt
#         res = prompt
#         while res[-5:-1]!="xeoj":
#             c = get_next(inp)
#             res += c
#             inp = inp[1:]+c
#         jokes.append(res)
#     return jokes

In [56]:
# for joke in get_next_n_jokes("you hear about the university book store worker",4):
#     print(joke)
#     print("\n")

In [57]:
# for joke in get_next_n_jokes("i hate how you can't s",4):
#     print(joke)
#     print("\n")

In [59]:
# for joke in get_next_n_jokes('why women need legs',4):
#     print(joke)
#     print("\n")