In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Funnynet

## A neural network that makes jokes

Special thanks to taivop for providing the [dataset](https://github.com/taivop/joke-dataset).

This notebook is heavily inspired by [fastai NLP work](https://github.com/fastai/fastai/blob/master/courses/dl2/imdb.ipynb).

In [2]:
import pdb
import json
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import math, random

In [3]:
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag
EOJ = 'xeoj'  # end of joke tag

PATH=Path('data')

In [4]:
files = list(PATH.iterdir())
print(files)

[PosixPath('data/reddit_jokes.json'), PosixPath('data/stupidstuff.json')]


In [5]:
for fname in files:
    if "eddit" in str(fname):
        reddit_dataset = str(fname)
    if "upid" in str(fname):
        stupid_dataset = str(fname)
reddit_jokes = json.load(open(reddit_dataset))
stupid_jokes = json.load(open(stupid_dataset))

In [6]:
len(reddit_jokes)

194553

In [7]:
reddit_jokes[0]

{'body': 'Now I have to say "Leroy can you please paint the fence?"',
 'id': '5tz52q',
 'score': 1,
 'title': 'I hate how you cant even say black paint anymore'}

Let's discard all the jokes that have 0 score, as they aren't that helpful for training

In [8]:
rated_jokes = [joke for joke in reddit_jokes if joke['score'] > 0]

In [9]:
len(rated_jokes)

132992

In [11]:
scores = [joke['score'] for joke in rated_jokes]
np.mean(scores),np.max(scores)

(172.94791416025024, 48526)

In [13]:
# low_scores = [score for score in scores]
# plt.xscale('log', nonposx='clip')
# plt.ylim(ymax=200000)
# plt.axes.set_ylim([0,200000])
# plt.hist(low_scores, bins=100);

In [14]:
title_body = [joke['title']+' '+joke['body'] for joke in rated_jokes]
title_body[0]

'I hate how you cant even say black paint anymore Now I have to say "Leroy can you please paint the fence?"'

Horrible, but formatted correctly. Now, let's combine all the jokes into one long string, using the `EOJ` tag.

In [16]:
text = ''
for joke in title_body:
    text = text + ' ' + joke + ' ' + EOJ + ' '
    if len(text) > 2700000:
        break

In [18]:
len(text)
# print(text[1000:1100])

2700071

In [22]:
chars = sorted(list(set(text)))
vocab_size = len(chars)+1
print('total chars:', vocab_size)
chars.insert(0, "\0")
print(chars)

total chars: 164
['\x00', '\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '~', '\x9d', '\xa0', '¬¢', '¬£', '¬´', '¬∞', '¬¥', '¬ª', '√ë', '√ì', '√ó', '√†', '√§', '√®', '√©', '√´', '√≠', '√±', '√≥', ' ñ', 'Õú', 'Õ°', 'Œ£', 'Œº', 'œÄ', '\u2009', '\u200b', '\u200f', '‚Äì', '‚Äî', '‚Äò', '‚Äô', '‚Äú', '‚Äù', '‚Ä¢', '‚Ä¶', '\u2028', '‚Ä≤', '‚Ä≥', '‚ÄΩ', '‚Ç¨', '‚àÜ', '‚àö', '‚à´', '‚òù', '‚ô´', '‚ôª', '‰∏ª', '‰∫∫', 'Êñã', 'Êûó', 'ÊµÆ', 'ÁôΩ', 'Á¨ë', '\ufeff', 'üá©', 'üá∞', 'üòÅ', 'üòÇ', 'üòà', 'üòä', 'üòé', 'üòú', 'üò®', 'üò≥', 'üôÑ', 'ü§£']


It appears there are Emojis in the dataset. 

In [23]:
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = {i: c for i, c in enumerate(chars)}

In [24]:
idx = [char_indices[c] for c in text]

idx[:10]

[4, 45, 4, 76, 69, 88, 73, 4, 76, 83]

In [25]:
''.join(indices_char[i] for i in idx[:70])

' I hate how you cant even say black paint anymore Now I have to say "L'

# Setup, creating .txt files to train LSTM on

Partitioning the dataset and writing it to /data/trn/trn.txt and /data/val/val.txt

In [31]:
DANIEL_PATH = Path('data/')
(DANIEL_PATH/'val').mkdir(exist_ok=True)
(DANIEL_PATH/'trn').mkdir(exist_ok=True)

In [32]:
list(DANIEL_PATH.iterdir())

[PosixPath('data/reddit_jokes.json'),
 PosixPath('data/val'),
 PosixPath('data/stupidstuff.json'),
 PosixPath('data/trn')]

In [34]:
trn = open(TRN+"trn.txt","wb")
trn.write(text[:len(text)*4//5].encode('utf-8'))#str(idx[0:int(len(idx)*2/3)]))
trn.close()

val = open(VAL+"val.txt","wb")
val.write(text[len(text)*4//5:].encode('utf-8'))#str(idx[int(len(idx)*2/3):len(idx)-1]))
val.close()

In [41]:
PATH='data/'

TRN_PATH = 'trn/'
VAL_PATH = 'val/'
TRN = f'{PATH}{TRN_PATH}'
VAL = f'{PATH}{VAL_PATH}'

%ls {PATH}

reddit_jokes.json  stupidstuff.json  [0m[01;34mtrn[0m/  [01;34mval[0m/


Looks like this next part assumes we have already partitioned the data into trn/trn.txt and val/val.txt for training and validation sets, respectively.

In [42]:
TEXT = data.Field(lower=True, tokenize=list)
print("TEXT: "+str(TEXT))
bs=64; bptt=8; n_fac=42; n_hidden=256

FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=bs, bptt=bptt, min_freq=3)

len(md.trn_dl), md.nt, len(md.trn_ds), len(md.trn_ds[0].text)

TEXT: <torchtext.data.field.Field object at 0x7f4cfc486780>


(4178, 92, 1, 2139781)

### LSTM
Now we will try an LSTM

In [39]:
from torchtext import vocab, data

from fastai.nlp import *
from fastai.lm_rnn import *
from fastai import sgdr
n_hidden = 256
embeddings_sz = 42 # size of embeddings matrix

In [37]:
class CharSeqStatefulLSTM(nn.Module):
    def __init__(self, vocab_size, embeddings_sz, bs, nl):
        super().__init__()
        self.vocab_size,self.nl = vocab_size,nl
        self.e = nn.Embedding(vocab_size, embeddings_sz)
        self.rnn = nn.LSTM(embeddings_sz, n_hidden, nl, dropout=0.5)
        self.l_out = nn.Linear(n_hidden, vocab_size)
        self.init_hidden(bs)
        
    def forward(self, rnn_len):
        bs = rnn_len[0].size(0)
        if self.h[0].size(1) != bs: self.init_hidden(bs)
        outp,h = self.rnn(self.e(rnn_len), self.h)
        self.h = repackage_var(h)
        return F.log_softmax(self.l_out(outp), dim=-1).view(-1, self.vocab_size)
    
    def init_hidden(self, bs):
        self.h = (V(torch.zeros(self.nl, bs, n_hidden)),
                  V(torch.zeros(self.nl, bs, n_hidden)))

In [40]:
m = CharSeqStatefulLSTM(len(chars)+1,embeddings_sz, 512, 2).cuda()
lo = LayerOptimizer(optim.Adam, m, 1e-2, 1e-5)

In [43]:
os.makedirs(f'{PATH}models', exist_ok=True)

In [44]:
fit(m, md, 2, lo.opt, F.nll_loss)

HBox(children=(IntProgress(value=0, description='Epoch', max=2), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.714574   1.664388  
    1      1.646911   1.612117                                



[array([1.61212])]

## He trains for 2^6 epochs... I'm not waiting that long

In [49]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**4-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=15), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.521971   1.465887  
    1      1.563488   1.512391                                
    2      1.459124   1.416796                                
    3      1.582633   1.527102                                
    4      1.533194   1.487429                                
    5      1.457572   1.416012                                
    6      1.40332    1.370478                                
    7      1.558284   1.513639                                
    8      1.561605   1.522661                                
    9      1.538751   1.490585                                
    10     1.501419   1.458146                                
    11     1.469132   1.422798                                
    12     1.439545   1.389487                                
    13     1.387424   1.354124                                
    14     1.366881   1.331787                                



[array([1.33179])]

In [50]:
on_end = lambda sched, cycle: save_model(m, f'{PATH}models/cyc_{cycle}')
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]
fit(m, md, 2**6-1, lo.opt, F.nll_loss, callbacks=cb)

HBox(children=(IntProgress(value=0, description='Epoch', max=63), HTML(value='')))

epoch      trn_loss   val_loss                                
    0      1.354918   1.3269    
    1      1.352675   1.323687                                
    2      1.347158   1.320351                                
    3      1.350975   1.319187                                
    4      1.348225   1.314779                                
    5      1.346236   1.3113                                  
    6      1.33694    1.309875                                
    7      1.339971   1.311275                                
    8      1.342709   1.307846                                
    9      1.331497   1.304511                                
    10     1.325322   1.300965                                
    11     1.322134   1.298711                                
    12     1.323038   1.296354                                
    13     1.320237   1.295142                                
    14     1.326954   1.294583                                
    15     1.314131   

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    45     1.271087   1.265574                                
    46     1.266298   1.264458                                
    47     1.264962   1.263703                                
    48     1.266599   1.263053                                
    49     1.270917   1.261857                                
    50     1.265776   1.261233                                
    51     1.263597   1.260589                                
    52     1.260333   1.259923                                
    53     1.266673   1.258903                                
    56     1.259735   1.25756                                 
 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 1998/4178 [00:21<00:23, 94.08it/s, loss=1.24]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    58     1.261125   1.256967                                
 18%|‚ñà‚ñä        | 747/4178 [00:08<00:36, 92.91it/s, loss=1.26]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 91%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 3791/4178 [00:42<00:04, 89.62it/s, loss=1.26]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



    61     1.258265   1.256377                                
 63%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé   | 2613/4178 [00:29<00:17, 90.01it/s, loss=1.24]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### Test

In [51]:
def get_next(inp):
    idxs = TEXT.numericalize(inp)
    p = m(VV(idxs.transpose(0,1)))
    r = torch.multinomial(p[-1].exp(), 1)
    return TEXT.vocab.itos[to_np(r)[0]]

In [52]:
get_next('into a b')

'a'

In [53]:
get_next('a ma')

'n'

In [54]:
get_next('blon')

'g'

In [55]:
def get_next_n(inp, n):
    res = inp
    for i in range(n):
        c = get_next(inp)
        res += c
        inp = inp[1:]+c
    return res

In [56]:
print(get_next_n('into a b', 400))

into a bar  cut) xeoj  differencerokeigato cats? coffee is my best said, "jesus clear, i find my one‚Äôs grandma that goes togetherics.little perfects murdering, he throwing hubby? there is dear instruction you pape of weaking a city? *nine timestoralie. xeoj  what wat the parastan't walked into a bathroom and asked what nates the world."the side, plants from shopped out the couple of street. the greve. he 


In [None]:
print(get_next_n('blon', 200))

In [57]:
print(get_next_n('what do y',400))

what do you think how i fuck us " after god doing jack hole trump are hitting us. say, and to look down!" xeoj  what they're fadio ckas feel again and my friend for a 25, 4,52949 silent, the opportunity enough to figure of any fute. xeoj  people because for jesus. xeoj  i want to get him back. the third has walky from her man accident, none that was in his calm.last woman looking at a nip." the man is begi


In [58]:
def get_next_n_jokes(prompt, n):
    jokes = []
    c = ''
    for j in range(n):
        inp = prompt
        res = prompt
        while res[-5:-1]!="xeoj":
            c = get_next(inp)
            res += c
            inp = inp[1:]+c
        jokes.append(res)
    return jokes

In [59]:
for joke in get_next_n_jokes("you hear about the university book store worker",4):
    print(joke)
    print("\n")

IndexError: list index out of range

In [61]:
for joke in get_next_n_jokes("i hate how you can't s",4):
    print(joke)
    print("\n")

i hate how you can't stop for 7 years. he flings his dibty-out to sitting at a bell and asked the faster ""we started carrying and utteringal at a heary at this first "could you refresh?" the man comes back to his bought and scared as it's only day, "i'm walking on by because. the battle runs farted to the entire lives of arm off a body where he refugeed to be from his chair woman as heaven, but she sat up was water.one of it and he finds the ownerhip showing up. this is an old man cated i but start coasination xeoj 


i hate how you can't st.  "hey, so no, are an expensive breaked the temperature to left, he makes the lead, and the swimming rules in a back of holds. and the inside to great as and old : i just make it off her healfuses. in the wines were logged by a good shop-across and thought more really englishman, thought he heard enough trishung. she giving the hope to be fruit as she was a new debuted pretty drinking and puts it.but they can get it on easy! xeoj 


i hate how you

In [60]:
for joke in get_next_n_jokes('why women need legs',4):
    print(joke)
    print("\n")

why women need legs. xeoj 


why women need legse fishing one of his back and starts sboft ingurily.from bay. "you're everyone were cartain. xeoj 


why women need legsy arms watting the attraction of the guy gets not try. **apain skin on eyem! xeoj 


why women need legs for a few million loop, and this were holding its time. replied, ‚Äú'jae, so it's jewish tases xeoj 


