1) Load reddit_jokes.json and clean it up.
All reddit jokes have a structure "title" + "body". The script combines title with body to produce a continuous string with no interruption. It is used once to clean the input dataset.

In [7]:
import json

jokes = []
with open("dataset/reddit_jokes.json", "r") as read_file:
    data = json.load(read_file)
    for i in range(len(data)):
        jokes.append({ "body": data[i]['title'] + '\n' + data[i]['body'], 'score': data[i]['score'], 'id': data[i]['id']})

with open("dataset/reddit_fixed.json", "w") as f:
        json.dump(jokes, f, indent=4, sort_keys=True)

In [8]:
print(jokes[0])

{'body': 'I hate how you cant even say black paint anymore\nNow I have to say "Leroy can you please paint the fence?"', 'score': 1, 'id': '5tz52q'}


In [9]:
print(jokes[0]['body'])

I hate how you cant even say black paint anymore
Now I have to say "Leroy can you please paint the fence?"


Simple function to load the input json and transform it into array of str

In [32]:
def load_input(filename, body_name):
    f_in = open("dataset/" + filename, "r")
    data = json.load(f_in)
    out = []
    for i in range(len(data)):
        out.append(data[i][body_name])
    return out

In [38]:
reddit_jokes = load_input("reddit_fixed.json", "body")
print(reddit_jokes[10])

Remember when you were a kid and when you cried your parents said, "I'll give you a reason to cry"?
I always thought they were gunna hit me, not that they were going to destroy the housing market 20 years later.


Clean the input and save it for later in a text file, so that each joke is on one line only

In [41]:
import string

def clean_input(data, name):
    token_data = []
    token_seqs = list()
    for i in range(len(data)):
        tokens = data[i].split()
        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]
        tokens = [word for word in tokens if word.isalpha()]
        tokens = [word.lower() for word in tokens]
        token_data.append(tokens)
        token_seqs.append(' '.join(tokens))
    
    filename = name + "_sequences.txt"
    out_file = open(filename, "w")
    out_file.write('\n'.join(token_seqs))
    out_file.close()
    print("Wrote clean input to " + filename)
    return token_data

In [43]:
clean = clean_input(reddit_jokes, "reddit")
print(clean[10])

Wrote clean input to reddit_sequences.txt
['remember', 'when', 'you', 'were', 'a', 'kid', 'and', 'when', 'you', 'cried', 'your', 'parents', 'said', 'ill', 'give', 'you', 'a', 'reason', 'to', 'cry', 'i', 'always', 'thought', 'they', 'were', 'gunna', 'hit', 'me', 'not', 'that', 'they', 'were', 'going', 'to', 'destroy', 'the', 'housing', 'market', 'years', 'later']


In [46]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

In [130]:
doc = load_doc("reddit_sequences.txt")
lines = doc.split('\n')[:100]
print(lines[10])

remember when you were a kid and when you cried your parents said ill give you a reason to cry i always thought they were gunna hit me not that they were going to destroy the housing market years later


In [68]:
from numpy import array
import numpy as np
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [131]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [154]:
def pad_to_size(sequences):
    lens = [len(seq) for seq in sequences]
    maxlen = max(lens)
    for i in range(len(sequences)):
        zeros = np.zeros(maxlen - len(sequences[i]))
        sequences[i] = np.concatenate((sequences[i], zeros))
        sequences[i] = np.asarray(sequences[i]).astype('float32')
    return sequences

In [173]:
# separate into input and output
sequences = array(sequences)
sequences = pad_to_size(sequences)
X, y = sequences[:][:-1], sequences[:][-1]
y = to_categorical(y, num_classes=vocab_size)
X = pad_to_size(X)

In [177]:
X = np.stack(X)
y = np.stack(y)

In [178]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
X=np.asarray(X).astype(np.int32)
y=np.asarray(y).astype(np.float32)
model.fit(X, y, batch_size=128, epochs=10)

# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, None, 50)          63250     
_________________________________________________________________
lstm_40 (LSTM)               (None, None, 100)         60400     
_________________________________________________________________
lstm_41 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_40 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_41 (Dense)             (None, 1265)              127765    
Total params: 341,915
Trainable params: 341,915
Non-trainable params: 0
_________________________________________________________________
None


ValueError: ignored

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)
 
# load cleaned text sequences
in_filename = 'republic_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
 
# load the model
model = load_model('model.h5')
 
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')
 
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)