In [59]:
import json
import string

def json_to_txt(json_file):
  lines = []
  with open(json_file, "r") as read_file:
    data = json.load(read_file)
    for i in range(len(data)):
        line = (data[i]['title'] + ' ' + data[i]['body']).replace("\n", " ")
        lines.append(line)
  new_file = "cleaned_" + json_file + "_.txt"
  with open(new_file, "w") as write_file:
    for line in lines:
      write_file.write(line + "\t")
    write_file.close()
  return new_file

In [60]:
import random

def line_filter(txt_file, max_len=50, sample_size=10000):
  new_lines = []
  with open(txt_file, "r") as read_file:
    lines = read_file.read().split("\t")
    new_lines = ["<sos> " + line.lower() + " <eos>" for line in lines if len(line.split(' ')) <= max_len ]
  return random.sample(new_lines, min(sample_size, len(new_lines)))

In [61]:
# turn a doc into clean tokens
def clean_doc(doc):
	# replace '--' with a space ' '
	doc = doc.replace('--', ' ')
	# split into tokens by white space
	tokens = doc.split()
	# remove punctuation from each token
	table = str.maketrans('', '', string.punctuation)
	tokens = [w.translate(table) for w in tokens]
	# remove remaining tokens that are not alphabetic
	tokens = [word for word in tokens if word.isalpha()]
	# make lower case
	tokens = [word.lower() for word in tokens]
	return tokens

In [62]:
filename = json_to_txt("reddit_jokes.json")

In [63]:
jokes = line_filter(filename)
print(jokes[10])

<sos> i lost two things today... job in a morgue   . . . . . . . . . . .         and virginity <eos>


In [64]:
print(len(jokes))

10000


In [65]:
tokens = clean_doc('\n'.join(jokes))
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['sos', 'my', 'exgirlfriend', 'bent', 'my', 'cars', 'hoodbut', 'it', 'was', 'my', 'fault', 'a', 'little', 'bit', 'too', 'i', 'was', 'just', 'a', 'little', 'bit', 'over', 'the', 'legal', 'speed', 'eos', 'sos', 'company', 'suspends', 'sponsorship', 'deal', 'with', 'sharapova', 'after', 'she', 'failed', 'drug', 'test', 'to', 'protect', 'volkswagens', 'excellent', 'brand', 'image', 'they', 'cant', 'affiliate', 'with', 'people', 'who', 'might', 'have', 'cheated', 'eos', 'sos', 'they', 'say', 'out', 'of', 'people', 'text', 'and', 'drive', 'not', 'me', 'i', 'watch', 'youtube', 'videos', 'eos', 'sos', 'pinocchio', 'how', 'did', 'pinocchio', 'find', 'out', 'he', 'had', 'a', 'wooden', 'dick', 'his', 'hand', 'caught', 'on', 'fire', 'eos', 'sos', 'i', 'came', 'up', 'with', 'a', 'suspenseful', 'joke', 'about', 'cheese', 'queso', 'here', 'it', 'goes', 'eos', 'sos', 'whats', 'the', 'best', 'part', 'about', 'twentythree', 'year', 'olds', 'theres', 'twenty', 'of', 'them', 'eos', 'sos', 'what', 'happens

In [66]:
# organize into sequences of tokens
length = 10 + 1
sequences = list()
for i in range(length, len(tokens)):
	# select sequence of tokens
	seq = tokens[i-length:i]
	# convert into a line
	line = ' '.join(seq)
	# store
	sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 193843


In [67]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
	data = '\n'.join(lines)
	file = open(filename, 'w')
	file.write(data)
	file.close()

In [68]:
# save sequences to file
out_filename = 'joke_sequences.txt'
save_doc(sequences, out_filename)

In [69]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# load
in_filename = 'joke_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [70]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

In [71]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [72]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [73]:
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [74]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 50)            735350    
_________________________________________________________________
lstm_2 (LSTM)                (None, 10, 100)           60400     
_________________________________________________________________
lstm_3 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 14707)             1485407   
Total params: 2,371,657
Trainable params: 2,371,657
Non-trainable params: 0
_________________________________________________________________
None


In [76]:
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f1b9be06a10>

In [77]:
# save the model to file
model.save('model.h5')
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [78]:
# load doc into memory
def load_doc(filename):
	# open the file as read only
	file = open(filename, 'r')
	# read all text
	text = file.read()
	# close the file
	file.close()
	return text
 
# load cleaned text sequences
in_filename = 'joke_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [79]:
seq_length = len(lines[0].split()) - 1

In [80]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

In [81]:
# load the model
model = load_model('model.h5')

In [82]:
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))

In [195]:
# select a seed text
#seed_text = lines[randint(0,len(lines))]
seed_text = "sos i like my coffee like i like my women"
print(seed_text + '\n')

sos i like my coffee like i like my women



In [196]:
encoded = tokenizer.texts_to_sequences([seed_text])[0]

In [197]:
# predict probabilities for each word
yhat = model.predict_classes(encoded, verbose=0)



In [198]:
out_word = ''
for word, index in tokenizer.word_index.items():
	if index == yhat.any():
		out_word = word
		break

In [199]:
encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')

In [200]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		# truncate sequences to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		# predict probabilities for each word
		yhat = model.predict_classes(encoded, verbose=0)
		# map predicted word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == yhat:
				out_word = word
				break
		# append to input
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

In [201]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)



ground up the door hol the face eos sos what did the leper say to the prostitute who flipped him off eos sos what do you call a group of elephants a cash a furtographer eos sos did you hear about the guy who went to his daughter hair he


In [202]:
print(seed_text)
print(generated)

sos i like my coffee like i like my women
ground up the door hol the face eos sos what did the leper say to the prostitute who flipped him off eos sos what do you call a group of elephants a cash a furtographer eos sos did you hear about the guy who went to his daughter hair he


In [203]:
from google.colab import files
files.download('model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>