In [4]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.preprocessing import sequence
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.layers import recurrent
from keras.utils.data_utils import get_file
import spacy, seaborn, sklearn, json, pandas as pd, numpy as np, matplotlib
import random
import re
import sys
nlp = spacy.load('en')

Using TensorFlow backend.


In [5]:
import csv
import numpy as np

def read_data(fname):
    with open(fname, 'r') as f:
        text, y = [], []
        reader = csv.reader(f, delimiter=',', quotechar='"')
        for row in reader:
            y.append(int(row[0]) - 1)
            text.append(row[2])
    return text, np.ravel(y)


class SymbolTable:
    """Wrapper for dict to encode unknown symbols"""

    def __init__(self, starting_symbol=2, unknown_symbol=1): 
        self.s       = starting_symbol
        self.unknown = unknown_symbol
        self.d       = dict()

    def lookup_add(self, w):
        if w not in self.d:
            self.d[w] = self.s
            self.s += 1
        return self.d[w]

    def lookup(self, w, strict=False):
        return self.d[w] if strict else self.d.get(w, self.unknown)

    def reverse(self):
        r = {v: k for k, v in self.d.iteritems()}
        r[0], r[1] = '~~NONE~~', '~~UNKNOWN~~'
        return r

    def num_words(self):
        return len(self.d)

    def num_symbols(self):
        return self.s

In [6]:
def vectorize(data, word_idx, title_maxlen, body_maxlen):
    ts = []
    bs = []
    for title, body in data:
        t = [word_idx[w] for w in title]
        b = [word_idx[w] for w in body]
        ts.append(t)
        bs.append(b)
    return pad_sequences(ts, maxlen=title_maxlen), pad_sequences(bs, maxlen=body_maxlen)

In [7]:
#read data and preprocess, concatenate titles and bodies
with open('joke-dataset/reddit_jokes.json', 'r') as f:
    joke_txt = json.load(f)
train = []
for joke in joke_txt:
    if(joke['score'] > 5 and len(joke['body']) < 100):
        title = [x.lower() for x in re.findall(r"[\w']+|[.,!?;]", joke['title'])]
        body = [x.lower() for x in re.findall(r"[\w']+|[.,!?;]", joke['body'])]
        train.append(title + body)

In [9]:
#creates the vocabulary and vocab-index/index-vocab dicts
most_common = {}
with open("count_1w.txt") as f:
    for line in f:
        (key, val) = line.split()
        most_common[str(key)] = val
        
vocab = set()
for word in train:
    vocab.update(word)
vocab = sorted(vocab)

for tup in enumerate(vocab):
    if(most_common.get(tup[1]) == None):
        vocab[tup[0]] = '~~unknown~~'
vocab = set(vocab)
vocab_indices = dict((w, i) for i, w in enumerate(vocab))
indices_vocab = dict((i, w) for i, w in enumerate(vocab))

In [11]:
max_len = 5
step = 2
X = []
Y = []
for joke in train:
    for i in range(0, len(joke) - max_len, step):
        seq_in = joke[i:i + max_len]
        seq_out = joke[i + max_len]
        X.append(seq_in)
        Y.append(seq_out)
n_patterns = len(X)

In [12]:
len(X)

339944

In [13]:
X[1]

['the', 'king', 'of', 'all', 'school']

In [14]:
Y[1]

'supplies'

In [15]:
start = np.random.randint(0, len(train)-1)
pattern = train[start]
pattern

['i',
 'had',
 'an',
 'epiphany',
 'while',
 'in',
 'the',
 'strip',
 'club',
 '.',
 '.',
 '.',
 '.',
 '.',
 '.',
 'but',
 'i',
 "don't",
 'think',
 'that',
 'was',
 'her',
 'real',
 'name',
 '.']

In [16]:
for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = np.divide(x,len(vocab))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = indices_vocab[index]
    seq_in = [indices_vocab[value] for value in pattern]
    print(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

TypeError: ufunc 'true_divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [11]:
# Convert text to integer symbols
symbol_table = SymbolTable()

def preprocess_text(parsed_text, symbol_table, init=True):
    mapper = symbol_table.lookup_add if init else symbol_table.lookup
    return [[mapper(w.lower()) for w in s] for s in parsed_text]

In [12]:
symbols_train = preprocess_text(train, symbol_table, True)

In [15]:
symbols_train[0]

[2, 3, 4, 5, 6, 7, 8, 9, 10, 4, 11]

In [22]:
MAX_LENGTH = 50
x_train = sequence.pad_sequences(symbols_train, maxlen=MAX_LENGTH)

In [23]:
d = 100
model = Sequential()
model.add(Embedding(symbol_table.num_symbols(), output_dim=d))
model.add(LSTM(d))
model.add(Dense(4, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer=Adam(lr=0.0005),
              metrics=['accuracy'])

In [None]:
#Keras LSTM
maxlen = 20
step = 4
sentences = []
next_word = []
for i in range(0, len(train) - maxlen, step):
    sentences.append(train[i: i + maxlen])
    next_word.append(train[i + maxlen])
print('nb sequences:', len(sentences))

In [None]:
#Keras LSTM
print('Vectorization...')
X = np.zeros((len(sentences)), int(maxlen), len(vocab), dtype=np.bool)
y = np.zeros((len(sentences), len(vocab)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, word in enumerate(sentence):
        X[i, t, vocab_indices[word]] = 1
    y[i, vocab_indices[next_word[i]]] = 1

In [None]:
vocab_size = len(vocab) + 1
word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
title_maxlen = max(map(len, (x for x, _ in train)))
body_maxlen = max(map(len, (x for _, x in train)))

In [None]:
x, xq = vectorize(train, word_idx, title_maxlen, body_maxlen)

In [None]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys

In [None]:
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
text = open(path).read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))
print(text[1])

In [None]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

In [None]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

In [None]:
# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [None]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y,
              batch_size=128,
              epochs=1)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentenc\
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()