In [1]:
#############################################
#
# Run this after running get_episodes.py
#
#############################################

import numpy as np

from keras.layers import Input, Dense, LSTM, Embedding, Activation, Softmax
from keras.models import Model

from keras.callbacks import ModelCheckpoint, LambdaCallback
from keras.preprocessing.text import Tokenizer

import os

Using TensorFlow backend.


In [2]:
def clean_episode_text(text, header_length=0):
    """
    Clean the text of an episode.

    :param text:
    :param header_length:
    :return:
    """

    # Remove any leading newlines or whitespaces at the beginning and end of the episode text.
    text = text.strip()

    # Insert a space before these punctuation marks so they get treated as a word.
    for c in '.!?),':
        text = text.replace(c, ' ' + c)

    # Treat these two a special cases
    text = text.replace(')', ' )')
    text = text.replace('\n', ' \n ')

    # If the "cleaning" I just did put two or more spaces next to each other, get rid of that.
    while '  ' in text:
        text = text.replace('  ', ' ')

    text = ('episodebeginning ' * header_length) + text + (' episodeend' * header_length)

    return text

In [10]:
def load_data(
    episodes_path='episodes', 
    sequence_length=200
):
    texts = [clean_episode_text(open(os.path.join(episodes_path, f)).read()) for f in os.listdir(episodes_path)]

    tkn = Tokenizer(num_words=50000, filters='"#$%&*+-/:;<=>@[\\]^_`{|}~\t')

    tkn.fit_on_texts(texts)

    seqs = tkn.texts_to_sequences(texts)

    sub_texts = []
    for seq in seqs:
        sub_texts += [
            (seq[i:i+sequence_length], seq[i+sequence_length], [j/len(seq) for j in range(i, i+sequence_length)])
            for i in range(len(seq) - sequence_length)
        ]

    X = np.array([s[0] for s in sub_texts])
    y = np.array([s[1] for s in sub_texts])

    return X, y, tkn

In [11]:
def build_model(vocab_size):
    inputs = Input(shape=(32,))

    x = Embedding(output_dim=128, input_dim=vocab_size, input_length=32)(inputs)
    x = LSTM(128)(x)
    x = Dense(128)(x)
    x = Activation('relu')(x)
    x = Dense(vocab_size)(x)
    predictions = Softmax()(x)

    model = Model(inputs=inputs, outputs=predictions)

    model.compile(optimizer='rmsprop',
                  loss='sparse_categorical_crossentropy')

    return model

In [12]:
def random_sentence(tkn, sentence_len=25):
    arr = np.random.randint(0, vocab_size, size=32)
    result = []

    for i in range(sentence_len):
        next_word = model.predict(arr.reshape(1, -1)).argmax(axis=1)
        result.append(next_word)
        arr = np.insert(arr[1:], 31, next_word, axis=0)

    sentence = tkn.sequences_to_texts(result)
    sentence = ' '.join(sentence)

    for c in '.!?(),':
        sentence = sentence.replace(' ' + c, c)

    sentence = sentence.replace('\n', ' \n ')

    return sentence

In [13]:
X, y, tokenizer = load_data()
vocab_size = len(tokenizer.word_index) + 1

KeyboardInterrupt: 

In [None]:
callbacks = [LambdaCallback(on_epoch_end=lambda epoch, logs: print('\n', random_sentence(tokenizer, 50), '\n'))]

In [None]:
model = build_model(vocab_size)

In [None]:
model.fit(X[:1000], y[:1000],
          epochs=124,
          batch_size=100,
          callbacks=callbacks)

In [None]:
model.save('sabrinai_2.hdf5')

print(random_sentence(tokenizer, 1000))

In [11]:
model.predict(X[3])

ValueError: Error when checking input: expected input_1 to have shape (32,) but got array with shape (1,)

In [None]:
X.sh