In [13]:
import tensorflow as tf
import string
import numpy as np
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences

In [14]:
# Fetch the Shakespeare corpus and keep the local path that get_file returns.
path_to_file = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [15]:
# Load the whole corpus into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open(path_to_file, encoding='utf-8') as f:
    text = f.read()

In [16]:
def get_vocabulary(text: str):
    """Split ``text`` into cleaned word tokens.

    The text is lower-cased and split on whitespace, ASCII punctuation
    (``string.punctuation``) is removed from every piece, and only pieces
    that are purely alphabetic afterwards are kept.

    Returns:
        A list of tokens in their original order. Repeated words are kept,
        so this is the token stream rather than a de-duplicated vocabulary.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = (piece.translate(strip_punctuation) for piece in text.lower().split())
    # isalpha() is False for '' as well, so pieces that were only punctuation
    # vanish here together with anything containing digits.
    return [token for token in cleaned if token.isalpha()]

In [17]:
# Turn the raw corpus into a flat list of lower-case, alphabetic-only word tokens.
tokens = get_vocabulary(text)

In [18]:
# Window size in tokens: 50 context words plus the 1 word that follows them.
length = 50 + 1
lines = []

# Slide a window of `length` tokens over the corpus, one token at a time, and
# store each window as a space-separated string.
# The bound is len(tokens) + 1 so that the last window (the one ending on the
# final token) is produced too; stopping at len(tokens) silently drops it and
# yields nothing at all when the corpus has exactly `length` tokens.
for i in range(length, len(tokens) + 1):
    sequence = tokens[i-length:i]
    line = ' '.join(sequence)
    lines.append(line)

<b>Prepare Dataset</b>

In [19]:
# Learn the word -> integer index mapping from the windowed text lines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# Encode every line as a list of word indices and stack the lists into one array.
# NOTE(review): the column slicing applied to `sequences` afterwards needs a 2-D
# array, i.e. every line must encode to the same number of indices — confirm.
sequences = np.array(tokenizer.texts_to_sequences(lines))

In [20]:
# Number of distinct words known to the tokenizer, plus one.
# NOTE(review): the +1 presumably accounts for index 0, which the Keras
# Tokenizer does not assign to any word — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index) + 1

In [21]:
# Inputs: every column of each encoded window except the last one.
x = sequences[:, :-1]
# Targets: the last column, one-hot encoded over vocab_size classes.
# NOTE(review): this is a dense (len(sequences), vocab_size) array and may be
# very large for a big vocabulary — check memory use.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

<b>Build LSTM Model</b>

In [27]:
# Next-word classifier, assembled layer by layer.
model = Sequential()
# Map each word index to a 50-dimensional vector; one input per column of x.
model.add(Embedding(vocab_size, 50, input_length=x.shape[1]))
# The first LSTM returns its full output sequence so the second LSTM can consume it.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
# One softmax output per class in vocab_size.
model.add(Dense(vocab_size, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train on (x, y) for 2 epochs and keep the object returned by fit() in `history`.
history = model.fit(x, y, epochs=2)

In [29]:
def generate_text_sequence(model, tokenizer, text_seq_length, seed_text, n_words):
    """Generate ``n_words`` words by repeatedly predicting the next word.

    Each step encodes the running text with ``tokenizer``, pads or truncates
    it to ``text_seq_length`` indices, asks ``model`` for class scores and
    appends the highest-scoring word to the running text.

    Args:
        model: object whose ``predict(batch, verbose=0)`` returns one row of
            class scores per input row.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        text_seq_length: number of word indices fed to the model per step.
        seed_text: text the generation starts from.
        n_words: number of words to generate.

    Returns:
        The generated words joined by single spaces; the seed text itself is
        not included. A predicted index that has no entry in ``word_index``
        contributes an empty string.
    """
    # Invert the vocabulary once instead of scanning word_index for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    generated = []
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        # truncating='pre' drops the oldest indices when the text is too long.
        encoded = pad_sequences([encoded], maxlen=text_seq_length, truncating='pre')
        # Sequential.predict_classes was removed in TensorFlow 2.6; argmax over
        # predict() gives the same class index for a softmax output.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        predicted_word = index_to_word.get(predicted_index, '')
        seed_text += ' ' + predicted_word
        generated.append(predicted_word)
    return ' '.join(generated)