In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from numpy.random import seed
seed(1)

import pandas as pd
import numpy as np
import pickle
import string, os

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


In [2]:
def text_to_ids(corpus):
    """Fit the module-level ``tokenizer`` on ``corpus`` and expand it into n-gram id prefixes.

    Each line is converted to its list of word ids, and every leading prefix
    of that list with at least two ids is emitted as a separate sequence.
    Lines that tokenize to fewer than two ids contribute nothing.

    Args:
        corpus: iterable of text lines.

    Returns:
        A tuple ``(corpus_as_ids, total_words)`` where ``corpus_as_ids`` is
        the list of id prefixes and ``total_words`` is the number of entries
        in ``tokenizer.word_index`` plus one.

    Note:
        Mutates the global ``tokenizer`` by calling ``fit_on_texts`` on it.
    """
    tokenizer.fit_on_texts(corpus)
    # One slot more than the number of known words; presumably this leaves
    # id 0 free for the padding value - confirm against the Tokenizer docs.
    vocab_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        ids = tokenizer.texts_to_sequences([text])[0]
        # Prefixes of length 2..len(ids): the last id of each one is the
        # "next word" for the ids that precede it.
        prefixes.extend(ids[:end] for end in range(2, len(ids) + 1))
    return prefixes, vocab_size

In [3]:
# Fresh tokenizer for this run; text_to_ids() fits it in place.
tokenizer = Tokenizer()

# Read the raw transcript one line per list entry. The encoding is pinned so
# the file is decoded the same way regardless of the platform's locale default.
with open('../data/raw/dwight-s1.txt', 'r', encoding='utf-8') as input_d:
    corpus = input_d.readlines()

# X: list of n-gram id prefixes; len_vocab: len(tokenizer.word_index) + 1.
X, len_vocab = text_to_ids(corpus)

In [4]:
# Save the tokenizer (fitted by the text_to_ids() call above) to disk using
# the highest pickle protocol available in this interpreter.
with open('../models/tokenizer_dwight_wordlevel.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
def sequence_padding(tok_corpus):
    """Pad id sequences to a common length and split them into inputs and targets.

    Every sequence is left-padded (``padding='pre'``) to the length of the
    longest one. The final column of the padded matrix becomes the target and
    is one-hot encoded over ``len_vocab`` classes (a module-level global);
    all preceding columns form the model input.

    Args:
        tok_corpus: list of integer id sequences.

    Returns:
        A tuple ``(predictors, label, max_len)``: the input matrix, the
        one-hot target matrix, and the padded sequence length.
    """
    max_len = max(len(seq) for seq in tok_corpus)
    padded = np.array(pad_sequences(tok_corpus, maxlen=max_len, padding='pre'))

    # Last column -> next-word target; everything before it -> model input.
    predictors = padded[:, :-1]
    label = ku.to_categorical(padded[:, -1], num_classes=len_vocab)
    return predictors, label, max_len

# Turn the n-gram id sequences in X into the padded input matrix, the one-hot
# next-word targets, and the padded sequence length.
predictors, label, max_len = sequence_padding(X)

In [12]:
def create_model(max_sequence_len, total_words,
                 embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: length of the padded sequences including the target
            token; the network input is one shorter.
        total_words: vocabulary size, used both as the embedding input
            dimension and as the number of output classes.
        embedding_dim: width of the word embedding vectors (default 10).
        lstm_units: number of units in the LSTM layer (default 100).
        dropout_rate: dropout rate applied to the LSTM output (default 0.1).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The last token of every padded sequence is the label, not an input.
    input_len = max_sequence_len - 1
    model = Sequential()

    # Input embedding layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Hidden layer: LSTM followed by dropout
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Output layer: one softmax unit per vocabulary entry
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network for the padded sequence length and vocabulary size
# computed in the earlier cells, then print its layer-by-layer summary.
model = create_model(max_len, len_vocab)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 116, 10)           29230     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2923)              295223    
Total params: 368,853
Trainable params: 368,853
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train for 10 epochs on all (prefix, next-word) pairs; no validation data or
# callbacks are passed to fit().
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x138a346d8>

In [6]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by ``next_words`` greedily predicted words.

    On each step the current text is converted to ids with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1``, and fed to
    ``model``; the highest-probability class is mapped back to a word and
    appended after a single space.

    Args:
        seed_text: starting text.
        next_words: number of words to append.
        model: trained model whose ``predict`` returns one row of class
            probabilities per input row.
        max_sequence_len: padded sequence length used at training time
            (including the target token).

    Returns:
        ``seed_text`` followed by the generated words. If a predicted class
        has no entry in ``tokenizer.word_index``, an empty word is appended
        (leaving just the separating space), as in the original lookup.
    """
    # Build the id -> word table once instead of scanning word_index for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        tokens = tokenizer.texts_to_sequences([seed_text])
        tokens = pad_sequences(tokens, maxlen=max_sequence_len - 1, padding='pre')

        # argmax over predict() replaces Sequential.predict_classes, which is
        # deprecated and absent from newer Keras/TensorFlow releases.
        probabilities = model.predict(tokens, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        seed_text += " " + index_to_word.get(predicted, "")
    return seed_text