In [9]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducability
from numpy.random import seed
seed(1)

import sentencepiece as spm
import pandas as pd
import numpy as np
import string, os

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [10]:
sp = spm.SentencePieceProcessor()
sp.load('m-2000.model')

True

In [18]:
def text_to_ids(processor, corpus):
    total_words = sp.get_piece_size()
    
    corpus_as_ids = []
    for line in corpus:
        subtokens = sp.encode_as_ids(line)
        for i in range(1, len(subtokens)):
            n_gram_sequence = subtokens[:i+1]
            corpus_as_ids.append(n_gram_sequence)
    return corpus_as_ids, total_words

In [19]:
with open('../data/raw/dwight-s1.txt', 'r') as input_d:
    corpus = input_d.readlines()

X, len_vocab = text_to_ids(sp, corpus)

In [61]:
def sequence_padding(tok_corpus):
    max_len = max([len(x) for x in tok_corpus])
    tok_corpus = np.array(pad_sequences(tok_corpus, maxlen=max_len, padding='pre'))
    
    predictors, label = tok_corpus[:,:-1], tok_corpus[:,-1]
    label = ku.to_categorical(label, num_classes=len_vocab)
    return predictors, label, max_len

predictors, label, max_len = sequence_padding(X)

In [65]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_len, len_vocab)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 196, 10)           20000     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               44400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 2000)              202000    
Total params: 266,400
Trainable params: 266,400
Non-trainable params: 0
_________________________________________________________________


In [67]:
model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x139e6fba8>

In [80]:
def generate_text(processor, seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        subtokens = sp.encode_as_ids(seed_text)
        subtokens = pad_sequences([subtokens], maxlen=max_sequence_len-1, padding='pre')
        
        predicted = model.predict_classes(subtokens, verbose=0)
        
        output_word = processor.decode_ids([int(predicted)])
        
        seed_text += " " + output_word
    return seed_text.title()

In [123]:
print(generate_text(sp, 'what', 25, model, max_len))

What You ' Rote                      
