https://www.kaggle.com/shivamb/beginners-guide-to-text-generation-using-lstms

In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
# NOTE(review): only NumPy's global RNG is seeded here; the TensorFlow seed call
# below is commented out, so training runs may still differ between executions.
# from tensorflow import set_random_seed
from numpy.random import seed
# set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# Read the ATF file and split it into individual lines.
with open('lh.atf') as f:
    lines = f.read().split('\n')
# Section boundaries: the laws run from the '@law 1' marker line up to (but not
# including) the '@epilogue' marker line.
# NOTE(review): prologue_start and epilogue_end are not used in any visible cell.
prologue_start = 0
prologue_end = lines.index('@law 1')  # list.index raises ValueError if the marker line is absent
epilogue_start = lines.index('@epilogue')
epilogue_end = -1
raw_laws = lines[prologue_end:epilogue_start]

In [3]:
# Collapse the per-line English translations ('#tr.en' lines) into one string per law.
# A line starting with '@law' closes the law collected so far.
all_laws = []
current_law = ''


def _finish_law(text):
    """Normalise one accumulated law.

    Trims trailing whitespace, lowercases, drops the final character and appends
    the ' <STOP>' end-of-law marker.
    """
    # NOTE(review): [:-1] removes the last character unconditionally; this assumes
    # every law ends with a single punctuation mark -- confirm against the data.
    return text.rstrip().lower()[:-1] + ' <STOP>'


for line in raw_laws:
    if line.startswith('@law'):
        if current_law:
            all_laws.append(_finish_law(current_law))
            current_law = ''
    elif line.startswith('#tr.en'):
        # Split on the first colon only, so a translation that itself contains
        # a colon is kept whole instead of being cut at the second colon.
        current_law += line.split(':', 1)[1].lstrip() + ' '

# Flush the last law with the same normalisation as the others (it was previously
# appended raw: not lowercased, not trimmed, no stop marker), and skip it when
# nothing is pending so no empty string ends up in the corpus.
if current_law:
    all_laws.append(_finish_law(current_law))

In [4]:
# Preview the first two parsed laws.
all_laws[:2]

['if a man a man accused, and murder against him threw, and has not proven it, his accuser shall be killed <STOP>',
 'if a man sorcery against a man threw, and has not proven it, against whom sorcery was thrown to id shall go, into id he shall jump; if id has overcome him, his accuser his household shall carry away; if that man id has cleansed him, and he has emerged whole, who against him sorcery threw shall be killed; who into id jumped the household of his accuser shall carry away <STOP>']

In [5]:
# Module-level tokenizer shared across cells: it is fitted inside
# get_sequence_of_tokens and read again by generate_text.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer on ``corpus`` and expand each text into n-gram prefixes.

    Every text is encoded to a list of word indices; each prefix of that list
    with at least two tokens becomes one training sequence.

    Parameters
    ----------
    corpus : iterable of str
        Texts to tokenize.

    Returns
    -------
    tuple
        ``(input_sequences, total_words)`` where ``input_sequences`` is a list of
        integer lists and ``total_words`` is ``len(tokenizer.word_index) + 1``.
    """
    # Side effect: updates the module-level tokenizer's vocabulary.
    tokenizer.fit_on_texts(corpus)
    vocabulary_size = len(tokenizer.word_index) + 1

    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        # Prefixes of length 2 .. len(encoded); a one-token text contributes nothing.
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocabulary_size

# Fit the tokenizer on the parsed laws and preview the first ten n-gram prefixes.
inp_sequences, total_words = get_sequence_of_tokens(all_laws)
inp_sequences[:10]

[[7, 2],
 [7, 2, 13],
 [7, 2, 13, 2],
 [7, 2, 13, 2, 13],
 [7, 2, 13, 2, 13, 350],
 [7, 2, 13, 2, 13, 350, 5],
 [7, 2, 13, 2, 13, 350, 5, 621],
 [7, 2, 13, 2, 13, 350, 5, 621, 55],
 [7, 2, 13, 2, 13, 350, 5, 621, 55, 20],
 [7, 2, 13, 2, 13, 350, 5, 621, 55, 20, 293]]

In [6]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences to equal length and split them into inputs and labels.

    Sequences are left-padded ('pre') to the length of the longest one. The last
    column becomes the label (one-hot encoded); all preceding columns are the
    predictors.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-index sequences to pad.
    num_classes : int, optional
        Width of the one-hot label encoding. When omitted, the notebook-level
        ``total_words`` is used, which is what this function always did before
        the parameter existed.

    Returns
    -------
    tuple
        ``(predictors, label, max_sequence_len)``.

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the global set by the tokenization cell.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Last column is the word to predict; everything before it is the context.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Build the training arrays; max_sequence_len is reused below when the model is
# created and when text is generated.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [7]:
def create_model(max_sequence_len, total_words):
    """Build and compile the next-word prediction network.

    The stack is: Embedding (10 dimensions) -> LSTM (100 units) -> Dropout (0.1)
    -> Dense softmax over ``total_words`` classes.

    Parameters
    ----------
    max_sequence_len : int
        Length of the padded sequences including the label column; the model
        input length is ``max_sequence_len - 1``.
    total_words : int
        Size of the embedding vocabulary and of the output layer.

    Returns
    -------
    The compiled Sequential model (categorical cross-entropy loss, adam optimizer).
    """
    network = Sequential([
        # Input embedding layer
        Embedding(total_words, 10, input_length=max_sequence_len - 1),
        # Hidden layer: LSTM followed by dropout
        LSTM(100),
        Dropout(0.1),
        # Output layer
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Instantiate the network and print its layer/parameter summary.
model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 191, 10)           10500     
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1050)              106050    
Total params: 160,950
Trainable params: 160,950
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Train for 20 epochs on the padded prefixes and their one-hot next-word labels.
# NOTE(review): verbose=5 is outside the values Keras documents for fit (0, 1, 2);
# the recorded output shows only the epoch headers with no loss -- confirm whether
# verbose=2 was intended.
model.fit(predictors, label, epochs=20, verbose=5)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fdc73c24820>

In [22]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` one predicted word at a time.

    On each step the current text is tokenized with the module-level
    ``tokenizer``, left-padded to ``max_sequence_len - 1`` tokens, and the
    highest-probability class from ``model.predict`` is appended as the next
    word. Generation ends early when the predicted word is ``'stop'``, in which
    case a full stop is appended instead of the word.

    Parameters
    ----------
    seed_text : str
        Starting text.
    next_words : int
        Maximum number of words to append.
    model
        Trained model exposing ``reset_states()`` and ``predict(x, verbose=0)``
        returning per-class scores of shape (1, n_classes).
    max_sequence_len : int
        Sequence length used at training time (including the label column).

    Returns
    -------
    str
        The seed text followed by the generated words.
    """
    model.reset_states()

    # Invert the vocabulary once rather than scanning word_index linearly for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Sequential.predict_classes was removed from newer TensorFlow releases;
        # argmax over the softmax output gives the same class index and works
        # on both old and new versions.
        scores = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(scores, axis=-1)[0])

        # An index with no vocabulary entry maps to an empty word, as in the
        # original scan.
        output_word = index_to_word.get(predicted, "")

        # The ' <STOP>' marker added during preprocessing appears in the
        # vocabulary as 'stop' (Keras Tokenizer lowercases and filters '<' and
        # '>' by default), so it doubles as the end-of-law signal here.
        if output_word == 'stop':
            seed_text += '.'
            break
        seed_text += " "+output_word
    return seed_text

In [25]:
# Generate up to 55 words continuing the seed 'If' with the trained model.
generate_text('If', 55, model, max_sequence_len)

'If a man a young child in the house of a man who has been siezed the footsoldier has not be mina of silver he shall give.'