In [1]:
import pickle
import math
import pandas as pd
import numpy as np
from numpy import array
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk

from sklearn.feature_extraction.text import CountVectorizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'tensorflow.keras'

In [46]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename,'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [47]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [48]:
def seperate_verses(corpus):  
    # Separating verses
    split_corpus = corpus.split('\n\n')
    return split_corpus

In [49]:
# turn a doc into clean tokens
def clean_doc(doc):
    # split into tokens by white space
    tokens = doc.split()
    
    # prepare regex for char filtering
    re_punc = re.compile('[%s]'% re.escape(string.punctuation))
    
    # remove punctuation from each word
    tokens = [re_punc.sub('', w) for w in tokens]
    
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens 
              if word.isalpha()]
    
    # make lower case
    tokens = [word.lower() for word in tokens]
    
    return tokens

In [50]:
def clean_corpus(corpus):
    corpus = seperate_verses(corpus)
    cleaned_corpus = []
    for i in range(len(corpus)):
        cleaned_doc = clean_doc(corpus[i])
        cleaned_corpus.append(cleaned_doc)
        
    return cleaned_corpus

In [51]:
def corpus_docs_list_to_str(corpus):
    new_corpus = []
    for i in range(len(corpus)):
        new_doc = ' '.join(corpus[i])
        new_corpus.append(new_doc)
        
    return new_corpus

In [52]:
def corpus_to_tokens_to_sequences(corpus,sequence_length=50):
    tokens = ' '.join(corpus)
    tokens = tokens.split()
    # organize into sequences of tokens
    length = sequence_length + 1
    sequences = list()
    for i in range(length, len(tokens)):
        # select sequence of tokens
        seq = tokens[i-length:i]
        # convert into a line
        line = ' '.join(seq)
        # store
        sequences.append(line)
    print('Total Sequences: %d' % len(sequences))
    return sequences

In [132]:
bible = load_doc('kj_bible.txt')

In [133]:
# Seperating bible into old_testament
old_testament = bible[:3334014]
old_testament = old_testament[old_testament.find('1'):]

In [134]:
# Seperating bible into new_testament
new_testament = bible[3334014:]
new_testament = new_testament[new_testament.find('1'):]

In [135]:
# Clean entire old_testament
clean_old_testament = clean_corpus(old_testament)

# Clean entire new_testament
clean_new_testament = clean_corpus(new_testament)

In [136]:
# Converting verses from lists to strings
clean_old_testament = corpus_docs_list_to_str(clean_old_testament)
clean_new_testament = corpus_docs_list_to_str(clean_new_testament)

In [139]:
# saving new_testament sequences to txt file
sequences = corpus_to_tokens_to_sequences(clean_new_testament)
out_filename = 'new_test_sequences.txt'
save_doc(sequences, out_filename)

Total Sequences: 180528


In [91]:
# # saving old_testament sequences to txt file
# sequences = corpus_to_tokens_to_sequences(clean_old_testament)
# out_filename = 'old_test_sequences.txt'
# save_doc(sequences, out_filename)

Total Sequences: 609366


In [140]:
sequences

['the book of the generation of jesus christ the son of david the son of abraham abraham begat isaac and isaac begat jacob and jacob begat judas and his brethren and judas begat phares and zara of thamar and phares begat esrom and esrom begat aram and aram begat aminadab and',
 'book of the generation of jesus christ the son of david the son of abraham abraham begat isaac and isaac begat jacob and jacob begat judas and his brethren and judas begat phares and zara of thamar and phares begat esrom and esrom begat aram and aram begat aminadab and aminadab',
 'of the generation of jesus christ the son of david the son of abraham abraham begat isaac and isaac begat jacob and jacob begat judas and his brethren and judas begat phares and zara of thamar and phares begat esrom and esrom begat aram and aram begat aminadab and aminadab begat',
 'the generation of jesus christ the son of david the son of abraham abraham begat isaac and isaac begat jacob and jacob begat judas and his brethren and j

In [141]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

# vocabulary size
vocab_size = len(tokenizer.word_index) + 1
 
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]
 
# define model
model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_length),
    LSTM(100, return_sequences=True),
    LSTM(100),
    Dense(100, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

# model.add(Embedding(vocab_size, 50, input_length=seq_length))
# model.add(LSTM(100, return_sequences=True))
# model.add(LSTM(100))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())
# compile model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit model
model.fit(X, y, batch_size=128, epochs=100)
 
# save the model to file
model.save('model.h5')
# save the tokenizer
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            300350    
_________________________________________________________________
unified_lstm_6 (UnifiedLSTM) (None, 50, 100)           60400     
_________________________________________________________________
unified_lstm_7 (UnifiedLSTM) (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_7 (Dense)              (None, 6007)              606707    
Total params: 1,057,957
Trainable params: 1,057,957
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/1

KeyboardInterrupt: 

In [142]:
# save the model to file
model.save('model.h5')
# save the tokenizer
pickle.dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [143]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [164]:
# select a seed text
seed_text = clean_new_testament[randint(0,len(clean_new_testament))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 20)
print(generated)

and so it came to pass that they escaped all safe to land

and he went out and entered into the temple and led him away to the region round about and the


In [197]:
# select a seed text
seed_text = clean_new_testament[randint(0,len(clean_new_testament))]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

now herod the tetrarch heard of all that was done by him and he was perplexed because that it was said of some that john was risen from the dead and of some that elias had appeared and of others that one of the old prophets was risen again

and the third angel of the jews informed him in the temple and led him away and he went out and led him away to joppa and he went into the temple and led him away and the other the commandments and the scribes and the scribes and the scribes


In [171]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'and thou shalt have joy and sadness', 25)
print(generated)

be content in the provocation for the winefat and running to the father and breaking them in the wilderness course and sacrifices and the twelve


In [172]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'that which was from the beginning which we have heard which we have seen with our eyes', 25)
print(generated)

inasmuch as the sufferings of the lord jesus christ who is the firstfruits of the gentiles and of the earth for the flesh is not


In [175]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'god is our refuge and strength an ever-present help in trouble', 25)
print(generated)

for the world waiteth for the saints and the deceitfulness of riches and the length is in the lord and the lord jesus and the


In [176]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'the name of the lord is a strong tower the righteous run into and are safe', 25)
print(generated)

of heart and the rain descended and the length is gone out of the stern and wished lasciviousness and the angel of the lord jesus


In [177]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'do not grieve for the joy of the lord is your strength', 25)
print(generated)

and revelation and anger with the spirit of god and of the gentiles and of the earth for the flesh is not slack but the


In [182]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'the lord giveth and the lord taketh away', 25)
print(generated)

the people and the holy ghost and the seven stars of god and of the people and of the jews and the eunuch and the


In [181]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'to answer before listening that is folly and shame', 25)
print(generated)

them not for the word of god for the remission of sins and the lord said unto him that is in heaven and the father


In [183]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'fight for the good of faith take hold of eternal life to which you were called when you made your good confession in the presence of many witnesses', 25)
print(generated)

i have written unto you for the spirit of god and the father and the lord jesus christ and the lord jesus christ and the


In [184]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'I consider that our present sufferings are not worth comparing with the glory that will be revealed in us', 25)
print(generated)

and peace from god for the son of man shall be saved and the cock crew and the mother of the jews and the scribes


In [186]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'and god said let there be light and there was light', 25)
print(generated)

in the wilderness and of the jews and the eunuch said unto him friend why hast thou forsaken the way of jesus and the chief


In [187]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'and god saw everything that he had made and behold', 25)
print(generated)

a man named demetrius who informed you a little while and the rough mother was nigh to the battle and they said unto them why


In [188]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'so god created man in his own image in the image of god created he him male and female created he them', 25)
print(generated)

that are asleep and the lord of the lord jesus christ and the lord jesus christ and the lord jesus christ and the son of


In [189]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'hear us my lord thou art a mighty prince among us', 25) 
print(generated)

for the flesh is not in the law and the grace of god for the flesh is not in the law and the grace of


In [193]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'and the whole earth was of one language and of one speech', 100)
print(generated)

and alexander and the angel of the lord was not with the grecians against the right hand of god and overthrew the tables of the moneychangers and the seats of them that sat in the midst of the sea and the third part of the twelve apostles and when they had passed up and fell down and worshipped him saying i am not worthy to unloose i will have given me the same man that is in heaven and earth shall be given to the lord and he that hath not been not yet is the children of god and


In [194]:
with open('new_testament_first_model.pickle','wb') as f:
    pickle.dump(model,f)

TypeError: can't pickle _thread.RLock objects

In [1]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, 'and the whole earth was of one language and of one speech', 100)
print(generated)

NameError: name 'generate_seq' is not defined