## Steps:

1. process text
2. clean text
3. tokenize text and create sequences with Keras
4. create LSTM based model
5. split text to features and labels: X features => first $n$ words of a sequence, Y label => first word after a sequence
6. fit the model
7. save the model and tokenizer
8. load model and tokenizer
9. generate new text based off a seed input

In [None]:
def read_file(filepath, encoding='utf-8'):
    """Return the entire contents of a text file as a single string.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's
            locale-specific default encoding.

    Returns:
        The full decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in ``encoding``.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [None]:
# read_file('moby_dick_four_chapters.txt')

In [None]:
# !conda install spacy -y
# !python -m spacy download en
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md
# !python -m spacy download en_core_web_lg
# !conda install nltk -y
# 

In [None]:
import spacy

# spaCy is only needed here for tokenization and cleaning, so the
# parser, tagger and NER pipeline components are disabled at load time.

nlp = spacy.load('en_core_web_md', disable=['parser','tagger','ner'])

In [None]:
# Override the pipeline's maximum accepted text length.
# NOTE(review): 1198623 is presumably the character count of the full novel
# (see the commented-out 'melville-moby_dick.txt' path) - confirm.
nlp.max_length = 1198623 

In [None]:
def separate_punc(doc_text):
    """Tokenize ``doc_text`` with the module-level ``nlp`` pipeline and
    return the lower-cased token texts, minus punctuation/whitespace tokens.

    A token is discarded when its text occurs as a substring of the filter
    string below (this is a substring test, not a per-character test).
    """
    unwanted = '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n '

    kept = []
    for token in nlp(doc_text):
        word = token.text
        if word in unwanted:
            continue
        kept.append(word.lower())
    return kept

In [None]:
# d = read_file('melville-moby_dick.txt')
d = read_file('moby_dick_four_chapters.txt')


In [None]:
tokens = separate_punc(d)

In [None]:
len(tokens)

In [None]:
# input: 25 words --> target: next word
train_len = 25+1

# Slide a window of train_len tokens over the corpus, one token at a time.
# Each window holds the input words followed by the target word.
# The range end is len(tokens) - train_len + 1 so the final window, which
# ends on the last token, is included; if there are fewer than train_len
# tokens the result is simply an empty list.
text_sequences = [
    tokens[start:start + train_len]
    for start in range(len(tokens) - train_len + 1)
]

In [None]:

type(text_sequences), len(text_sequences)

In [None]:
print('seq 0:', ' '.join(text_sequences[0]))
print('seq 1:', ' '.join(text_sequences[1]))

In [None]:
from keras.preprocessing.text import Tokenizer

In [None]:
# Build the word -> integer id vocabulary from the token windows.
# Each element of text_sequences is already a list of tokens.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)

In [None]:
# this maps words in strings/sequences to numbers
# these numbers represent IDs for a particular word
sequences = tokenizer.texts_to_sequences(text_sequences)

In [None]:
for idx in sequences[0]:
    print(f"{idx:5} : {tokenizer.index_word[idx]}")

In [None]:
print("idx, count: word")
print("----------------")
# Preview the first 11 (word, count) entries of the tokenizer's word counts;
# enumerate supplies the running row number directly.
for j, (k, v) in enumerate(tokenizer.word_counts.items()):
    if j > 10:
        break
    print(f"{j:3}, {v:5}: {k:15}")
    

In [None]:
# Number of distinct words seen by the tokenizer.
# Later cells pass vocabulary_size+1 as the class/embedding count.
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

In [None]:

type(sequences)

In [None]:
import numpy as np

In [None]:
# Convert the list of id sequences to a NumPy array so it can be sliced
# by column below (one row per window; assumes all rows have equal length).
sequences = np.array(sequences)

In [None]:
sequences

In [None]:
sequences.shape

In [None]:
from tensorflow.keras.utils import to_categorical

In [None]:
# features: every column except the last, i.e. all but the final word id of each row
X = sequences[:, :-1]

# labels: just the last column, i.e. the final word id of each row
y = sequences[:, -1]

In [None]:
# One-hot encode the labels. num_classes is vocabulary_size+1, presumably
# because the tokenizer's word ids start at 1 (0 is reserved), so the largest
# id equals vocabulary_size - confirm against the Keras Tokenizer docs.
y = to_categorical(y,num_classes=vocabulary_size+1)

In [None]:
# Number of input word ids per sample (columns of X, i.e. train_len - 1).
seq_len = X.shape[1]

In [None]:
y.shape

In [None]:
X.shape

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Dropout

In [None]:
def create_model(vocabulary_size, seq_len, multiplier=4):
    """Build, compile and summarize the stacked-LSTM next-word model.

    Args:
        vocabulary_size: Used as the embedding ``input_dim`` and as the
            number of units in the final softmax layer.
        seq_len: Number of word ids per input sample; also used as the
            embedding output dimension.
        multiplier: The LSTM layers and the hidden Dense layer each get
            ``seq_len * multiplier`` units.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a
        side effect).
    """
    hidden_units = seq_len * multiplier

    # Layers in the order they are stacked.
    layers = [
        Embedding(input_dim=vocabulary_size, output_dim=seq_len, input_length=seq_len),
        LSTM(hidden_units, return_sequences=True),
        Dropout(0.1),
        LSTM(hidden_units, return_sequences=True),
        Dropout(0.1),
        LSTM(hidden_units),
        Dropout(0.1),
        Dense(hidden_units, activation='relu'),
        Dense(vocabulary_size, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()

    return model
    

In [None]:
# vocabulary_size+1 matches the num_classes used when one-hot encoding y;
# multiplier=6 gives seq_len*6 units in each LSTM and the hidden Dense layer.
model = create_model(vocabulary_size+1, seq_len,6)

In [None]:
from pickle import dump, load

In [None]:
model.fit(X,y, batch_size = 256, epochs=10, verbose=1)

In [None]:
model.save('test_my_mobydick_model.h5')

In [None]:
# Pickle the fitted tokenizer next to the saved model. The with-block makes
# sure the file is flushed and closed once the dump finishes.
with open('test_my_simple_tokenizer','wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
def generate_text(model, tokenizer, seq_len, seed_text, num_gen_words):
    """Generate ``num_gen_words`` words that continue ``seed_text``.

    Each step encodes the running text with ``tokenizer``, pads/truncates it
    on the left to ``seq_len`` ids, asks ``model`` for the next-word
    probabilities and appends the most likely word to the running text.

    Args:
        model: Trained model whose ``predict`` returns one probability row
            per input sample.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``index_word``.
        seq_len: Number of word ids the model expects per sample.
        seed_text: Space-separated text used to start generation.
        num_gen_words: How many words to generate.

    Returns:
        The generated words joined by single spaces (seed text excluded).
    """
    output_text = []

    input_text = seed_text

    for _ in range(num_gen_words):

        encoded_text = tokenizer.texts_to_sequences([input_text])[0]

        # Keep only the most recent seq_len ids (left-pad if shorter).
        pad_encoded = pad_sequences([encoded_text], maxlen=seq_len, truncating='pre')

        # verbose=0 avoids printing a progress line for every generated word.
        predict_probs = model.predict(pad_encoded, verbose=0)[0]

        # Id 0 is the padding value and has no entry in tokenizer.index_word,
        # so it is excluded from the argmax; otherwise a prediction of 0
        # would raise KeyError on the lookup below.
        pred_word_index = int(np.argmax(predict_probs[1:])) + 1

        pred_word = tokenizer.index_word[pred_word_index]

        input_text += ' ' + pred_word

        output_text.append(pred_word)

    return ' '.join(output_text)

In [None]:
import random
random.seed(23)
# randrange excludes its stop value, so the result is always a valid index
# into text_sequences (randint's upper bound is inclusive and could return
# len(text_sequences), which is out of range).
random_pick = random.randrange(len(text_sequences))
random_pick

In [None]:
random_seed_text=text_sequences[random_pick]
random_seed_text

In [None]:
seed_text = ' '.join(random_seed_text)

In [None]:
seed_text

In [None]:
# model after the initial training run (the earlier model.fit call uses epochs=10)
generate_text(model, tokenizer, seq_len, seed_text=seed_text,num_gen_words = 25)

In [None]:
from keras.models import load_model

In [None]:
seed_text

In [None]:
seq_len

In [None]:
import time

# Time a longer training run on the same `model` object, then persist the
# model and its tokenizer.
tic = time.perf_counter()
model.fit(X,y, batch_size = 128, epochs=300, verbose=1)
toc = time.perf_counter()
model.save('test_epochBIG.h5')
# The with-block closes the pickle file once the tokenizer has been written.
with open('test_epochBIG','wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
print(f"Trained the model in {(toc - tic)/60:0.4f} minutes")

In [None]:
# Reload the saved model and tokenizer from disk.
model = load_model('test_epochBIG.h5')
# NOTE: pickle can execute arbitrary code on load; only unpickle files
# produced by this notebook. The with-block closes the file after reading.
with open('test_epochBIG','rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

In [None]:
# model reloaded from 'test_epochBIG.h5', saved after the additional epochs=300 fit
generate_text(model, tokenizer, seq_len, seed_text=seed_text,num_gen_words = 25)