In [1]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
    return str_text


In [2]:
# read_file('moby_dick_four_chapters.txt')

In [3]:
import spacy

In [5]:
import tensorflow as tf

In [6]:
import en_core_web_sm
nlp = en_core_web_sm.load()


In [7]:
nlp.max_length = 1198623


In [8]:
def separate_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']


In [9]:
d = read_file('moby_dick_four_chapters.txt')

In [10]:
tokens = separate_punc(d)

In [12]:
tokens

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore',
 'i',
 'thought',
 'i',
 'would',
 'sail',
 'about',
 'a',
 'little',
 'and',
 'see',
 'the',
 'watery',
 'part',
 'of',
 'the',
 'world',
 'it',
 'is',
 'a',
 'way',
 'i',
 'have',
 'of',
 'driving',
 'off',
 'the',
 'spleen',
 'and',
 'regulating',
 'the',
 'circulation',
 'whenever',
 'i',
 'find',
 'myself',
 'growing',
 'grim',
 'about',
 'the',
 'mouth',
 'whenever',
 'it',
 'is',
 'a',
 'damp',
 'drizzly',
 'november',
 'in',
 'my',
 'soul',
 'whenever',
 'i',
 'find',
 'myself',
 'involuntarily',
 'pausing',
 'before',
 'coffin',
 'warehouses',
 'and',
 'bringing',
 'up',
 'the',
 'rear',
 'of',
 'every',
 'funeral',
 'i',
 'meet',
 'and',
 'especially',
 'whenever',
 'my',
 'hypos',
 'get',
 'such',
 'an',
 'upper',
 'hand',
 '

In [11]:
len(tokens)

11338

In [13]:
# 25 words --> network predict #26

In [14]:
train_len = 25+1
text_sequences = []
for i in range(train_len,len(tokens)):
    seq = tokens[i-train_len:i]
    text_sequences.append(seq)


In [15]:
type(text_sequences)

list

In [16]:
text_sequences[1]

['me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on',
 'shore']

In [17]:
' '.join(text_sequences[1])

'me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on shore'

In [18]:
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_sequences)


In [20]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [21]:
# sequences[1]

In [22]:
for i in sequences[0]:
    print(f"{i}:{tokenizer.index_word[i]}")

# tokenizer.index_word


956:call
14:me
263:ishmael
51:some
261:years
408:ago
87:never
219:mind
129:how
111:long
954:precisely
260:having
50:little
43:or
38:no
315:money
7:in
23:my
546:purse
3:and
150:nothing
259:particular
6:to
2712:interest
14:me
24:on


In [23]:
# tokenizer.word_counts


In [24]:
vocabulary_size = len(tokenizer.word_counts)


In [25]:
vocabulary_size

2717

In [26]:
import numpy as np
sequences = np.array(sequences)


In [27]:
sequences


array([[ 956,   14,  263, ..., 2712,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2711, ...,   53,    2, 2717],
       [ 166, 2711,    3, ...,    2, 2717,   26]])

In [28]:
from keras.utils import to_categorical


In [29]:
X = sequences[:,:-1]

In [30]:
y = sequences[:,-1]

In [31]:
y = to_categorical(y,num_classes=vocabulary_size+1)


In [32]:
seq_len = X.shape[1]


In [33]:
X.shape


(11312, 25)

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,Embedding


In [35]:
def create_model(vocabulary_size,seq_len):
    model = Sequential()
    model.add(Embedding(vocabulary_size,seq_len,input_length=seq_len))
    model.add(LSTM(50, return_sequences=True))
    model.add(LSTM(50))
    model.add(Dense(50,activation = 'relu'))
    
    model.add(Dense(vocabulary_size, activation = 'softmax'))
    
    model.compile(loss='categorical_crossentropy',optimizer= 'adam',metrics=['accuracy'])
    
    model.summary()
    
    return model


In [36]:
model = create_model(vocabulary_size+1,seq_len)


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 25)            67950     
_________________________________________________________________
lstm (LSTM)                  (None, 25, 50)            15200     
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense (Dense)                (None, 50)                2550      
_________________________________________________________________
dense_1 (Dense)              (None, 2718)              138618    
Total params: 244,518
Trainable params: 244,518
Non-trainable params: 0
_________________________________________________________________


In [37]:
from pickle import dump,load

In [38]:
model.fit(X,y,batch_size=128,epochs=2,verbose=1)

Train on 11312 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1c1d878a6a0>

In [39]:
model.save('my_mobydick_model.h5')

In [40]:
dump(tokenizer,open('my_simpletokenizer','wb'))

In [41]:
from keras.preprocessing.sequence import pad_sequences

In [42]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    output_text = []
    input_text = seed_text
    for i in range(num_gen_words):
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        pad_encoded = pad_sequences([encoded_text], maxlen = seq_len,truncating= 'pre')
        pred_word_ind = model.predict_classes(pad_encoded,verbose=0)[0]
        pred_word = tokenizer.index_word[pred_word_ind]
        input_text += ' ' + pred_word
        output_text.append(pred_word)
    return ' '.join(output_text)


In [43]:
text_sequences[0]

['call',
 'me',
 'ishmael',
 'some',
 'years',
 'ago',
 'never',
 'mind',
 'how',
 'long',
 'precisely',
 'having',
 'little',
 'or',
 'no',
 'money',
 'in',
 'my',
 'purse',
 'and',
 'nothing',
 'particular',
 'to',
 'interest',
 'me',
 'on']

In [44]:
import random
random.seed(101)
random_pick = random.randint(0,len(text_sequences))


In [45]:
random_seed_text = text_sequences[random_pick]

In [46]:
random_seed_text

['thought',
 'i',
 'to',
 'myself',
 'the',
 'man',
 "'s",
 'a',
 'human',
 'being',
 'just',
 'as',
 'i',
 'am',
 'he',
 'has',
 'just',
 'as',
 'much',
 'reason',
 'to',
 'fear',
 'me',
 'as',
 'i',
 'have']

In [47]:
seed_text = ' '.join(random_seed_text)

In [48]:
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [49]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)

'the the the the the the the the the the the the the the the the the the the the the the the the the'

In [54]:
from tensorflow.keras.models import load_model

In [56]:
model = load_model('epochBIG.h5')



In [59]:
tokenizer = load(open('epochBIG','rb'))

In [60]:
generate_text(model,tokenizer,seq_len,seed_text=seed_text,num_gen_words=25)


"to be seen there was no bad olfactories my own letter was cheerily listening over his hearers who 's more can go have a wearing"