In [2]:
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import Adam
import pickle
import numpy as np
import os

In [5]:
file = open("prideandprejeduce.txt", "r", encoding = "utf8")

# store file in list
lines = []
for i in file:
    lines.append(i)

# Convert list to string
data = ""
for i in lines:
  data = ' '. join(lines) 

#replace unnecessary stuff with space
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')  #new line, carriage return, unicode character --> replace by space

#remove unnecessary spaces 
data = data.split()
data = ' '.join(data)
data[:500]

'The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using th'

In [6]:
len(data)


698418

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 176, 158, 916, 3, 321, 4, 1171, 30, 72, 2534, 41, 916, 23, 21]

In [8]:
len(sequence_data)


125309

In [9]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

7030


In [10]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  125306


array([[   1,  176,  158,  916],
       [ 176,  158,  916,    3],
       [ 158,  916,    3,  321],
       [ 916,    3,  321,    4],
       [   3,  321,    4, 1171],
       [ 321,    4, 1171,   30],
       [   4, 1171,   30,   72],
       [1171,   30,   72, 2534],
       [  30,   72, 2534,   41],
       [  72, 2534,   41,  916]])

In [11]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [12]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[   1  176  158]
 [ 176  158  916]
 [ 158  916    3]
 [ 916    3  321]
 [   3  321    4]
 [ 321    4 1171]
 [   4 1171   30]
 [1171   30   72]
 [  30   72 2534]
 [  72 2534   41]]
Response:  [ 916    3  321    4 1171   30   72 2534   41  916]


In [13]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             70300     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7030)              7037030   
                                                                 
Total params: 20,156,330
Trainable params: 20,156,330
Non-trainable params: 0
_________________________________________________________________


In [22]:
from tensorflow import keras
from keras.utils.vis_utils import plot_model

keras.utils.plot_model(model, to_file='plot.png', show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [28]:
from keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="sparse_categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=72, batch_size=64, callbacks=[checkpoint])

Epoch 1/72
Epoch 1: loss improved from inf to 6.27204, saving model to next_words.h5
Epoch 2/72
Epoch 2: loss improved from 6.27204 to 5.65087, saving model to next_words.h5
Epoch 3/72
Epoch 3: loss improved from 5.65087 to 5.29442, saving model to next_words.h5
Epoch 4/72
Epoch 4: loss improved from 5.29442 to 5.05252, saving model to next_words.h5
Epoch 5/72
Epoch 5: loss improved from 5.05252 to 4.84501, saving model to next_words.h5
Epoch 6/72
Epoch 6: loss improved from 4.84501 to 4.64247, saving model to next_words.h5
Epoch 7/72
Epoch 7: loss improved from 4.64247 to 4.43776, saving model to next_words.h5
Epoch 8/72
Epoch 8: loss improved from 4.43776 to 4.23043, saving model to next_words.h5
Epoch 9/72
Epoch 9: loss improved from 4.23043 to 4.01466, saving model to next_words.h5
Epoch 10/72
Epoch 10: loss improved from 4.01466 to 3.79389, saving model to next_words.h5
Epoch 11/72
Epoch 11: loss improved from 3.79389 to 3.56601, saving model to next_words.h5
Epoch 12/72
Epoch 12:

<keras.callbacks.History at 0x1d81f32c5c8>

In [29]:
from keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  print(predicted_word)
  return predicted_word

In [30]:
while(True):
  text = input("Enter your line: ")
  
  if text == "0":
      print("Execution completed.....")
      break
  
  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)
        
          Predict_Next_Words(model, tokenizer, text)
          
      except Exception as e:
        print("Error occurred: ",e)
        continue

['gutenberg', 'ebook']
Error occurred:  in user code:

    File "c:\Users\sena\anaconda3\envs\mywordpredenv\lib\site-packages\keras\engine\training.py", line 2041, in predict_function  *
        return step_function(self, iterator)
    File "c:\Users\sena\anaconda3\envs\mywordpredenv\lib\site-packages\keras\engine\training.py", line 2027, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\sena\anaconda3\envs\mywordpredenv\lib\site-packages\keras\engine\training.py", line 2015, in run_step  **
        outputs = model.predict_step(data)
    File "c:\Users\sena\anaconda3\envs\mywordpredenv\lib\site-packages\keras\engine\training.py", line 1983, in predict_step
        return self(x, training=False)
    File "c:\Users\sena\anaconda3\envs\mywordpredenv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\sena\anaconda3\envs\mywordprede