In [19]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os


In [20]:
file = open("alice_in_wonderland.txt", "r", encoding = "utf8")

# store file in list
lines = []
for i in file:
    lines.append(i)

# Convert list to string
data = ""
for i in lines:
  data = ' '. join(lines) 

#replace unnecessary stuff with space
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '').replace('“','').replace('”','')  #new line, carriage return, unicode character --> replace by space

#remove unnecessary spaces 
data = data.split()
data = ' '.join(data)
data[:500]

'The Project Gutenberg EBook of Alice in Wonderland, by Lewis Carroll This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Alice in Wonderland Author: Lewis Carroll Illustrator: Gordon Robinson Release Date: August 12, 2006 [EBook #19033] Language: English *** START OF THIS PROJECT GUTENBERG EBOOK '

In [21]:
len(data)

71670

In [22]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function
pickle.dump(tokenizer, open('token.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 22, 21, 180, 5, 10, 7, 277, 37, 554, 555, 19, 180, 26, 25]

In [23]:
len(sequence_data)

13074

In [24]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2080


In [25]:
sequences = []

for i in range(3, len(sequence_data)):
    words = sequence_data[i-3:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  13071


array([[  1,  22,  21, 180],
       [ 22,  21, 180,   5],
       [ 21, 180,   5,  10],
       [180,   5,  10,   7],
       [  5,  10,   7, 277],
       [ 10,   7, 277,  37],
       [  7, 277,  37, 554],
       [277,  37, 554, 555],
       [ 37, 554, 555,  19],
       [554, 555,  19, 180]])

In [26]:
X = []
y = []

for i in sequences:
    X.append(i[0:3])
    y.append(i[3])
    
X = np.array(X)
y = np.array(y)

In [27]:
print("Data: ", X[:10])
print("Response: ", y[:10])

Data:  [[  1  22  21]
 [ 22  21 180]
 [ 21 180   5]
 [180   5  10]
 [  5  10   7]
 [ 10   7 277]
 [  7 277  37]
 [277  37 554]
 [ 37 554 555]
 [554 555  19]]
Response:  [180   5  10   7 277  37 554 555  19 180]


In [28]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [29]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [30]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 3, 10)             20800     
                                                                 
 lstm_2 (LSTM)               (None, 3, 1000)           4044000   
                                                                 
 lstm_3 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense_2 (Dense)             (None, 1000)              1001000   
                                                                 
 dense_3 (Dense)             (None, 2080)              2082080   
                                                                 
Total params: 15,151,880
Trainable params: 15,151,880
Non-trainable params: 0
_________________________________________________________________


In [31]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint("next_words.h5", monitor='loss', verbose=1, save_best_only=True)
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])

Epoch 1/70
Epoch 1: loss improved from inf to 6.45878, saving model to next_words.h5
Epoch 2/70
Epoch 2: loss improved from 6.45878 to 6.11604, saving model to next_words.h5
Epoch 3/70
Epoch 3: loss improved from 6.11604 to 5.81521, saving model to next_words.h5
Epoch 4/70
Epoch 4: loss improved from 5.81521 to 5.49749, saving model to next_words.h5
Epoch 5/70
Epoch 5: loss improved from 5.49749 to 5.24859, saving model to next_words.h5
Epoch 6/70
Epoch 6: loss improved from 5.24859 to 5.02443, saving model to next_words.h5
Epoch 7/70
Epoch 7: loss improved from 5.02443 to 4.82184, saving model to next_words.h5
Epoch 8/70
Epoch 8: loss improved from 4.82184 to 4.61145, saving model to next_words.h5
Epoch 9/70
Epoch 9: loss improved from 4.61145 to 4.41268, saving model to next_words.h5
Epoch 10/70
Epoch 10: loss improved from 4.41268 to 4.20562, saving model to next_words.h5
Epoch 11/70
Epoch 11: loss improved from 4.20562 to 3.98642, saving model to next_words.h5
Epoch 12/70
Epoch 12:

<keras.callbacks.History at 0x2b8d8518b20>

In [32]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer
model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)
  preds = np.argmax(model.predict(sequence))
  predicted_word = ""
  
  for key, value in tokenizer.word_index.items():
      if value == preds:
          predicted_word = key
          break
  
  print(predicted_word)
  return predicted_word

In [33]:
while(True):
  text = input("Enter your line: ")
  
  if text == "0":
      print("Execution completed.....")
      break
  
  else:
      try:
          text = text.split(" ")
          text = text[-3:]
          print(text)
        
          Predict_Next_Words(model, tokenizer, text)
          
      except Exception as e:
        print("Error occurred: ",e)
        continue

Enter your line:  Alice was beginning
['Alice', 'was', 'beginning']
to
Enter your line: no pictures or 
['pictures', 'or', '']
Error occurred:  in user code:

    File "C:\Users\admin\anaconda3\lib\site-packages\keras\engine\training.py", line 1801, in predict_function  *
        return step_function(self, iterator)
    File "C:\Users\admin\anaconda3\lib\site-packages\keras\engine\training.py", line 1790, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\admin\anaconda3\lib\site-packages\keras\engine\training.py", line 1783, in run_step  **
        outputs = model.predict_step(data)
    File "C:\Users\admin\anaconda3\lib\site-packages\keras\engine\training.py", line 1751, in predict_step
        return self(x, training=False)
    File "C:\Users\admin\anaconda3\lib\site-packages\keras\utils\traceback_utils.py", line 67, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\admin\anaconda3\l