### Importing all Required Libraries

In [2]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

### Loading the dataset from system

In [3]:
text_file = open("1661-0.txt", encoding='UTF-8').read().lower() #reading the text file 

### Storing relevant data only (for better accuracy)

In [4]:
txtf=text_file[1273:] #storing the relevant text by slicing the about intro,preface,cover,index details from the dataset 

In [5]:
print('Length of text present in file :', len(txtf))

Length of text present in file : 580615


### Splitting the dataset into single words in order 

In [6]:
tokenizer=RegexpTokenizer(r'\w+')
wrds = tokenizer.tokenize(txtf)

In [7]:
#print(wrds)

### Making a dictionary containing every word in the data

In [8]:
uq_wrds= np.unique(wrds) #storing all unique words inside a list
uq_wrd_index= dict((j,i) for i,j in enumerate(uq_wrds)) #creating a dictionary with keys(list of unique words) and values(all words present in the dataset)

### Feature Engineering

In [9]:
wrd_len=5
prev_wrds=[]
next_wrds=[]
for i in range(len(wrds)-wrd_len):
    prev_wrds.append(wrds[i:(i+wrd_len)])
    next_wrds.append(wrds[i+wrd_len])
print(prev_wrds[0])
print('\n',next_wrds[0])

['to', 'sherlock', 'holmes', 'she', 'is']

 always


In [10]:
#creating a array to store the features
X = np.zeros((len(prev_wrds), wrd_len, len(uq_wrds)), dtype=bool)

In [11]:
#creating another array to store the corresponding label
Y = np.zeros((len(next_wrds), len(uq_wrds)), dtype=bool)

In [12]:
#storing 
for i, each_wrds in enumerate(prev_wrds):
    for j, each_wrds in enumerate(each_wrds):
        X[i, j, uq_wrd_index[each_wrds]] = 1
    Y[i, uq_wrd_index[next_wrds[i]]] = 1

In [13]:
print(X[0][0])  #a look at a single sequence of words

[False False False ... False False False]


### Bulding model

In [14]:
model=Sequential()
model.add(LSTM(128, input_shape=(wrd_len, len(uq_wrds))))  #using LSTM model, a powerful Recurrent Neural Network(RNN).
model.add(Dense(len(uq_wrds)))
model.add(Activation('softmax'))

### Training the model

In [23]:
optimizer=RMSprop(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
history= model.fit(X, Y, validation_split=0.05, batch_size=128, epochs=5, shuffle=True).history

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


### Saving the model for future use

In [24]:
model.save('keras_next_word_model.h5')
pickle.dump(history, open("history.p", "wb"))
model= load_model('keras_next_word_model.h5')
history= pickle.load(open("history.p", "rb"))

### Testing the Model

In [119]:
def prepare_input(txtf):
    x=np.zeros((1, wrd_len, len(uq_wrds)))
    for t, word in enumerate(txtf.split()):
        print(word)
        x[0, t, uq_wrd_index[word]] = 1
    return x
prepare_input("That which does not give".lower())

that
which
does
not
give


array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]])

In [120]:
#funtion for returning samples
def sample(preds, top_n=3): 
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds)
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)

    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [121]:
#funtion for prediction model
def predict_completions(txt,n=3):
    if txt=="":
        return("0")
    x=prepare_input(txt)
    preds=model.predict(x, verbose=0)[0]
    next_indices=sample(preds,n)
    return [uq_wrds[idx] for idx in next_indices]

### Sample Result

In [122]:
sent = input("Enter a sentence:- ")
seq = " ".join(tokenizer.tokenize(sent.lower())[0:5])  #using only first 5 words of the input
pc=predict_completions(seq, 10)  #creating a list of 10 predicted words using the model
print("Next predicted words:",pc)

Enter a sentence:- He was clearly so scared by his mischance in breaking the window
he
was
clearly
so
scared
Next predicted words: ['that', 'and', 'by', 'to', 'with', 'for', 'of', 'then', 'but', 'so']


#### Word 'by' is after word 'scared' in the input and the list of predicted words, word 'by' is present at 3rd position.
#### So, we can say that our model gave a correct prediction.