# Next Word Prediction:

### Importing The Required Libraries:

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:
"""
    Dataset: http://www.gutenberg.org/cache/epub/5200/pg5200.txt
    Remove all the unnecessary data and label it as Metamorphosis-clean.
    The starting and ending lines should be as follows.

"""


# Corpus used for this run: conv.txt. To train on the Gutenberg text described
# above instead, point the open() call at "metamorphosis_clean.txt".
# The context manager closes the file handle as soon as the lines are read,
# instead of leaving it open for the lifetime of the kernel.
with open("conv.txt", "r", encoding="utf8") as file2:
    # Iterating a text file yields one string per line, newline included.
    lines = list(file2)

print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  Introductions

The Last Line:  



### Cleaning the data:

In [3]:
# Join all lines once with a single space between them. The join does not
# depend on a loop variable, so repeating it per line only redid the same work.
data = ' '.join(lines)

# Drop line terminators and the UTF-8 byte-order mark left over from the file.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

"Introductions Hello. My name’s Peter. What’s your name? My name is Where are you from Janet? I am from Seattle. Where are you from? I am from Madrid. Are you American Yes, I am. Are you Spanish? Yes I am. Hello and goodbye - three short CONVERSATIONS hello Hello, Peter. How are you? Fine, thanks. How are you? I'm fine, thank you. Goodbye Goodbye, Janet See y"

In [4]:
import string

# Translation table: every ASCII punctuation code point maps to the space
# code point, so punctuation is blanked out rather than deleted.
translator = dict.fromkeys(map(ord, string.punctuation), ord(' '))
new_data = data.translate(translator)

new_data[:500]

'Introductions Hello  My name’s Peter  What’s your name  My name is Where are you from Janet  I am from Seattle  Where are you from  I am from Madrid  Are you American Yes  I am  Are you Spanish  Yes I am  Hello and goodbye   three short CONVERSATIONS hello Hello  Peter  How are you  Fine  thanks  How are you  I m fine  thank you  Goodbye Goodbye  Janet See you tomorrow  Bye bye  Have a nice evening  Thanks  you too  Thanks  What time is it  Excuse me  Can you tell me the time  please  Yes  of co'

In [5]:
# Keep the first occurrence of every whitespace-separated token, in order.
# dict.fromkeys preserves insertion order and does the membership check in
# constant time, replacing the quadratic `if i not in z` list scan.
z = list(dict.fromkeys(data.split()))

# NOTE(review): this de-duplicates `data` (punctuation still attached), not the
# punctuation-stripped `new_data` computed earlier, which is never used.
# NOTE(review): dropping repeated words also drops repeated word contexts,
# which limits what a next-word model can learn -- confirm this is intended.
data = ' '.join(z)
data[:500]

"Introductions Hello. My name’s Peter. What’s your name? name is Where are you from Janet? I am Seattle. from? Madrid. Are American Yes, am. Spanish? Yes Hello and goodbye - three short CONVERSATIONS hello Hello, How you? Fine, thanks. I'm fine, thank you. Goodbye Goodbye, Janet See tomorrow! Bye bye. Have a nice evening. Thanks, too! Thanks. What time it? Excuse me. Can tell me the time, please? of course. It's seven o’clock. Thank No problem. It’s half rupees three. You're welcome. Shopping for"

### Tokenization:

In [6]:
# Build the word -> integer index from the cleaned corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
# The context manager flushes and closes the pickle file deterministically
# instead of leaving an open handle behind.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the whole corpus as one flat list of word indices.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[1493, 215, 216, 1494, 398, 761, 399, 217, 217, 46]

In [7]:
# Number of output classes for the model. Per the Keras Tokenizer docs, word
# indices start at 1 and index 0 is reserved, hence the extra slot.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

3803


In [8]:
# Pair every token id with the id that follows it: each row is [current, next].
sequences = [
    [current_id, next_id]
    for current_id, next_id in zip(sequence_data, sequence_data[1:])
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  6876


array([[1493,  215],
       [ 215,  216],
       [ 216, 1494],
       [1494,  398],
       [ 398,  761],
       [ 761,  399],
       [ 399,  217],
       [ 217,  217],
       [ 217,   46],
       [  46,  104]])

In [9]:
# First column of each pair is the input word, second column is the target.
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

In [10]:
# Show the first five inputs next to the first five targets.
for label, values in (("The Data is: ", X), ("The responses are: ", y)):
    print(label, values[:5])

The Data is:  [1493  215  216 1494  398]
The responses are:  [ 215  216 1494  398  761]


In [11]:
# One-hot encode the target column straight from the bigram array instead of
# from `y` itself. `y` was built from the second element of every row of
# `sequences`, so the first run is unchanged, but re-running this cell no
# longer feeds an already one-hot matrix back into to_categorical.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Creating the Model:

In [12]:
# Single-word input -> 10-d embedding -> two stacked LSTMs -> dense head that
# outputs a probability for every entry in the vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(1000, return_sequences=True),  # keep the time axis for the next LSTM
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [13]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             38030     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 3803)              3806803   
Total params: 16,893,833
Trainable params: 16,893,833
Non-trainable params: 0
_________________________________________________________________


### Plot The Model:

In [14]:
from tensorflow import keras
# Import plot_model from the same tf.keras namespace as the rest of the
# notebook. The previous `keras.utils.vis_utils` import came from the
# standalone keras package, was never used, and is missing in newer releases.
from tensorflow.keras.utils import plot_model

# Needs pydot and graphviz installed; writes the architecture to model.png.
plot_model(model, to_file='model.png', show_layer_names=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


### Callbacks:

In [15]:
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, TensorBoard

# Write the model to nextword1.h5 only when the monitored training loss improves.
checkpoint = ModelCheckpoint(
    "nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

# Scale the learning rate by 0.2 after 3 epochs without a better loss,
# with 0.0001 as the floor.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# TensorBoard event files go to this directory.
logdir = 'logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

### Compile The Model:

In [16]:
# Targets are one-hot vectors, so use categorical cross-entropy with Adam.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

### Fit The Model:

In [17]:
%time
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs
Epoch 1/150

Epoch 00001: loss improved from inf to 8.24852, saving model to nextword1.h5
Epoch 2/150

Epoch 00002: loss improved from 8.24852 to 8.19774, saving model to nextword1.h5
Epoch 3/150

Epoch 00003: loss improved from 8.19774 to 8.06041, saving model to nextword1.h5
Epoch 4/150

Epoch 00004: loss improved from 8.06041 to 7.81136, saving model to nextword1.h5
Epoch 5/150

Epoch 00005: loss improved from 7.81136 to 7.49432, saving model to nextword1.h5
Epoch 6/150

Epoch 00006: loss improved from 7.49432 to 7.18081, saving model to nextword1.h5
Epoch 7/150

Epoch 00007: loss improved from 7.18081 to 6.91833, saving model to nextword1.h5
Epoch 8/150

Epoch 00008: loss improved from 6.91833 to 6.68395, saving model to nextword1.h5
Epoch 9/150

Epoch 00009: loss improved from 6.68395 to 6.44893, saving model to nextword1.h5
Epoch 10/150

Epoch 00010: loss improved from 6.44893 to 6.21033, saving model to nextword1.h5

Epoch 50/150

Epoch 00050: loss improved from 2.34258 to 2.29906, saving model to nextword1.h5
Epoch 51/150

Epoch 00051: loss improved from 2.29906 to 2.26469, saving model to nextword1.h5
Epoch 52/150

Epoch 00052: loss improved from 2.26469 to 2.21732, saving model to nextword1.h5
Epoch 53/150

Epoch 00053: loss improved from 2.21732 to 2.19651, saving model to nextword1.h5
Epoch 54/150

Epoch 00054: loss improved from 2.19651 to 2.15227, saving model to nextword1.h5
Epoch 55/150

Epoch 00055: loss improved from 2.15227 to 2.11294, saving model to nextword1.h5
Epoch 56/150

Epoch 00056: loss improved from 2.11294 to 2.07672, saving model to nextword1.h5
Epoch 57/150

Epoch 00057: loss improved from 2.07672 to 2.03787, saving model to nextword1.h5
Epoch 58/150

Epoch 00058: loss improved from 2.03787 to 1.99079, saving model to nextword1.h5
Epoch 59/150

Epoch 00059: loss improved from 1.99079 to 1.95772, saving model to nextword1.h5
Epoch 60/150

Epoch 00060: loss improved from 1.95

Epoch 101/150

Epoch 00101: loss improved from 1.42462 to 1.40966, saving model to nextword1.h5
Epoch 102/150

Epoch 00102: loss improved from 1.40966 to 1.40445, saving model to nextword1.h5
Epoch 103/150

Epoch 00103: loss improved from 1.40445 to 1.39286, saving model to nextword1.h5
Epoch 104/150

Epoch 00104: loss improved from 1.39286 to 1.37595, saving model to nextword1.h5
Epoch 105/150

Epoch 00105: loss improved from 1.37595 to 1.36039, saving model to nextword1.h5
Epoch 106/150

Epoch 00106: loss did not improve from 1.36039
Epoch 107/150

Epoch 00107: loss improved from 1.36039 to 1.35335, saving model to nextword1.h5
Epoch 108/150

Epoch 00108: loss improved from 1.35335 to 1.35323, saving model to nextword1.h5
Epoch 109/150

Epoch 00109: loss did not improve from 1.35323
Epoch 110/150

Epoch 00110: loss did not improve from 1.35323
Epoch 111/150

Epoch 00111: loss did not improve from 1.35323

Epoch 00111: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026

<tensorflow.python.keras.callbacks.History at 0x17d0d3370>