In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

In [3]:
# Location of the plain-text training corpus.
file_path="pizza.txt"

# Read the whole corpus into memory.
# The encoding is pinned so decoding does not depend on the platform's
# default code page (which differs between Windows, Linux and macOS).
# NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Split the corpus into sentences: cut at every position that is preceded
# by '.', '!' or '?' followed by whitespace. `re` is the third-party `regex`
# package here, which is what allows the variable-width look-behind.
# Fragments that are empty after stripping are dropped.
data = [
    sentence.strip()
    for sentence in re.split(r'(?<=[.!?]\s+)', text)
    if sentence.strip()
]


In [4]:
# Build the word <-> integer-id vocabulary from the corpus sentences.
tokenizer = Tokenizer()

# Walk every sentence once so the tokenizer can count word frequencies and
# populate `word_index` (word -> integer id).
tokenizer.fit_on_texts(data)

# Vocabulary size used for the embedding/softmax dimensions.
# The extra 1 accounts for index 0, which the Keras Tokenizer never assigns
# to a word (word ids start at 1) and which is the value used for padding.
total_words = 1 + len(tokenizer.word_index)

In [5]:
# Expand every sentence into its growing token prefixes (n-grams).
# For a tokenized sentence [1,2,3,4,5] this yields
# [1,2], [1,2,3], [1,2,3,4], [1,2,3,4,5] - every prefix of length >= 2,
# so each training sample has at least one context token plus a target.
input_sequences = [
    sentence_tokens[:prefix_len]
    for sentence_tokens in (
        tokenizer.texts_to_sequences([line])[0] for line in data
    )
    for prefix_len in range(2, len(sentence_tokens) + 1)
]

In [6]:
# Length of the longest n-gram; falls back to 0 when no sequences exist.
sequence_lengths = [len(seq) for seq in input_sequences]
max_sequence_len = max(sequence_lengths, default=0)

# Left-pad ("pre") every n-gram so all rows share the same length and the
# most recent tokens stay aligned at the right-hand edge.
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding="pre",
)
input_sequences = np.array(padded_sequences)

# Split each padded row into model input and target:
#   X -> every token except the last one (the context)
#   y -> the last token (the word id to predict)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [7]:
# One-hot encode the target word ids so they match the softmax output
# (one column per vocabulary entry).
# The guard makes this cell safe to re-run: `y` is overwritten in place, so
# without it a second execution would one-hot encode the already-encoded
# matrix and produce a 3-D array that no longer fits the model.
if y.ndim == 1:
    y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [8]:
# Next-word prediction network, declared as a single layer list.
model = Sequential([
    # Maps each integer word id to a dense 10-dimensional vector.
    # The input length is one less than the padded n-gram length because
    # the last token of every n-gram is held out as the target.
    Embedding(total_words, 10, input_length=max_sequence_len-1),
    # Recurrent layer (Long Short-Term Memory) with 128 units that reads
    # the embedded context sequence.
    LSTM(128),
    # Fully connected output layer: one softmax probability per vocabulary id.
    Dense(total_words, activation="softmax"),
])

# Categorical cross-entropy pairs with the one-hot encoded targets;
# accuracy is tracked as a reporting metric during training.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [9]:

# Fit the network on the (context, next word) pairs for 500 epochs,
# printing per-epoch progress. The returned History object is the cell's
# last expression, so its repr is shown as the cell output.
model.fit(
    x=X,
    y=y,
    epochs=500,
    verbose=1,
)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.callbacks.History at 0x268e814f9a0>

In [13]:

# Generate next word predictions
# Starting from `seed_text`, repeatedly predict the most likely next word
# and append it, feeding the grown text back in as the new context.
seed_text = "Pizza have different "
next_words = 10

for _ in range(next_words):
    # Encode the current text and left-pad it to the model's input length
    # (one less than the padded n-gram length used for training).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')

    # verbose=0 keeps Keras from printing a progress bar on every iteration.
    predicted_probs = model.predict(token_list, verbose=0)
    predicted_index = int(np.argmax(predicted_probs))

    # Index 0 is the padding slot and has no entry in `index_word`; a plain
    # [] lookup would raise KeyError, so stop generating instead.
    predicted_word = tokenizer.index_word.get(predicted_index)
    if predicted_word is None:
        break

    # rstrip() avoids a double space when the text already ends in whitespace.
    seed_text = seed_text.rstrip() + " " + predicted_word

print("Next predicted words:", seed_text)

Next predicted words: Pizza have different  become a significant role in shaping the future of pizza
