In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import regex as re

In [None]:
# Path to the training corpus (plain text), relative to the working directory.
file_path = "pizza.txt"

# Read the whole corpus. The encoding is passed explicitly so the decoded text
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes pizza.txt is UTF-8 encoded -- confirm against the data file.
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# Split into sentences at every position that is preceded by '.', '!' or '?'
# followed by whitespace, trim each piece and drop the empty ones.
# The variable-width lookbehind relies on `re` being the third-party `regex`
# module (see the import cell).
data = [
    sentence
    for sentence in (piece.strip() for piece in re.split(r'(?<=[.!?]\s+)', text))
    if sentence
]


In [None]:
# Build the word -> integer-id vocabulary from the corpus sentences:
# fit_on_texts scans every sentence in `data` and populates tokenizer.word_index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

# Vocabulary size used by the model layers: one slot per entry of word_index,
# plus one extra because Keras word ids start at 1 and id 0 is reserved
# (it is never assigned to a word; pad_sequences pads with 0 by default).
total_words = 1 + len(tokenizer.word_index)

In [None]:
# Expand every sentence into its n-gram prefixes: for the token ids
# [t1, t2, ..., tn] this yields [t1, t2], [t1, t2, t3], ..., [t1, ..., tn].
# Sentences with fewer than two known tokens contribute nothing.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in (tokenizer.texts_to_sequences([sentence])[0] for sentence in data)
    for prefix_len in range(2, len(token_ids) + 1)
]

In [22]:
# Length of the longest n-gram; default=0 keeps max() from raising when the
# list of n-grams is empty.
max_sequence_len = max(map(len, input_sequences), default=0)

# Left-pad ("pre") every n-gram to max_sequence_len so all rows share one
# length and stack into a single 2-D integer array.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding="pre")
)

# Supervised pairs for next-word prediction:
#   X = every column except the last (the context word ids)
#   y = the last column (the id of the word to predict)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [23]:
# One-hot encode the next-word labels (one column per vocabulary id) for the
# categorical cross-entropy loss.
# The labels are read from the last column of the padded `input_sequences`
# rather than from `y` itself, so re-running this cell cannot one-hot encode
# an already encoded `y` a second time.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [24]:
# Next-word classifier: token ids -> embedding -> LSTM -> softmax over the vocabulary.
model = Sequential([
    # Each sample is an n-gram without its final token, i.e. max_sequence_len - 1 ids.
    # The shape is declared with an explicit Input layer instead of Embedding's
    # `input_length` argument, which Keras 3 deprecates.
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    # Map each of the total_words token ids to a dense 10-dimensional vector.
    Embedding(total_words, 10),
    # LSTM (recurrent) layer with 128 units.
    LSTM(128),
    # Fully connected output layer: one softmax probability per vocabulary id.
    Dense(total_words, activation="softmax"),
])

# Labels are one-hot vectors, hence categorical cross-entropy; accuracy is
# reported as an additional metric.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])