In [3]:
import pandas as pd
import re

In [7]:
!pip install regex

Collecting regex
  Downloading regex-2024.5.15-cp311-cp311-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     ------------------ ------------------- 20.5/42.0 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 503.7 kB/s eta 0:00:00
Downloading regex-2024.5.15-cp311-cp311-win_amd64.whl (268 kB)
   ---------------------------------------- 0.0/269.0 kB ? eta -:--:--
   ---------------------------- ---------- 194.6/269.0 kB 11.5 MB/s eta 0:00:01
   ---------------------------------------- 269.0/269.0 kB 8.3 MB/s eta 0:00:00
Installing collected packages: regex
Successfully installed regex-2024.5.15


In [8]:
import tensorflow as tf 
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences 
import numpy as np 
import regex as re 

In [9]:
def file_to_sentence_list(file_path, encoding='utf-8'):
    """Read a text file and split its contents into sentences.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        A list of sentences with surrounding whitespace removed. Pieces that
        are empty after stripping are dropped, so a blank file yields [].
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()

    # Split on whitespace that follows '.', '!' or '?'. The lookbehind keeps
    # the punctuation mark attached to the sentence it terminates.
    pieces = re.split(r'(?<=[.!?])\s+', text)

    stripped = (piece.strip() for piece in pieces)
    return [sentence for sentence in stripped if sentence]
  
# Load the corpus file and break it into a list of sentences
file_path = 'pizza.txt'
text_data = file_to_sentence_list(file_path=file_path)

In [10]:
# Build the word -> integer vocabulary from the sentences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# Vocabulary size plus one extra slot on top of the entries in word_index
total_words = 1 + len(tokenizer.word_index)

In [11]:
# Create input sequences
# Encode all sentences in a single tokenizer call, then expand every sentence
# into its n-gram prefixes: [w1, w2], [w1, w2, w3], ..., [w1, ..., wn].
encoded_sentences = tokenizer.texts_to_sequences(text_data)
input_sequences = [
    tokens[:end]
    for tokens in encoded_sentences
    for end in range(2, len(tokens) + 1)
]

# Fail with a clear message instead of the opaque "max() arg is an empty
# sequence" when the corpus has no sentence of at least two known words.
if not input_sequences:
    raise ValueError(
        "No training sequences could be built: the corpus needs at least "
        "one sentence containing two or more words.")

# Pad sequences and split into predictors and label
max_sequence_len = max(len(seq) for seq in input_sequences)
# pad_sequences already returns a NumPy array, so no extra np.array() copy.
input_sequences = pad_sequences(
    input_sequences, maxlen=max_sequence_len, padding='pre')
# Every column except the last is the context; the last column is the word
# to predict.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Convert target data to one-hot encoding
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [12]:

# Define the model
EMBEDDING_DIM = 10   # size of each learned word vector
LSTM_UNITS = 128     # number of units in the recurrent layer

model = Sequential([
    # Declare the input length with an explicit Input layer; the Embedding
    # `input_length` argument is deprecated in Keras 3.
    tf.keras.Input(shape=(max_sequence_len-1,)),
    Embedding(total_words, EMBEDDING_DIM),
    LSTM(LSTM_UNITS),
    # One softmax output per vocabulary index, matching the one-hot targets.
    Dense(total_words, activation='softmax'),
])
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])



In [13]:
# Fit the network on the context/next-word pairs, showing per-epoch progress
model.fit(x=X, y=y, epochs=500, verbose=1)

Epoch 1/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.0390 - loss: 6.3335
Epoch 2/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.0460 - loss: 5.7104
Epoch 3/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0653 - loss: 5.6448
Epoch 4/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0463 - loss: 5.6801
Epoch 5/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.0457 - loss: 5.6576
Epoch 6/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.0616 - loss: 5.6200
Epoch 7/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.0524 - loss: 5.6135
Epoch 8/500
[1m51/51[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.0621 - loss: 5.4530
Epoch 9/500
[1m51/51[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c41221f950>

In [14]:
# Inspect the padded n-gram matrix built earlier (rows are left-padded with 0)
input_sequences

array([[  0,   0,   0, ...,   0,   3,   1],
       [  0,   0,   0, ...,   3,   1, 233],
       [  0,   0,   0, ...,   1, 233,   2],
       ...,
       [  0,   0,   0, ..., 685,   4,  19],
       [  0,   0,   0, ...,   4,  19,  72],
       [  0,   0,   0, ...,  19,  72, 686]])

In [15]:
seed_text = "The world"
next_words = 25

# Greedy decoding: feed the running text back into the model and append the
# single most probable next word, up to `next_words` times.
for _ in range(next_words):
    # Encode the text so far and left-pad it to the context length the model
    # was built with (same 'pre' padding as the training sequences).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences(
        [token_list], maxlen=max_sequence_len-1, padding='pre')

    # predict() yields one probability row for the single input; reduce the
    # argmax to a plain int instead of comparing ints to a 1-element array.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's reverse vocabulary, so a direct lookup
    # replaces scanning word_index on every step.
    output_word = tokenizer.index_word.get(predicted, "")
    if not output_word:
        # The index has no word (e.g. the padding index 0): stop rather than
        # keep appending blanks to the text.
        break

    seed_text += " " + output_word

print(seed_text)

The world of pizza has created a market that supports not only traditional pizza establishments but also various related industries including food delivery services pizza ingredient suppliers
