<a href="https://colab.research.google.com/github/v-enigma/DL_LabExperiments/blob/main/lab_9_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Predict the next word in a sentence using an RNN. Consider the following sentence
dataset:
The cat sat on the mat.
The dog sat on the rug.
The bird flew in the sky.
The cat jumped over the fence.
Then predict the next word for the prompt: “The cat sat on ___”
Follow these steps:
1- Text Preprocessing: tokenize the sentences and convert the words into numerical
representations (i.e., using integer encoding).
2- Model Building: build a simple RNN model using Keras/TensorFlow.
3- Training the Model: train the RNN to predict the next word given the previous words in
the sentence.
4- Prediction: use the trained model to predict the next word in a sentence.

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 1: Text Preprocessing
from tensorflow.keras.utils import to_categorical

# Training corpus: four short sentences the model learns to continue.
sentences = [
    "The cat sat on the mat",
    "The dog sat on the rug",
    "The bird flew in the sky",
    "The cat jumped over the fence"
]

# Build the vocabulary. The extra slot in total_words accounts for index 0,
# which the tokenizer never assigns to a word (pad_sequences fills with 0).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
total_words = 1 + len(tokenizer.word_index)
print(f"Total unique words: {total_words}")
print(f"Word index: {tokenizer.word_index}")

# Expand every encoded sentence into all of its prefixes of length >= 2,
# so each prefix provides "context words + the word that follows them".
input_sequences = []
for encoded in tokenizer.texts_to_sequences(sentences):
    input_sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

# Left-pad all prefixes to the length of the longest one.
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# The last column is the target word; everything before it is the context.
X, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the targets to match the softmax output of the model.
y = to_categorical(labels, num_classes=total_words)

# Step 2: Model Building
# Size of the learned word vectors.
embedding_dim = 10
# Number of context tokens per sample (padded sequence minus the target word).
# Also used later to pad the prediction prompt to the same width.
input_length = max_sequence_len - 1

# Embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential()
# `input_length` is intentionally not passed to Embedding: the argument is
# deprecated in recent Keras releases; the input shape is supplied via build().
model.add(Embedding(total_words, embedding_dim))
model.add(SimpleRNN(32))
model.add(Dense(total_words, activation='softmax'))

# Build explicitly so summary() can report output shapes and parameter counts.
model.build(input_shape=(None, input_length))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()

# Step 3: Training the Model
# Fit on the (context, next-word) pairs; `history` keeps per-epoch metrics.
history = model.fit(X, y, epochs=100, verbose=1)

# Step 4: Prediction
# Encode the prompt "The cat sat on" and left-pad it to the model's input width.
test_text = "The cat sat on"
encoded_prompt = tokenizer.texts_to_sequences([test_text])[0]
token_list = pad_sequences([encoded_prompt], maxlen=input_length, padding='pre')

# Probability distribution over the vocabulary for the next word.
predicted = model.predict(token_list, verbose=0)
predicted_word_index = int(np.argmax(predicted, axis=1)[0])

# Map the winning index back to its word with the tokenizer's reverse
# dictionary. Index 0 has no word (it is the padding value), so report that
# case explicitly instead of printing nothing.
predicted_word = tokenizer.index_word.get(predicted_word_index)
print(f"\nInput text: '{test_text}'")
if predicted_word is None:
    print("Predicted next word: <none> (model selected the padding index)")
else:
    print(f"Predicted next word: '{predicted_word}'")

# Analysis of all possible predictions
print("\nAll word probabilities:")
predictions = predicted[0]
# Pair every vocabulary word with its predicted probability, skipping any
# index outside the model's output range.
word_predictions = [
    (word, predictions[index])
    for index, word in tokenizer.index_word.items()
    if 0 < index < len(predictions)
]
word_predictions.sort(key=lambda x: x[1], reverse=True)

# Display top 3 predictions
for word, prob in word_predictions[:3]:
    print(f"'{word}': {prob:.4f}")

Total unique words: 15
Word index: {'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'rug': 7, 'bird': 8, 'flew': 9, 'in': 10, 'sky': 11, 'jumped': 12, 'over': 13, 'fence': 14}




None
Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.7201
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.0000e+00 - loss: 2.7085
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.0500 - loss: 2.6972
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.2000 - loss: 2.6859
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.2000 - loss: 2.6747
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - accuracy: 0.2500 - loss: 2.6634
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.2500 - loss: 2.6520
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - accuracy: 0.3000 - loss: 2.6404
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━