In [2]:
# -------------------------------
# LAB 8: LSTM Next Word Prediction
# -------------------------------

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# -------------------------------
# 1) Text Dataset (FAQs)
# -------------------------------
# FAQ corpus used as the training text, assembled one line per entry.
# The final empty entry reproduces the newline that terminates the text.
# NOTE(review): the "... (use full text from your description) ..." entry is
# a placeholder and is tokenized as ordinary training text — replace it with
# the real FAQ content before drawing conclusions from the model.
faqs = "\n".join((
    "About the Program",
    "What is the course fee for  Data Science Mentorship Program (DSMP 2023)",
    "The course follows a monthly subscription model where you have to make monthly payments of Rs 799/month.",
    "What is the total duration of the course?",
    "... (use full text from your description) ...",
    "Discussion on Job hunting strategies",
    "",
))

# -------------------------------
# 2) Tokenize Text
# -------------------------------
# Fit the tokenizer on the whole corpus so every distinct word gets an index.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([faqs])

# Vocabulary size, computed once and reused below and by the model:
# the number of known words plus one extra slot (the pad value 0 produced by
# pad_sequences also has to be a valid class / embedding row).
vocab_size = len(tokenizer.word_index) + 1

# Create input sequences for LSTM: for every line of the corpus, emit each
# prefix of length >= 2, so the last token of a prefix is the word to predict
# from the tokens before it.
input_sequences = []
for sentence in faqs.split('\n'):
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    input_sequences.extend(
        tokenized_sentence[:end]
        for end in range(2, len(tokenized_sentence) + 1)
    )

# Fail with an actionable message instead of "max() arg is an empty sequence".
if not input_sequences:
    raise ValueError(
        "No training sequences could be built: the corpus needs at least "
        "one line containing two or more known words."
    )

# Pad sequences on the left so the target word is always the last column.
max_len = max(len(seq) for seq in input_sequences)
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Split X and y: everything but the last column is the input context,
# the last column is the next-word label.
X = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]

# One-hot encode y to match the categorical_crossentropy loss.
y = tf.keras.utils.to_categorical(y, num_classes=vocab_size)

print("Vocabulary size:", vocab_size)

# -------------------------------
# 3) Build LSTM Model
# -------------------------------
# Next-word classifier: embedding -> two stacked LSTMs -> softmax over the
# vocabulary.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only triggers a warning there);
# the input shape is fixed by the explicit model.build(...) call below.
model.add(Embedding(vocab_size, 100))
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(150))
# One output probability per vocabulary index.
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Inputs are the padded sequences minus the label column: max_len - 1 tokens.
model.build(input_shape=(None, max_len - 1))
model.summary()

# -------------------------------
# 4) Train LSTM Model
# -------------------------------
# Train on the full (X, y) set for 100 epochs with per-epoch progress output;
# the object returned by fit() is kept in `history`.
history = model.fit(
    x=X,
    y=y,
    epochs=100,
    verbose=1,
)

# -------------------------------
# 5) Function: Predict Next Word (1 word)
# -------------------------------
def predict_next_word(model, tokenizer, text_seq, max_len=max_len):
    """Return the most probable next word for ``text_seq``.

    Args:
        model: Trained model whose ``predict`` output is a distribution over
            vocabulary indices for each input row.
        tokenizer: Fitted tokenizer used to encode ``text_seq`` and to map
            the predicted index back to a word.
        text_seq: Seed text to continue.
        max_len: Length of the padded training sequences (label included);
            the model input is ``max_len - 1`` tokens long.

    Returns:
        The predicted word, or ``None`` when the predicted index has no entry
        in the tokenizer's vocabulary (for example index 0, the pad value).
    """
    tokenized = tokenizer.texts_to_sequences([text_seq])[0]
    # Left-pad (and left-truncate) to the exact input length the model expects.
    padded = pad_sequences([tokenized], maxlen=max_len - 1, padding='pre')
    # A single row is predicted, so take row 0 of the batch explicitly.
    probabilities = model.predict(padded, verbose=0)[0]
    predicted_index = int(np.argmax(probabilities))
    # index_word is the tokenizer's reverse mapping; a direct lookup replaces
    # a linear scan over word_index.
    return tokenizer.index_word.get(predicted_index)

# -------------------------------
# 6) Example: Predict 1 Next Word
# -------------------------------
# Seed phrase shared by the single-word and multi-word examples.
seed_text = "what is the fee"
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text_seq=seed_text,
)
print(f"Next word prediction: '{next_word}'")

# -------------------------------
# 7) Generate 5 Words Sequentially
# -------------------------------
def generate_words(model, tokenizer, seed_text, n_words=5, max_len=max_len):
    """Extend ``seed_text`` by repeatedly predicting the next word.

    Each predicted word is appended to the running text, which is then fed
    back in as the seed for the next prediction.

    Args:
        model: Trained model passed through to ``predict_next_word``.
        tokenizer: Fitted tokenizer passed through to ``predict_next_word``.
        seed_text: Text to start from.
        n_words: Maximum number of words to append.
        max_len: Padded training sequence length, passed through to
            ``predict_next_word``.

    Returns:
        ``seed_text`` followed by up to ``n_words`` space-separated predicted
        words. Fewer words are appended if a prediction yields no word.
    """
    text = seed_text
    for _ in range(n_words):
        word = predict_next_word(model, tokenizer, text, max_len)
        if word is None:
            # No vocabulary word for the predicted index: stop here instead
            # of raising TypeError on string concatenation with None.
            break
        text = f"{text} {word}"
    return text

# Continue the seed phrase by five predicted words and show the result.
generated_text = generate_words(
    model=model,
    tokenizer=tokenizer,
    seed_text=seed_text,
    n_words=5,
)
print(f"\nGenerated sequence (5 words): {generated_text}")


Vocabulary size: 42




Epoch 1/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 4s 70ms/step - accuracy: 0.0000e+00 - loss: 3.7393
Epoch 2/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 73ms/step - accuracy: 0.1141 - loss: 3.7227
Epoch 3/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 65ms/step - accuracy: 0.0892 - loss: 3.7024
Epoch 4/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.0892 - loss: 3.6655
Epoch 5/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 76ms/step - accuracy: 0.1141 - loss: 3.5859
Epoch 6/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 67ms/step - accuracy: 0.1037 - loss: 3.5456
Epoch 7/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 66ms/step - accuracy: 0.0996 - loss: 3.5140
Epoch 8/100
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 64ms/step - accuracy: 0.1037 - loss: 3.4553
Epoch 9/100
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[