In [24]:
# -*- coding: utf-8 -*-
"""l01c01_introduction_to_colab_and_python.ipynb

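Automatically generated by Colab.

Note: the file name above and the link below refer to a Colab introduction
notebook, but the code in this file trains an LSTM next-word predictor on a
small FAQ text.

Original file is located at
    https://colab.research.google.com/github/tensorflow/examples/blob/master/courses/udacity_intro_to_tensorflow_for_deep_learning/l01c01_introduction_to_colab_and_python.ipynb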
"""

# =========================
# LSTM NEXT WORD PREDICTION
# GOOGLE COLAB CODE
# =========================

import tensorflow as tf
import numpy as np
import time
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

# =========================
# DATASET
# =========================

# Training corpus: one FAQ heading, question, answer, or syllabus entry per
# line. Every entry, including the last, is terminated by a newline.
faqs = (
    "About the Program\n"
    "What is the course fee for Data Science Mentorship Program\n"
    "The course follows a monthly subscription model\n"
    "The total duration of the course is 7 months\n"
    "What is the syllabus of the mentorship program\n"
    "Python Fundamentals\n"
    "Python libraries for Data Science\n"
    "Data Analysis\n"
    "SQL for Data Science\n"
    "Maths for Machine Learning\n"
    "ML Algorithms\n"
    "Practical ML\n"
    "Will Deep Learning and NLP be a part of this program\n"
    "No NLP and Deep Learning are not included\n"
    "What if I miss a live session\n"
    "Yes all sessions are recorded\n"
    "What is the language spoken by the instructor\n"
    "Hinglish\n"
)

# =========================
# TOKENIZATION
# =========================

# Learn the word -> integer index mapping from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([faqs])

# Encode each corpus line, then turn every prefix of two or more tokens into
# one training sample: all but the last token are the context, the last token
# is the word to predict. Lines with fewer than two tokens contribute nothing.
encoded_lines = tokenizer.texts_to_sequences(faqs.split("\n"))
input_sequences = [
    encoded[:end]
    for encoded in encoded_lines
    for end in range(2, len(encoded) + 1)
]

# Left-pad with zeros so every sample has the length of the longest prefix.
max_len = len(max(input_sequences, key=len))
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Split each padded row into context (all but last column) and target (last).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# +1 because word indices start at 1; index 0 is what the padding uses.
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(y, num_classes=vocab_size)

# =========================
# MODEL
# =========================

# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# training are repeatable across "Restart & Run All" (up to any remaining
# backend nondeterminism).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Word embeddings -> two stacked LSTMs -> softmax over the vocabulary.
#
# The input length is declared with an explicit Input layer instead of
# Embedding's `input_length` argument: Keras 3 deprecates and ignores that
# argument, which leaves the model unbuilt and makes `model.summary()` unable
# to report output shapes or parameter counts.
model = Sequential([
    tf.keras.Input(shape=(max_len - 1,)),
    Embedding(vocab_size, 100),
    LSTM(150, return_sequences=True),  # emit the full sequence for the next LSTM
    LSTM(150),                         # emit only the final state
    Dense(vocab_size, activation='softmax'),
])

# Targets are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

# =========================
# TRAIN MODEL
# =========================

# Fit on the complete set of prefix samples; no validation data is passed.
model.fit(
    x=X,
    y=y,
    epochs=100,
    verbose=1,
)

# ======================================================
# QUESTION 1: PREDICT NEXT WORD FOR GIVEN SENTENCE
# ======================================================

def predict_next_word(seed_text):
    """Return the model's most likely next word after ``seed_text``.

    The text is encoded with the fitted ``tokenizer``, left-padded to
    ``max_len - 1`` tokens (the context length the model was trained on) and
    passed to ``model`` as a batch of one.

    Args:
        seed_text: The text to continue.

    Returns:
        The predicted word, or ``None`` when the highest-scoring index has no
        entry in the tokenizer's vocabulary (for example the padding index 0).
    """
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    padded = pad_sequences([encoded], maxlen=max_len - 1, padding='pre')
    # predict() returns one row per input; take the single row explicitly
    # rather than relying on argmax flattening a 2-D array.
    probabilities = model.predict(padded, verbose=0)[0]
    best_index = int(np.argmax(probabilities))
    # index_word is the tokenizer's own index -> word map, so no linear scan
    # over word_index is needed.
    return tokenizer.index_word.get(best_index)


print("\n--- Question 1: Single Next Word Prediction ---")

input_text = "what is the course"
next_word = predict_next_word(input_text)

print("Input:", input_text)
if next_word is None:
    # Make the "no word" case visible instead of printing nothing at all.
    print("Predicted Next Word: <no vocabulary word predicted>")
else:
    print("Predicted Next Word:", next_word)

# ======================================================
# QUESTION 2 & 3: GENERATE 5 WORDS SEQUENTIALLY
# ======================================================

print("\n--- Question 2 & 3: Generate 5 Words ---")

text = "what is the course"
print("Starting Text:", text)

for step in range(1, 6):
    next_word = predict_next_word(text)
    if next_word is None:
        # The text would not change, so every further step would repeat the
        # same prediction; stop instead of printing identical lines.
        print(f"Step {step}: no vocabulary word predicted; stopping early")
        break

    # Feed the prediction back in: the extended text is the next step's input.
    text += " " + next_word
    print(f"Step {step}:", text)
    time.sleep(1)  # pause between steps so the output appears word by word



Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.0328 - loss: 4.0772  
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0906 - loss: 4.0554
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0750 - loss: 4.0160
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0945 - loss: 3.9076
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.0867 - loss: 3.8194
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0984 - loss: 3.7787
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - accuracy: 0.1023 - loss: 3.7047
Epoch 8/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.0930 - loss: 3.7644
Epoch 9/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m