<a href="https://colab.research.google.com/github/vineetdave/LangChainTutorials/blob/main/AdvancedLSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Cell 1: Import Libraries
# We're adding new tools for a more robust workflow
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning category (Warning is the base class of all of them)
# to keep the notebook output readable.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above - narrow the category when debugging version issues.
warnings.filterwarnings('ignore', category=Warning)

In [None]:
#-------------------------------------------------------------------------------
# Cell 2: Define Sample Data (Now with more text!)
# A model needs more data to learn patterns, so the tiny story has been
# replaced with a larger, two-paragraph excerpt.
# `data` is the whole training corpus: Cell 3 fits the tokenizer on it and
# encodes it into the integer sequence from which training windows are cut.
data = """
Alice was beginning to get very tired of sitting by her sister on the bank,
and of having nothing to do: once or twice she had peeped into the
book her sister was reading, but it had no pictures or conversations in
it, 'and what is the use of a book,' thought Alice 'without pictures or
conversations?' So she was considering in her own mind (as well as she
could, for the hot day made her feel very sleepy and stupid), whether
the pleasure of making a daisy-chain would be worth the trouble of
getting up and picking the daisies, when suddenly a White Rabbit with
pink eyes ran close by her.

There was nothing so very remarkable in that; nor did Alice
think it so very much out of the way to hear the Rabbit say to
itself, 'Oh dear! Oh dear! I shall be late!' (when she thought
it over afterwards, it occurred to her that she ought to have
wondered at this, but at the time it all seemed quite natural);
but when the Rabbit actually took a watch out of its waistcoat-pocket,
and looked at it, and then hurried on, Alice started to her feet,
for it flashed across her mind that she had never before seen a
rabbit with either a waistcoat-pocket, or a watch to take out of it,
and burning with curiosity, she ran across the field after it,
and fortunately was just in time to see it pop down a large
rabbit-hole under the hedge.
"""

# Simple progress marker for the notebook output.
print("Data loaded (now with more text!).")

In [None]:
#-------------------------------------------------------------------------------
# Cell 3: Tokenize the Text
# Learn a word -> integer index from the corpus, then encode the whole corpus
# as one flat list of word indices.
corpus = [data]  # the tokenizer methods take a list of texts; we have one

tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# One slot per unique word plus one extra. vocab_size is reused below as the
# number of output classes and as the embedding's input dimension.
# NOTE(review): the +1 presumably accounts for index 0 being unused by
# word_index - confirm against the Tokenizer documentation.
vocab_size = 1 + len(tokenizer.word_index)

# A single text went in, so exactly one encoded sequence comes back.
(encoded_text,) = tokenizer.texts_to_sequences(corpus)

print(f"Total unique words (vocab size): {vocab_size}")

In [None]:
#-------------------------------------------------------------------------------
# Cell 4: Create Input Sequences and Targets (Longer Sequence)
# Slide a window of seq_length words across the encoded corpus. Every window
# is paired with the word that comes immediately after it, which is the
# target the model has to predict.
seq_length = 10  # Increased from 5 to 10 to give the model more context

# `end` is the position of the target word; the window is the seq_length
# indices directly before it.
sequences = [
    (encoded_text[end - seq_length:end], encoded_text[end])
    for end in range(seq_length, len(encoded_text))
]

print(f"Total number of sequences created: {len(sequences)}")

In [None]:
#-------------------------------------------------------------------------------
# Cell 5: Prepare Data for Keras (with Validation Split)
# Convert the (window, next word) pairs into arrays and hold part of them
# back as a validation set, so training can be monitored on examples that
# are not used for fitting.

# Separate the input windows from their target word indices.
windows, next_words = zip(*sequences)

X = np.array(windows)
# Expand each target index into a vector with vocab_size entries, matching
# the size of the model's softmax output.
y = to_categorical(next_words, num_classes=vocab_size)

# Split the data: 80% for training, 20% for validation.
# random_state is fixed so the same split is produced on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Shape of X_train (inputs): {X_train.shape}")
print(f"Shape of y_train (targets): {y_train.shape}")
print(f"Shape of X_val (validation inputs): {X_val.shape}")
print(f"Shape of y_val (validation targets): {y_val.shape}")

In [None]:
#-------------------------------------------------------------------------------
# Cell 6: Define the Improved Model Architecture
# We are now building a deeper, more robust model by:
# 1. Stacking two LSTM layers.
# 2. Adding Dropout layers to prevent overfitting.

embedding_dim = 100  # Increased dimensions for a richer representation
lstm_units = 150     # Increased memory units

model = Sequential()

# Input: every sample is a window of seq_length word indices.
# The shape is declared with an explicit Input instead of Embedding's
# `input_length` argument, which newer Keras releases deprecate and ignore.
# Declaring it up front builds the model immediately, so model.summary()
# below can report real output shapes and parameter counts.
model.add(tf.keras.Input(shape=(seq_length,)))

# Embedding Layer: maps each word index to a dense embedding_dim vector
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim))
# Dropout Layer: randomly zeroes 20% of its inputs during training
# to discourage memorization
model.add(Dropout(0.2))

# Layer 1: Stacked LSTM
# We set return_sequences=True so it passes its full output sequence
# to the next LSTM layer, not just the final summary.
model.add(LSTM(lstm_units, return_sequences=True))
model.add(Dropout(0.2))

# Layer 2: Final LSTM layer
# This one returns only the final summary vector.
model.add(LSTM(lstm_units))

# Output Layer: one softmax probability per vocabulary entry
model.add(Dense(vocab_size, activation='softmax'))

model.summary()

In [None]:

#-------------------------------------------------------------------------------
# Cell 7: Compile and Train the Model (with Early Stopping)
# Two things make this training run more robust:
# 1. validation_data: after every epoch the model is evaluated on the
#    held-out validation set.
# 2. callbacks: EarlyStopping ends training automatically once the
#    validation loss stops improving.

# Watch the validation loss; give up after 10 epochs without improvement
# and roll the model back to the best weights seen so far.
early_stopper = EarlyStopping(monitor='val_loss',
                              patience=10,
                              restore_best_weights=True,
                              verbose=1)

# Categorical cross-entropy pairs with the softmax output and the
# to_categorical-encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

print("Starting enhanced model training...")
# The epoch count is only an upper bound (500): the early-stopping callback
# is expected to end the run well before it is reached.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    callbacks=[early_stopper],
    epochs=500,
    batch_size=8,
    verbose=2,
)
print("Model training complete.")


In [None]:
#-------------------------------------------------------------------------------
# Cell 8: Define the Text Generation Function
def generate_text(seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` words predicted by the model.

    Relies on the module-level ``tokenizer``, ``model`` and ``seq_length``
    defined in the earlier cells.

    Args:
        seed_text: Text used as the starting context. It is encoded with
            ``tokenizer``; only the last ``seq_length`` encoded words are
            fed to the model.
        n_words: Number of words to append. Values <= 0 return the seed
            unchanged.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. A predicted index that has no vocabulary entry is shown
        as ``'?'``.
    """
    # Encode the seed once and keep extending the index list, instead of
    # re-tokenizing the whole growing string on every step.
    context = tokenizer.texts_to_sequences([seed_text])[0]
    # The tokenizer already maintains the index -> word mapping.
    index_to_word = tokenizer.index_word
    pieces = [seed_text]

    for _ in range(n_words):
        # Left-pad short contexts / keep only the most recent seq_length words.
        window = pad_sequences([context], maxlen=seq_length, truncating='pre')

        probabilities = model.predict(window, verbose=0)[0]
        # Greedy decoding: always take the most probable next word.
        next_index = int(np.argmax(probabilities))

        next_word = index_to_word.get(next_index)
        if next_word is None:
            # No word for this index: show a placeholder in the output but
            # keep it out of the context (the tokenizer strips '?' anyway).
            pieces.append('?')
            continue

        pieces.append(next_word)
        context.append(next_index)
        # Only the last seq_length indices can influence the next prediction.
        if len(context) > seq_length:
            context = context[-seq_length:]

    return " ".join(pieces)

print("Text generation function defined.")

In [None]:
#-------------------------------------------------------------------------------
# Cell 9: Generate New Text
# Now we test our new, more robust model. The generated text
# should (hopefully) be more coherent.

# The seed is built only from words that occur in the training text.
# The tokenizer is created without an out-of-vocabulary token, so any unknown
# seed word would be silently dropped and the model would be conditioned on
# a different context than the one printed below.
seed_text = "alice was beginning to get very tired of sitting by"
generated = generate_text(seed_text, 30)  # Generate 30 new words

print("\n--- SEED TEXT ---")
print(seed_text)
print("\n--- GENERATED TEXT (from Enhanced LSTM) ---")
print(generated)