<a href="https://colab.research.google.com/github/venkatamogili/NLP/blob/main/Differentiating_LSTM_and_RNN_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

# Set Seeds for Reproducibility
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Enhanced Dataset with more examples
sentences = [
    "The beginning of the movie was slow and boring, yet the final scenes were thrilling and exciting.",
    "While the film had great visuals, the storyline lacked depth and left the audience disappointed.",
    "The acting was phenomenal, and the movie had an emotional ending that left everyone in tears.",
    "Absolutely terrible movie with no redeeming qualities whatsoever.",
    "A masterpiece of cinema that everyone should watch.",
    "The plot was confusing and the pacing was awful.",
    "Brilliant performances and stunning cinematography throughout.",
    "Started strong but fell apart in the second half.",
    "An emotional rollercoaster that ends on a high note.",
    "Boring and predictable from start to finish."
]

labels = [1, 0, 1, 0, 1, 0, 1, 0, 1, 0]

# Tokenization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Padding
max_len = max(len(seq) for seq in sequences)
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')

X = np.array(padded_sequences, dtype=np.int32)
y = np.array(labels, dtype=np.int32)

vocab_size = len(tokenizer.word_index) + 1

print(f"Vocabulary size: {vocab_size}")
print(f"Max sequence length: {max_len}")
print(f"Training samples: {len(X)}")

# Improved RNN Model with Dropout
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    SimpleRNN(32, activation='tanh', return_sequences=False),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Improved LSTM Model with Dropout
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    LSTM(32, activation='tanh', return_sequences=False),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train with more epochs
print("\nTraining RNN...")
rnn_history = rnn_model.fit(X, y, epochs=50, verbose=0, validation_split=0.2)
print(f"Final RNN Accuracy: {rnn_history.history['accuracy'][-1]:.4f}")

print("\nTraining LSTM...")
lstm_history = lstm_model.fit(X, y, epochs=50, verbose=0, validation_split=0.2)
print(f"Final LSTM Accuracy: {lstm_history.history['accuracy'][-1]:.4f}")

# Test multiple sentences
test_sentences = [
    "The movie started off slow and confusing, but ended beautifully.",
    "Absolutely terrible from beginning to end.",
    "A stunning masterpiece with incredible performances."
]

print("\n" + "="*60)
print("PREDICTIONS")
print("="*60)

for test_sentence in test_sentences:
    print(f"\nTest: '{test_sentence}'")

    test_seq = tokenizer.texts_to_sequences([test_sentence])
    test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')

    rnn_pred = rnn_model.predict(test_seq, verbose=0)
    rnn_sentiment = "Positive" if rnn_pred[0][0] > 0.5 else "Negative"

    lstm_pred = lstm_model.predict(test_seq, verbose=0)
    lstm_sentiment = "Positive" if lstm_pred[0][0] > 0.5 else "Negative"

    print(f"  RNN:  {rnn_sentiment:8s} (confidence: {rnn_pred[0][0]:.4f})")
    print(f"  LSTM: {lstm_sentiment:8s} (confidence: {lstm_pred[0][0]:.4f})")

Vocabulary size: 74
Max sequence length: 17
Training samples: 10

Training RNN...




Final RNN Accuracy: 1.0000

Training LSTM...
Final LSTM Accuracy: 1.0000

PREDICTIONS

Test: 'The movie started off slow and confusing, but ended beautifully.'
  RNN:  Positive (confidence: 0.6361)
  LSTM: Negative (confidence: 0.2233)

Test: 'Absolutely terrible from beginning to end.'
  RNN:  Negative (confidence: 0.2130)
  LSTM: Negative (confidence: 0.1890)

Test: 'A stunning masterpiece with incredible performances.'
  RNN:  Positive (confidence: 0.5855)
  LSTM: Positive (confidence: 0.9898)


In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

# Set Seeds
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Generate larger synthetic dataset
positive_templates = [
    "The movie was {adj1} and {adj2}, with {feature} that {verb}.",
    "Absolutely {adj1} film with {adj2} {feature}.",
    "A {adj1} masterpiece that {verb} the audience.",
    "{feature} was {adj1} and the ending {verb}.",
    "Brilliant {feature} and {adj2} performances throughout."
]

negative_templates = [
    "The movie was {adj1} and {adj2}, with {feature} that {verb}.",
    "Absolutely {adj1} film with {adj2} {feature}.",
    "The {feature} was {adj1} and {adj2}.",
    "{adj1} and {adj2} from start to finish.",
    "Terrible {feature} that {verb} the audience."
]

positive_words = {
    'adj1': ['amazing', 'brilliant', 'fantastic', 'wonderful', 'outstanding', 'spectacular', 'excellent', 'superb', 'incredible', 'marvelous'],
    'adj2': ['engaging', 'thrilling', 'beautiful', 'powerful', 'moving', 'captivating', 'stunning', 'emotional', 'heartwarming', 'uplifting'],
    'feature': ['cinematography', 'acting', 'storyline', 'directing', 'soundtrack', 'script', 'visuals', 'performances', 'plot', 'dialogue'],
    'verb': ['captivated', 'amazed', 'moved', 'impressed', 'touched', 'inspired', 'delighted', 'entertained', 'mesmerized', 'thrilled']
}

negative_words = {
    'adj1': ['terrible', 'awful', 'boring', 'disappointing', 'dreadful', 'horrible', 'poor', 'weak', 'mediocre', 'uninspired'],
    'adj2': ['confusing', 'slow', 'predictable', 'shallow', 'tedious', 'lifeless', 'bland', 'lackluster', 'forgettable', 'clichéd'],
    'feature': ['cinematography', 'acting', 'storyline', 'directing', 'soundtrack', 'script', 'visuals', 'performances', 'plot', 'dialogue'],
    'verb': ['disappointed', 'bored', 'confused', 'frustrated', 'annoyed', 'let down', 'failed', 'alienated', 'exhausted', 'irritated']
}

def generate_sentences(templates, word_dict, n=500):
    sentences = []
    for _ in range(n):
        template = random.choice(templates)
        sentence = template.format(
            adj1=random.choice(word_dict['adj1']),
            adj2=random.choice(word_dict['adj2']),
            feature=random.choice(word_dict['feature']),
            verb=random.choice(word_dict['verb'])
        )
        sentences.append(sentence)
    return sentences

# Generate 1000 positive and 1000 negative examples
positive_sentences = generate_sentences(positive_templates, positive_words, 1000)
negative_sentences = generate_sentences(negative_templates, negative_words, 1000)

sentences = positive_sentences + negative_sentences
labels = [1] * 1000 + [0] * 1000

# Shuffle
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences, labels = zip(*combined)

print(f"Total training samples: {len(sentences)}")
print(f"Sample positive: {sentences[0]}")
print(f"Sample negative: {sentences[-1]}")

# Tokenization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

max_len = 30  # Fixed length for efficiency
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X = np.array(padded_sequences, dtype=np.int32)
y = np.array(labels, dtype=np.int32)

vocab_size = len(tokenizer.word_index) + 1

# Train/Test Split
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"\nVocabulary size: {vocab_size}")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# RNN Model
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    SimpleRNN(32, activation='tanh'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# LSTM Model
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    LSTM(32, activation='tanh'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train RNN
print("\n" + "="*60)
print("Training RNN...")
print("="*60)
rnn_history = rnn_model.fit(X_train, y_train, epochs=10, batch_size=32,
                             validation_split=0.2, verbose=1)

# Train LSTM
print("\n" + "="*60)
print("Training LSTM...")
print("="*60)
lstm_history = lstm_model.fit(X_train, y_train, epochs=10, batch_size=32,
                               validation_split=0.2, verbose=1)

# Evaluate
print("\n" + "="*60)
print("EVALUATION ON TEST SET")
print("="*60)

rnn_loss, rnn_acc = rnn_model.evaluate(X_test, y_test, verbose=0)
lstm_loss, lstm_acc = lstm_model.evaluate(X_test, y_test, verbose=0)

print(f"\nRNN  - Test Accuracy: {rnn_acc:.4f}, Test Loss: {rnn_loss:.4f}")
print(f"LSTM - Test Accuracy: {lstm_acc:.4f}, Test Loss: {lstm_loss:.4f}")

# Test sentences
test_sentences = [
    "The movie was brilliant and captivating with amazing performances.",
    "Absolutely terrible film with boring and confusing plot.",
    "The acting was wonderful and the soundtrack was beautiful."
]

print("\n" + "="*60)
print("PREDICTIONS")
print("="*60)

for test_sentence in test_sentences:
    test_seq = tokenizer.texts_to_sequences([test_sentence])
    test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')

    rnn_pred = rnn_model.predict(test_seq, verbose=0)
    lstm_pred = lstm_model.predict(test_seq, verbose=0)

    print(f"\nTest: '{test_sentence}'")
    print(f"  RNN:  {'Positive' if rnn_pred[0][0] > 0.5 else 'Negative':8s} (confidence: {rnn_pred[0][0]:.4f})")
    print(f"  LSTM: {'Positive' if lstm_pred[0][0] > 0.5 else 'Negative':8s} (confidence: {lstm_pred[0][0]:.4f})")

Total training samples: 2000
Sample positive: uninspired and tedious from start to finish.
Sample negative: The movie was boring and predictable, with cinematography that irritated.

Vocabulary size: 90
Training samples: 1600
Testing samples: 400

Training RNN...
Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 18ms/step - accuracy: 0.7063 - loss: 0.5796 - val_accuracy: 1.0000 - val_loss: 0.0926
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0789 - val_accuracy: 1.0000 - val_loss: 0.0126
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0162 - val_accuracy: 1.0000 - val_loss: 0.0046
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 1.0000 - loss: 0.0080 - val_accuracy: 1.0000 - val_loss: 0.0024
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accur

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random

# Set Seeds
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Create dataset with LONG-RANGE DEPENDENCIES and NEGATIONS
# This will challenge RNN but LSTM should handle better

def generate_complex_sentences(n=1000):
    """
    Generate sentences where sentiment depends on words far apart in the sequence.
    Uses negations, contrasts, and long-distance dependencies.
    """
    sentences = []
    labels = []

    # Positive starts, negative endings (label: 0 - Negative overall)
    neg_templates = [
        "The movie started with {pos1} scenes and {pos2} acting, however after the first hour it became {neg1} and {neg2}, ultimately leaving me {neg3}.",
        "Initially the film seemed {pos1} with {pos2} cinematography, but unfortunately the second half was {neg1} and the ending was {neg2}, making it {neg3} overall.",
        "Despite having {pos1} visuals and {pos2} music at the beginning, the movie deteriorated into a {neg1} mess with {neg2} pacing and {neg3} conclusion.",
        "The opening was {pos1} and the characters seemed {pos2}, yet as the story progressed everything became {neg1} and {neg2}, resulting in a {neg3} experience.",
        "Although it began with {pos1} promise and {pos2} performances, the film sadly turned {neg1} with a {neg2} plot and {neg3} ending.",
    ]

    # Negative starts, positive endings (label: 1 - Positive overall)
    pos_templates = [
        "The movie started with {neg1} pacing and {neg2} dialogue, however the final act was absolutely {pos1} and {pos2}, leaving me thoroughly {pos3}.",
        "Initially the film was {neg1} with {neg2} acting, but thankfully the second half became {pos1} and the conclusion was {pos2}, making it {pos3} overall.",
        "Despite having {neg1} beginning and {neg2} characters early on, the movie transformed into a {pos1} masterpiece with {pos2} emotional depth and {pos3} finale.",
        "The opening was {neg1} and the setup seemed {neg2}, yet as the story unfolded everything became {pos1} and {pos2}, resulting in a {pos3} experience.",
        "Although it began with {neg1} scenes and {neg2} writing, the film remarkably became {pos1} with a {pos2} climax and {pos3} ending.",
    ]

    positive_words = {
        'pos1': ['brilliant', 'amazing', 'fantastic', 'wonderful', 'stunning', 'excellent', 'superb', 'outstanding', 'remarkable', 'incredible'],
        'pos2': ['captivating', 'engaging', 'beautiful', 'powerful', 'moving', 'thrilling', 'emotional', 'touching', 'inspiring', 'uplifting'],
        'pos3': ['satisfied', 'impressed', 'delighted', 'moved', 'entertained', 'inspired', 'amazed', 'touched', 'fulfilled', 'overjoyed']
    }

    negative_words = {
        'neg1': ['boring', 'terrible', 'awful', 'disappointing', 'dull', 'weak', 'poor', 'mediocre', 'tedious', 'lackluster'],
        'neg2': ['confusing', 'frustrating', 'predictable', 'slow', 'shallow', 'lifeless', 'bland', 'uninspired', 'clichéd', 'forgettable'],
        'neg3': ['disappointed', 'unsatisfied', 'frustrated', 'bored', 'let down', 'annoyed', 'dissatisfied', 'underwhelmed', 'regretful', 'dismayed']
    }

    # Generate negative overall (starts positive, ends negative)
    for _ in range(n // 2):
        template = random.choice(neg_templates)
        sentence = template.format(
            pos1=random.choice(positive_words['pos1']),
            pos2=random.choice(positive_words['pos2']),
            neg1=random.choice(negative_words['neg1']),
            neg2=random.choice(negative_words['neg2']),
            neg3=random.choice(negative_words['neg3'])
        )
        sentences.append(sentence)
        labels.append(0)  # Negative overall

    # Generate positive overall (starts negative, ends positive)
    for _ in range(n // 2):
        template = random.choice(pos_templates)
        sentence = template.format(
            neg1=random.choice(negative_words['neg1']),
            neg2=random.choice(negative_words['neg2']),
            pos1=random.choice(positive_words['pos1']),
            pos2=random.choice(positive_words['pos2']),
            pos3=random.choice(positive_words['pos3'])
        )
        sentences.append(sentence)
        labels.append(1)  # Positive overall

    return sentences, labels

# Generate 2000 samples with long-range dependencies
sentences, labels = generate_complex_sentences(2000)

# Shuffle
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences, labels = zip(*combined)

print(f"Total samples: {len(sentences)}")
print(f"\nExample NEGATIVE (starts positive, ends negative):")
neg_example = [s for s, l in zip(sentences[:10], labels[:10]) if l == 0][0]
print(f"{neg_example}\n")
print(f"Example POSITIVE (starts negative, ends positive):")
pos_example = [s for s, l in zip(sentences[:10], labels[:10]) if l == 1][0]
print(f"{pos_example}\n")

# Tokenization
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Use longer max_len to preserve the full context
max_len = max(len(seq) for seq in sequences)
print(f"Maximum sequence length: {max_len}")

padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X = np.array(padded_sequences, dtype=np.int32)
y = np.array(labels, dtype=np.int32)

vocab_size = len(tokenizer.word_index) + 1

# Train/Test Split (80/20)
split_idx = int(0.8 * len(X))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]

print(f"Vocabulary size: {vocab_size}")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")

# RNN Model (will struggle with long sequences)
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    SimpleRNN(32, activation='tanh'),  # No return_sequences - only final state
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# LSTM Model (should handle long-range dependencies better)
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    LSTM(32, activation='tanh'),  # Gates help maintain long-term memory
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train RNN
print("\n" + "="*60)
print("Training RNN...")
print("="*60)
rnn_history = rnn_model.fit(X_train, y_train, epochs=15, batch_size=32,
                             validation_split=0.2, verbose=1)

# Train LSTM
print("\n" + "="*60)
print("Training LSTM...")
print("="*60)
lstm_history = lstm_model.fit(X_train, y_train, epochs=15, batch_size=32,
                               validation_split=0.2, verbose=1)

# Evaluate
print("\n" + "="*60)
print("EVALUATION ON TEST SET")
print("="*60)

rnn_loss, rnn_acc = rnn_model.evaluate(X_test, y_test, verbose=0)
lstm_loss, lstm_acc = lstm_model.evaluate(X_test, y_test, verbose=0)

print(f"\nRNN  - Test Accuracy: {rnn_acc:.4f}, Test Loss: {rnn_loss:.4f}")
print(f"LSTM - Test Accuracy: {lstm_acc:.4f}, Test Loss: {lstm_loss:.4f}")
print(f"\nPerformance Gap: {(lstm_acc - rnn_acc) * 100:.2f}% better with LSTM")

# Test sentences with long-range dependencies
test_sentences = [
    "The movie started with brilliant scenes and amazing acting, however after the first hour it became boring and terrible, ultimately leaving me disappointed.",
    "Initially the film was terrible with boring dialogue, but thankfully the second half became fantastic and the conclusion was beautiful, making it wonderful overall.",
    "Despite having stunning visuals at the beginning, the movie deteriorated into a confusing mess with slow pacing.",
]

print("\n" + "="*60)
print("PREDICTIONS ON COMPLEX SENTENCES")
print("="*60)

for test_sentence in test_sentences:
    test_seq = tokenizer.texts_to_sequences([test_sentence])
    test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')

    rnn_pred = rnn_model.predict(test_seq, verbose=0)
    lstm_pred = lstm_model.predict(test_seq, verbose=0)

    print(f"\nTest: '{test_sentence[:80]}...'")
    print(f"  RNN:  {'Positive' if rnn_pred[0][0] > 0.5 else 'Negative':8s} (confidence: {rnn_pred[0][0]:.4f})")
    print(f"  LSTM: {'Positive' if lstm_pred[0][0] > 0.5 else 'Negative':8s} (confidence: {lstm_pred[0][0]:.4f})")

Total samples: 2000

Example NEGATIVE (starts positive, ends negative):
Although it began with wonderful promise and powerful performances, the film sadly turned terrible with a shallow plot and dissatisfied ending.

Example POSITIVE (starts negative, ends positive):
Initially the film was disappointing with frustrating acting, but thankfully the second half became amazing and the conclusion was touching, making it satisfied overall.

Maximum sequence length: 25
Vocabulary size: 137
Training samples: 1600
Testing samples: 400

Training RNN...
Epoch 1/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step - accuracy: 0.8267 - loss: 0.4741 - val_accuracy: 1.0000 - val_loss: 0.0348
Epoch 2/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 1.0000 - loss: 0.0291 - val_accuracy: 1.0000 - val_loss: 0.0057
Epoch 3/15
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 1.0000 - loss: 0.0076 - val_accur




Test: 'The movie started with brilliant scenes and amazing acting, however after the fi...'
  RNN:  Negative (confidence: 0.0001)
  LSTM: Negative (confidence: 0.0000)

Test: 'Initially the film was terrible with boring dialogue, but thankfully the second ...'
  RNN:  Positive (confidence: 0.9996)
  LSTM: Positive (confidence: 0.9998)

Test: 'Despite having stunning visuals at the beginning, the movie deteriorated into a ...'
  RNN:  Negative (confidence: 0.0007)
  LSTM: Negative (confidence: 0.0000)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import random
from tqdm import tqdm  # For progress bar

# Set Seeds
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

print("Generating 1 million training samples with long-range dependencies...")
print("This may take a few minutes...\n")

def generate_complex_sentences_batch(n=1000000):
    """
    Generate 1 million sentences where sentiment depends on words far apart.
    Optimized for speed using list comprehension and batch processing.
    """

    # Negative overall templates (starts positive, ends negative)
    neg_templates = [
        "The movie started with {pos1} scenes and {pos2} acting, however after the first hour it became {neg1} and {neg2}, ultimately leaving me {neg3}.",
        "Initially the film seemed {pos1} with {pos2} cinematography, but unfortunately the second half was {neg1} and the ending was {neg2}, making it {neg3} overall.",
        "Despite having {pos1} visuals and {pos2} music at the beginning, the movie deteriorated into a {neg1} mess with {neg2} pacing and {neg3} conclusion.",
        "The opening was {pos1} and the characters seemed {pos2}, yet as the story progressed everything became {neg1} and {neg2}, resulting in a {neg3} experience.",
        "Although it began with {pos1} promise and {pos2} performances, the film sadly turned {neg1} with a {neg2} plot and {neg3} ending.",
        "What started as a {pos1} journey with {pos2} direction quickly devolved into {neg1} chaos with {neg2} writing and a {neg3} finale.",
        "The film opened with {pos1} energy and {pos2} screenplay, but collapsed into {neg1} tedium with {neg2} execution and {neg3} payoff.",
    ]

    # Positive overall templates (starts negative, ends positive)
    pos_templates = [
        "The movie started with {neg1} pacing and {neg2} dialogue, however the final act was absolutely {pos1} and {pos2}, leaving me thoroughly {pos3}.",
        "Initially the film was {neg1} with {neg2} acting, but thankfully the second half became {pos1} and the conclusion was {pos2}, making it {pos3} overall.",
        "Despite having {neg1} beginning and {neg2} characters early on, the movie transformed into a {pos1} masterpiece with {pos2} emotional depth and {pos3} finale.",
        "The opening was {neg1} and the setup seemed {neg2}, yet as the story unfolded everything became {pos1} and {pos2}, resulting in a {pos3} experience.",
        "Although it began with {neg1} scenes and {neg2} writing, the film remarkably became {pos1} with a {pos2} climax and {pos3} ending.",
        "What started as a {neg1} slog with {neg2} characters eventually evolved into {pos1} brilliance with {pos2} depth and a {pos3} conclusion.",
        "The film opened with {neg1} confusion and {neg2} pacing, but transformed into {pos1} excellence with {pos2} resolution and {pos3} impact.",
    ]

    positive_words = {
        'pos1': ['brilliant', 'amazing', 'fantastic', 'wonderful', 'stunning', 'excellent', 'superb', 'outstanding', 'remarkable', 'incredible',
                 'spectacular', 'magnificent', 'phenomenal', 'extraordinary', 'breathtaking', 'dazzling', 'exceptional', 'glorious', 'splendid', 'marvelous'],
        'pos2': ['captivating', 'engaging', 'beautiful', 'powerful', 'moving', 'thrilling', 'emotional', 'touching', 'inspiring', 'uplifting',
                 'compelling', 'gripping', 'mesmerizing', 'enchanting', 'riveting', 'absorbing', 'enthralling', 'stirring', 'poignant', 'evocative'],
        'pos3': ['satisfied', 'impressed', 'delighted', 'moved', 'entertained', 'inspired', 'amazed', 'touched', 'fulfilled', 'overjoyed',
                 'elated', 'thrilled', 'enchanted', 'captivated', 'mesmerized', 'awestruck', 'gratified', 'pleased', 'content', 'euphoric']
    }

    negative_words = {
        'neg1': ['boring', 'terrible', 'awful', 'disappointing', 'dull', 'weak', 'poor', 'mediocre', 'tedious', 'lackluster',
                 'uninspired', 'lifeless', 'dreary', 'monotonous', 'insipid', 'vapid', 'stale', 'flat', 'tiresome', 'wearisome'],
        'neg2': ['confusing', 'frustrating', 'predictable', 'slow', 'shallow', 'lifeless', 'bland', 'uninspired', 'clichéd', 'forgettable',
                 'convoluted', 'disjointed', 'incoherent', 'muddled', 'derivative', 'hackneyed', 'trite', 'banal', 'formulaic', 'unoriginal'],
        'neg3': ['disappointed', 'unsatisfied', 'frustrated', 'bored', 'let down', 'annoyed', 'dissatisfied', 'underwhelmed', 'regretful', 'dismayed',
                 'disheartened', 'disillusioned', 'deflated', 'disenchanted', 'dejected', 'dispirited', 'displeased', 'aggrieved', 'vexed', 'irked']
    }

    sentences = []
    labels = []

    # Generate in batches for progress tracking
    batch_size = 100000
    num_batches = n // batch_size

    for batch in tqdm(range(num_batches), desc="Generating data"):
        batch_sentences = []
        batch_labels = []

        # Generate negative overall (starts positive, ends negative)
        for _ in range(batch_size // 2):
            template = random.choice(neg_templates)
            sentence = template.format(
                pos1=random.choice(positive_words['pos1']),
                pos2=random.choice(positive_words['pos2']),
                neg1=random.choice(negative_words['neg1']),
                neg2=random.choice(negative_words['neg2']),
                neg3=random.choice(negative_words['neg3'])
            )
            batch_sentences.append(sentence)
            batch_labels.append(0)

        # Generate positive overall (starts negative, ends positive)
        for _ in range(batch_size // 2):
            template = random.choice(pos_templates)
            sentence = template.format(
                neg1=random.choice(negative_words['neg1']),
                neg2=random.choice(negative_words['neg2']),
                pos1=random.choice(positive_words['pos1']),
                pos2=random.choice(positive_words['pos2']),
                pos3=random.choice(positive_words['pos3'])
            )
            batch_sentences.append(sentence)
            batch_labels.append(1)

        sentences.extend(batch_sentences)
        labels.extend(batch_labels)

    return sentences, labels

# Generate 1 million samples
sentences, labels = generate_complex_sentences_batch(1000000)

# Shuffle
print("\nShuffling data...")
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences, labels = zip(*combined)
sentences = list(sentences)
labels = list(labels)

print(f"Total samples generated: {len(sentences):,}")
print(f"\nExample NEGATIVE (starts positive, ends negative):")
neg_example = [s for s, l in zip(sentences[:100], labels[:100]) if l == 0][0]
print(f"{neg_example}\n")
print(f"Example POSITIVE (starts negative, ends positive):")
pos_example = [s for s, l in zip(sentences[:100], labels[:100]) if l == 1][0]
print(f"{pos_example}\n")

# Tokenization
print("Tokenizing sentences...")
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

# Use fixed max_len for efficiency
max_len = 40  # Most sentences will fit, truncate if needed
print(f"Using max sequence length: {max_len}")

print("Padding sequences...")
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

X = np.array(padded_sequences, dtype=np.int32)
y = np.array(labels, dtype=np.int32)

vocab_size = len(tokenizer.word_index) + 1

# Train/Validation/Test Split (70/15/15)
train_split = int(0.7 * len(X))
val_split = int(0.85 * len(X))

X_train, X_val, X_test = X[:train_split], X[train_split:val_split], X[val_split:]
y_train, y_val, y_test = y[:train_split], y[train_split:val_split], y[val_split:]

print(f"\nVocabulary size: {vocab_size:,}")
print(f"Training samples: {len(X_train):,}")
print(f"Validation samples: {len(X_val):,}")
print(f"Testing samples: {len(X_test):,}")

# RNN Model (will struggle with long sequences)
print("\nBuilding RNN model...")
rnn_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    SimpleRNN(32, activation='tanh'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# LSTM Model (should handle long-range dependencies better)
print("Building LSTM model...")
lstm_model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_len),
    LSTM(32, activation='tanh'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile
rnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("\n" + "="*60)
print("MODEL SUMMARIES")
print("="*60)
print("\nRNN Model:")
rnn_model.summary()
print("\nLSTM Model:")
lstm_model.summary()

# Train RNN
print("\n" + "="*60)
print("Training RNN on 1 Million Samples...")
print("="*60)
rnn_history = rnn_model.fit(
    X_train, y_train,
    epochs=5,  # Fewer epochs due to large dataset
    batch_size=256,  # Larger batch size for efficiency
    validation_data=(X_val, y_val),
    verbose=1
)

# Train LSTM
print("\n" + "="*60)
print("Training LSTM on 1 Million Samples...")
print("="*60)
lstm_history = lstm_model.fit(
    X_train, y_train,
    epochs=5,  # Fewer epochs due to large dataset
    batch_size=256,  # Larger batch size for efficiency
    validation_data=(X_val, y_val),
    verbose=1
)

# Evaluate on Test Set
print("\n" + "="*60)
print("FINAL EVALUATION ON TEST SET (150,000 samples)")
print("="*60)

rnn_loss, rnn_acc = rnn_model.evaluate(X_test, y_test, batch_size=256, verbose=1)
lstm_loss, lstm_acc = lstm_model.evaluate(X_test, y_test, batch_size=256, verbose=1)

print("\n" + "="*60)
print("RESULTS COMPARISON")
print("="*60)
print(f"\nRNN  - Test Accuracy: {rnn_acc*100:.2f}%, Test Loss: {rnn_loss:.4f}")
print(f"LSTM - Test Accuracy: {lstm_acc*100:.2f}%, Test Loss: {lstm_loss:.4f}")
print(f"\n{'LSTM WINS!' if lstm_acc > rnn_acc else 'RNN WINS!'}")
print(f"Performance Gap: {abs(lstm_acc - rnn_acc)*100:.2f}% ({'LSTM' if lstm_acc > rnn_acc else 'RNN'} is better)")

# Test on complex sentences
test_sentences = [
    "The movie started with brilliant scenes and amazing acting, however after the first hour it became boring and terrible, ultimately leaving me disappointed.",
    "Initially the film was terrible with boring dialogue, but thankfully the second half became fantastic and the conclusion was beautiful, making it wonderful overall.",
    "Despite having stunning visuals at the beginning, the movie deteriorated into a confusing mess with slow pacing and forgettable conclusion.",
    "What started as a tedious slog with poor characters eventually evolved into magnificent brilliance with powerful depth and a spectacular conclusion.",
    "Although it began with disappointing promise and weak performances, the film remarkably became outstanding with an excellent climax and amazing ending.",
]

print("\n" + "="*60)
print("PREDICTIONS ON COMPLEX TEST SENTENCES")
print("="*60)

for i, test_sentence in enumerate(test_sentences, 1):
    test_seq = tokenizer.texts_to_sequences([test_sentence])
    test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')

    rnn_pred = rnn_model.predict(test_seq, verbose=0)
    lstm_pred = lstm_model.predict(test_seq, verbose=0)

    # Determine actual label based on sentence structure
    actual = "Negative" if any(word in test_sentence.lower() for word in ["disappointed", "forgettable", "deteriorated"]) and i <= 3 else "Positive"

    rnn_result = 'Positive' if rnn_pred[0][0] > 0.5 else 'Negative'
    lstm_result = 'Positive' if lstm_pred[0][0] > 0.5 else 'Negative'

    print(f"\n[Test {i}] {test_sentence[:70]}...")
    print(f"  Actual:  {actual}")
    print(f"  RNN:     {rnn_result:8s} (confidence: {rnn_pred[0][0]:.4f}) {'✓' if rnn_result == actual else '✗'}")
    print(f"  LSTM:    {lstm_result:8s} (confidence: {lstm_pred[0][0]:.4f}) {'✓' if lstm_result == actual else '✗'}")

# Save models
print("\n" + "="*60)
print("Saving models...")
rnn_model.save('/home/claude/rnn_sentiment_1M.h5')
lstm_model.save('/home/claude/lstm_sentiment_1M.h5')
print("Models saved successfully!")
print("="*60)

Generating 1 million training samples with long-range dependencies...
This may take a few minutes...



Generating data: 100%|██████████| 10/10 [00:03<00:00,  2.56it/s]



Shuffling data...
Total samples generated: 1,000,000

Example NEGATIVE (starts positive, ends negative):
The opening was spectacular and the characters seemed stirring, yet as the story progressed everything became uninspired and hackneyed, resulting in a dissatisfied experience.

Example POSITIVE (starts negative, ends positive):
The movie started with disappointing pacing and convoluted dialogue, however the final act was absolutely superb and stirring, leaving me thoroughly overjoyed.

Tokenizing sentences...
Using max sequence length: 40
Padding sequences...

Vocabulary size: 216
Training samples: 700,000
Validation samples: 150,000
Testing samples: 150,000

Building RNN model...
Building LSTM model...

MODEL SUMMARIES

RNN Model:



LSTM Model:



Training RNN on 1 Million Samples...
Epoch 1/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 25ms/step - accuracy: 0.9956 - loss: 0.0241 - val_accuracy: 1.0000 - val_loss: 2.6361e-06
Epoch 2/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 25ms/step - accuracy: 1.0000 - loss: 6.0492e-06 - val_accuracy: 1.0000 - val_loss: 2.3585e-07
Epoch 3/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 25ms/step - accuracy: 1.0000 - loss: 9.9957e-07 - val_accuracy: 1.0000 - val_loss: 2.9362e-08
Epoch 4/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 25ms/step - accuracy: 1.0000 - loss: 2.1545e-07 - val_accuracy: 1.0000 - val_loss: 4.1439e-09
Epoch 5/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 27ms/step - accuracy: 1.0000 - loss: 5.4471e-08 - val_accuracy: 1.0000 - val_loss: 5.6968e-10

Training LSTM on 1 Million Samples...
Epoch 1/5
[1m2735/2735[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m