In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load IMDb dataset
dataset, info = tfds.load("imdb_reviews", split=["train", "test"], as_supervised=True, with_info=True)

# Convert dataset to lists
train_data, test_data = dataset
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

for text, label in train_data:
    train_sentences.append(str(text.numpy().decode("utf-8")))
    train_labels.append(label.numpy())

for text, label in test_data:
    test_sentences.append(str(text.numpy().decode("utf-8")))
    test_labels.append(label.numpy())

# Tokenization and Padding
vocab_size = 10000  # Only keep the top 10,000 most frequent words
max_length = 200  # Maximum length of input sequences

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_sentences)

X_train = tokenizer.texts_to_sequences(train_sentences)
X_test = tokenizer.texts_to_sequences(test_sentences)

X_train = pad_sequences(X_train, maxlen=max_length, padding="post", truncating="post")
X_test = pad_sequences(X_test, maxlen=max_length, padding="post", truncating="post")

y_train = tf.convert_to_tensor(train_labels)
y_test = tf.convert_to_tensor(test_labels)

# Define LSTM Model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length),  # Word Embedding Layer
    LSTM(64, return_sequences=False),  # LSTM Layer with 64 units
    Dense(1, activation="sigmoid")  # Output layer (binary classification)
])

# Compile the model
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Make Predictions
sample_text = ["The movie was fantastic! I loved it.", "It was a terrible film, I hated it."]
sample_sequences = tokenizer.texts_to_sequences(sample_text)
sample_padded = pad_sequences(sample_sequences, maxlen=max_length, padding="post", truncating="post")

predictions = model.predict(sample_padded)
predicted_labels = ["Positive" if p > 0.5 else "Negative" for p in predictions]

for text, label in zip(sample_text, predicted_labels):
    print(f"Review: {text} → Sentiment: {label}")


2025-03-17 17:03:46.495059: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-03-17 17:04:01.743674: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
2025-03-17 17:04:09.669199: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Epoch 1/10




[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 131ms/step - accuracy: 0.5247 - loss: 0.6899 - val_accuracy: 0.5000 - val_loss: 0.6989
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 133ms/step - accuracy: 0.5784 - loss: 0.6670 - val_accuracy: 0.7745 - val_loss: 0.5191
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 136ms/step - accuracy: 0.8470 - loss: 0.3846 - val_accuracy: 0.8540 - val_loss: 0.3524
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 130ms/step - accuracy: 0.9141 - loss: 0.2360 - val_accuracy: 0.8395 - val_loss: 0.4002
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 134ms/step - accuracy: 0.9452 - loss: 0.1672 - val_accuracy: 0.8456 - val_loss: 0.4443
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 156ms/step - accuracy: 0.9676 - loss: 0.1130 - val_accuracy: 0.8432 - val_loss: 0.4407
Epoch 7/10
[1m