
# LSTM Example – Sentiment Analysis on IMDB Reviews  

This notebook demonstrates building a **Bidirectional LSTM** using **TensorFlow/Keras** to classify movie reviews by sentiment.

**Learning goals**  
1. Load the IMDB dataset via `tensorflow_datasets`.  
2. Tokenise & pad text, build train/val/test splits.  
3. Baseline bag‑of‑words logistic regression.  
4. Build, train & tune an LSTM.  
5. Evaluate with accuracy & confusion matrix.  
6. Save model + tokenizer.


In [None]:

# !pip install -q tensorflow tensorflow_datasets scikit-learn matplotlib seaborn
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np, matplotlib.pyplot as plt


In [None]:

# Load the labelled IMDB reviews as (text, label) pairs; `info` carries the
# split metadata used below to size the shuffle buffer and the 80/20 split.
(ds_train_raw, ds_test_raw), info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    with_info=True,
    as_supervised=True,
)

SEED = 42  # fixed so the train/validation membership survives a kernel restart
num_train_examples = info.splits['train'].num_examples
# A buffer at least as large as the dataset is what tf.data requires for a
# uniform shuffle; a smaller buffer only mixes examples locally.
BUFFER = num_train_examples

# reshuffle_each_iteration=False is essential here: take()/skip() below each
# re-iterate the shuffled dataset, and a different order per pass would let
# validation examples leak into the training split.
ds_train_raw = ds_train_raw.shuffle(BUFFER, seed=SEED, reshuffle_each_iteration=False)

# First 80% of the shuffled stream -> training, remaining 20% -> validation.
train_size = int(0.8 * num_train_examples)
ds_val_raw = ds_train_raw.skip(train_size)
ds_train_raw = ds_train_raw.take(train_size)


### Baseline - TF‑IDF Bag‑of‑Words + Logistic Regression

In [None]:
# ------------------------------------------------------------------
# Baseline  —  TF‑IDF Bag‑of‑Words + Logistic Regression
# ------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Collect raw texts and labels from the TensorFlow-datasets splits
def _texts_and_labels(ds):
    """Materialise a dataset of (text, label) pairs into two parallel lists.

    The dataset is walked exactly once, so every text is guaranteed to sit at
    the same index as its own label regardless of how the dataset orders its
    elements between iterations.

    Args:
        ds: iterable yielding (text, label) tensor pairs; each text must
            expose ``.numpy()`` returning bytes and each label ``.numpy()``
            returning something ``int()`` accepts.

    Returns:
        (texts, labels): a list of decoded ``str`` and a list of ``int``.
    """
    texts, labels = [], []
    for text, label in ds:
        # Plain ASCII codec name: a typographic hyphen in 'utf-8' only works
        # by accident of codec-name normalisation.
        texts.append(text.numpy().decode('utf-8'))
        labels.append(int(label.numpy()))
    return texts, labels


train_texts, train_labels = _texts_and_labels(ds_train_raw)   # ds_train_raw from earlier
val_texts, val_labels = _texts_and_labels(ds_val_raw)
test_texts, test_labels = _texts_and_labels(ds_test_raw)

# 2. Text → TF‑IDF vectors
# Use TfidfVectorizer 
# create X_train, X_val, X_test

#  Train Logistic Regression

#  Evaluate

# Print validation accuracy, test accuracy, classification report, confusion matrix


### Model with bidirectional LSTM

In [None]:

# Size of the word index kept by the tokenizer. Named so the Embedding layer
# built later can use the same value for its input dimension instead of
# repeating a magic number.
VOCAB_SIZE = 20000
# Every review is padded/truncated to this many token ids.
MAXLEN = 300

# Words outside the kept vocabulary are mapped to the "<OOV>" token.
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")

# Fit on the training split only, so no vocabulary statistics from the
# validation or test reviews influence the word index.
train_reviews = [text.numpy().decode() for text, _ in ds_train_raw]
tokenizer.fit_on_texts(train_reviews)
def encode(text, label):
    """Turn one raw review into a fixed-length sequence of token ids.

    Runs eagerly (it calls ``.numpy()``), so inside a ``tf.data`` pipeline it
    must be invoked through ``tf.py_function``.

    Args:
        text: scalar string tensor holding the review.
        label: the review's label; passed through untouched.

    Returns:
        (ids, label) where ``ids`` has length ``MAXLEN``. Shorter reviews are
        padded at the end; longer ones are cut according to
        ``pad_sequences``' default truncation policy.
    """
    review = text.numpy().decode()
    token_ids = tokenizer.texts_to_sequences([review])  # batch of one
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=MAXLEN, padding='post'
    )
    return padded[0], label
def tf_encode(text, label):
    """Graph-friendly wrapper around ``encode`` for use with ``Dataset.map``.

    Args:
        text: scalar string tensor holding the review.
        label: scalar label tensor.

    Returns:
        (ids, label): an int64 tensor of static shape ``[MAXLEN]`` and an
        int64 scalar.
    """
    encoded, target = tf.py_function(
        encode, inp=[text, label], Tout=[tf.int64, tf.int64]
    )
    # Outputs of py_function come back without a static shape, so declare it
    # explicitly for the batching and model code downstream.
    encoded.set_shape([MAXLEN])
    target.set_shape([])
    return encoded, target
batch = 64

# cache() sits directly after the map so the Python-side tokenisation in
# tf_encode (and the upstream shuffle/skip/take) is executed once per example
# rather than once per example per epoch.
train_ds = (
    ds_train_raw
    .map(tf_encode)
    .cache()
    # The upstream shuffle is frozen to keep the train/val split stable, so
    # reshuffle the cached examples here to vary batch composition per epoch.
    .shuffle(train_size)
    .batch(batch)
    .prefetch(tf.data.AUTOTUNE)
)

# Validation and test data keep a fixed order; only caching is added.
val_ds = (
    ds_val_raw
    .map(tf_encode)
    .cache()
    .batch(batch)
    .prefetch(tf.data.AUTOTUNE)
)
test_ds = (
    ds_test_raw
    .map(tf_encode)
    .cache()
    .batch(batch)
    .prefetch(tf.data.AUTOTUNE)
)


In [None]:

# Prepare the LSTM model with tf.keras.Sequential
# Add Embedding layer, Bidirectional LSTM layers, Dense layers
# Compile and train the model

In [None]:

# Accumulators intended for the ground-truth and predicted labels.
y_true = []
y_pred = []
# Evaluate the model
# Print evaluation reports

In [None]:
### Save the model

In [None]:

# Persist the trained network; `model` is the network built and trained in
# the cells above.
model.save('imdb_lstm_model.keras')

# The tokenizer must be saved alongside the model: without the same word
# index, new text cannot be mapped to the ids the network was trained on.
# The encoding is stated explicitly so the file does not depend on the
# platform's locale default.
with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())


### Compare the LSTM and TF-IDF + LR evaluation results
Provide your analysis of the observations.