In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence



In [None]:
## Load the IMDB dataset

# Only the `vocab_size` most frequent words keep their own index
vocab_size = 10000
train_split, test_split = imdb.load_data(num_words=vocab_size)
X_train, y_train = train_split
X_test, y_test = test_split
print(f'X_train: {X_train.shape} - y_train: {y_train.shape}  X_test:{X_test.shape} - y_test:{y_test.shape}')

In [None]:
# Review X_train: look at the first training sample exactly as loaded (still integer-encoded)

sample_review = X_train[0]
sample_review  # bare last expression so the notebook displays the value


In [None]:
# Build the reverse lookup: integer index -> word

word_to_index = imdb.get_word_index()  # full vocabulary as a {word: index} dict

# Invert the dict, shifting every index by 3 (the reason is explained just below)
index_to_word = dict((idx + 3, token) for token, idx in word_to_index.items())


''' 
Why add + 3 to the index?

imdb.get_word_index() does not account for the reserved indices for 
special tokens (<PAD>, <START>, <UNK>, <UNUSED>). 

When loading the dataset using imdb.load_data, the data is preprocessed to include reserved tokens:
0 for <PAD>: Used for padding sequences to the same length.
1 for <START>: Marks the beginning of a review.
2 for <UNK>: Replaces words that are not in the top num_words most frequent words.
3 for <UNUSED>: Reserved for future use.

As a result, actual words in the reviews (e.g., X_train, X_test) are encoded
starting at index 4, so every index from get_word_index() must be shifted by 3 to align.

'''
# Decode X_train[item]-> word indices to words

def decode_review(review_index=0, reviews=None, mapping=None):
    """
    Decodes one integer-encoded IMDB review back into text.

    Reserved indices are rendered as their special tokens (0 -> <PAD>,
    1 -> <START>, 2 -> <UNK>, 3 -> <UNUSED>) instead of all collapsing to
    '<UNK>'. Any other index missing from the mapping is rendered as '<UNK>'.

    Args:
        review_index (int): Position of the review inside ``reviews``. Defaults to 0.
        reviews (sequence, optional): Collection of encoded reviews to read from.
            Defaults to the notebook-level ``X_train``.
        mapping (dict, optional): Index-to-word lookup. Defaults to the
            notebook-level ``index_to_word``.

    Returns:
        str: The decoded review, tokens joined by single spaces
        (an empty string for an empty review).
    """
    # Reserved indices used by the loaded reviews; they are not keys of index_to_word,
    # whose keys start at 4 because of the +3 shift.
    special_tokens = {0: '<PAD>', 1: '<START>', 2: '<UNK>', 3: '<UNUSED>'}

    # Fall back to the notebook globals only when the caller did not supply data,
    # so existing calls such as decode_review(0) keep working.
    if reviews is None:
        reviews = X_train
    if mapping is None:
        mapping = index_to_word

    return " ".join(
        special_tokens[index] if index in special_tokens else mapping.get(index, '<UNK>')
        for index in reviews[review_index]
    )

# Example usage: decode one training review and show it
item = 0
decoded_text = decode_review(item)
print(f"X_train[{item}] decoded review: {decoded_text}")

In [None]:
# Pad / truncate every review in X_train and X_test to one common length

max_len = 500  # number of tokens kept per review

# padding='pre' and truncating='pre' are the pad_sequences defaults, spelled out here:
# shorter reviews get zeros prepended, longer reviews keep only their last `max_len` tokens.
# (Written as a comment rather than a bare string so the cell does not echo it as output.)
X_train = sequence.pad_sequences(X_train, maxlen=max_len, padding='pre', truncating='pre')
X_test = sequence.pad_sequences(X_test, maxlen=max_len, padding='pre', truncating='pre')

In [None]:
# Train Simple RNN

# Seed Python, NumPy and TensorFlow RNGs before any weights are created so that
# repeated runs start from the same initialisation (best-effort reproducibility).
tf.keras.utils.set_random_seed(42)

# Take the sequence length from the padded training data instead of repeating a
# literal, so the model input can never drift out of sync with the padding step.
sequence_length = X_train.shape[1]

# Initialize the model
model = Sequential()

# Embedding layer: converts integer indices into dense vectors of fixed size (128).
# The `input_length` argument is deprecated in recent Keras releases and is not
# needed here: model.build() below fixes the input shape.
model.add(Embedding(vocab_size, 128))

# SimpleRNN layer: RNN with 128 neurons, L2 penalty on the input kernel
model.add(SimpleRNN(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Dropout layer for regularization
model.add(Dropout(0.2))  # 20% dropout rate

# Dense output layer with a single sigmoid neuron (binary classification)
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))

# Build the model; the input shape must match the padded training data: (batch, sequence_length)
model.build(input_shape=(None, sequence_length))

optimizer = Adam(learning_rate=1e-4)

model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Print the layer-by-layer overview of the compiled model
model.summary()

In [None]:
# Setting up EarlyStopping (the class is imported in the notebook's first cell)

# Stop training once val_loss has gone 2 consecutive epochs without improving,
# then restore the weights from the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

In [None]:
# Train the model with EarlyStopping

# Keep the object returned by fit() so the per-epoch loss/accuracy values remain
# available for inspection or plotting after training.
history = model.fit(
    X_train,
    y_train,
    epochs=10,                   # upper bound; early stopping may end training sooner
    batch_size=32,
    validation_split=0.2,        # hold out 20% of the training samples to compute val_loss
    callbacks=[early_stopping],
)

In [None]:
# Save the model file

# NOTE(review): the .h5 suffix presumably selects Keras' HDF5 format — confirm that
# whatever loads 'rnn_imdb.h5' later expects this path and format before changing it.
model.save('rnn_imdb.h5')