In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam



In [2]:
## Load the IMDB dataset

vocab_size = 10000
(X_train,y_train), (X_test,y_test) = imdb.load_data(num_words=vocab_size)
print(f'X_train: {X_train.shape} - y_train: {y_train.shape}  X_test:{X_test.shape} - y_test:{y_test.shape}')

X_train: (25000,) - y_train: (25000,)  X_test:(25000,) - y_test:(25000,)


In [13]:
#Review X_train 

sample_review = X_train[0]
sample_review


1
14
47
8
30
31
7
4
249
108
7
4
5974
54
61
369
13
71
149
14
22
112
4
2401
311
12
16
3711
33
75
43
1829
296
4
86
320
35
534
19
263
4821
1301
4
1873
33
89
78
12
66
16
4
360
7
4
58
316
334
11
4
1716
43
645
662
8
257
85
1200
42
1228
2578
83
68
3912
15
36
165
1539
278
36
69
2
780
8
106
14
6905
1338
18
6
22
12
215
28
610
40
6
87
326
23
2300
21
23
22
12
272
40
57
31
11
4
22
47
6
2307
51
9
170
23
595
116
595
1352
13
191
79
638
89
2
14
9
8
106
607
624
35
534
6
227
7
129
113


In [4]:
# Mapping of word index back to words

word_to_index = imdb.get_word_index() # Returns all the word to index dict of vocab

index_to_word = {index+3:word for word,index in word_to_index.items()} #Reversing the word to index - index to word


''' 
Why add + 3 to the index?

imdb.get_word_index() does not account for the reserved indices for 
special tokens (<PAD>, <START>, <UNK>, <UNUSED>). 

When loading the dataset using imdb.load_data, the data is preprocessed to include reserved tokens:
0 for <PAD>: Used for padding sequences to the same length.
1 for <START>: Marks the beginning of a review.
2 for <UNK>: Replaces words that are not in the top num_words most frequent words.
3 for <UNUSED>: Reserved for future use.

As a result, the indices in the reviews (e.g., X_train, X_test) 
start from 4, and the word indices need to align accordingly.

'''
# Decode X_train[item]-> word indices to words

def decode_review(review_index=0):
    """
    Decodes a review from the IMDB dataset using index_to_word mapping.

    Args:
        review_index (int): The index of the review in X_train to decode. Defaults to 0.

    Returns:
        str: The decoded review as a string of words.
    """
    return " ".join(index_to_word.get(index, '<UNK>') for index in X_train[review_index])

# Example usage
item = 0
print(f"X_train[{item}] decoded review: {decode_review(item)}")

X_train[0] decoded review: <UNK> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and 

In [5]:
# Padding the each item in X_train and X_test to have max length

X_train = sequence.pad_sequences(X_train,maxlen=500)
X_test = sequence.pad_sequences(X_test,maxlen=500)

'''
By default it takes 'pre' padding
'''

"\nBy default it takes 'pre' padding\n"

In [6]:
# Train Simple RNN

# Initialize the model
model = Sequential()

# Embedding layer: Converts integer indices into dense vectors of fixed size (128)
model.add(Embedding(vocab_size, 128, input_length=500))

# SimpleRNN layer: RNN with 128 neurons
model.add(SimpleRNN(128, activation='relu', kernel_regularizer=regularizers.l2(0.001)))

# Dropout Layer for Regularization
model.add(Dropout(0.2)) # 20% Dropout rate

# Dense output layer with a single neuron (for binary classification)
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.001)))

# Build the model with the input shape
model.build(input_shape=(None, 500))  # The input shape should match the shape of the training data

optimizer = Adam(learning_rate = 1e-4)

model.compile(optimizer=optimizer, loss ='binary_crossentropy', metrics=['accuracy'])




In [7]:
model.summary()

In [8]:
#Setting up EarlyStopping

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss',patience=2,restore_best_weights=True)

early_stopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x1b696ca51c0>

In [9]:
# Traing the Model with EarlyStopping

model.fit(X_train,
          y_train,
          epochs=10,
          batch_size=32,
          validation_split =0.2,
          callbacks=[early_stopping]
          )

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 55ms/step - accuracy: 0.5307 - loss: 0.8030 - val_accuracy: 0.5946 - val_loss: 0.7339
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 50ms/step - accuracy: 0.6835 - loss: 0.6771 - val_accuracy: 0.8562 - val_loss: 0.4163
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 53ms/step - accuracy: 0.8704 - loss: 0.3821 - val_accuracy: 0.8764 - val_loss: 0.3506
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 48ms/step - accuracy: 0.9081 - loss: 0.2891 - val_accuracy: 0.8630 - val_loss: 0.3658
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 49ms/step - accuracy: 0.9226 - loss: 0.2549 - val_accuracy: 0.8882 - val_loss: 0.3289
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 48ms/step - accuracy: 0.8984 - loss: 0.3134 - val_accuracy: 0.8904 - val_loss: 0.3310
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x1b6feb0cad0>

In [10]:
# Save the model file

model.save('rnn_imdb.h5')

