## End to end Deep Learning Project Using Simple RNN

In [45]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense, Input
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # disable onednn for Nvidia training only

In [46]:
## Load the imdb dataset

vocabulary_size=10000 ##vocabulary size
(X_train,y_train),(X_test,y_test)=imdb.load_data(num_words=vocabulary_size)

# Print the shape of the data
print(f'Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}')
print(f'Testing data shape: {X_train.shape}, Testing labels shape: {y_test.shape}')

Training data shape: (25000,), Training labels shape: (25000,)
Testing data shape: (25000,), Testing labels shape: (25000,)


In [47]:
X_train[0],y_train[0]

([1,
  14,
  22,
  16,
  43,
  530,
  973,
  1622,
  1385,
  65,
  458,
  4468,
  66,
  3941,
  4,
  173,
  36,
  256,
  5,
  25,
  100,
  43,
  838,
  112,
  50,
  670,
  2,
  9,
  35,
  480,
  284,
  5,
  150,
  4,
  172,
  112,
  167,
  2,
  336,
  385,
  39,
  4,
  172,
  4536,
  1111,
  17,
  546,
  38,
  13,
  447,
  4,
  192,
  50,
  16,
  6,
  147,
  2025,
  19,
  14,
  22,
  4,
  1920,
  4613,
  469,
  4,
  22,
  71,
  87,
  12,
  16,
  43,
  530,
  38,
  76,
  15,
  13,
  1247,
  4,
  22,
  17,
  515,
  17,
  12,
  16,
  626,
  18,
  2,
  5,
  62,
  386,
  12,
  8,
  316,
  8,
  106,
  5,
  4,
  2223,
  5244,
  16,
  480,
  66,
  3785,
  33,
  4,
  130,
  12,
  16,
  38,
  619,
  5,
  25,
  124,
  51,
  36,
  135,
  48,
  25,
  1415,
  33,
  6,
  22,
  12,
  215,
  28,
  77,
  52,
  5,
  14,
  407,
  16,
  82,
  2,
  8,
  4,
  107,
  117,
  5952,
  15,
  256,
  4,
  2,
  7,
  3766,
  5,
  723,
  36,
  71,
  43,
  530,
  476,
  26,
  400,
  317,
  46,
  7,
  4,
  2,
  1029,
  

In [48]:
## Inspect a sample review and its label
sample_review=X_train[0]
sample_label=y_train[0]

print(f"Sample review (as integers):{sample_review}")
print(f'Sample label: {sample_label}')


Sample review (as integers):[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample label: 1


In [49]:
### MApping of words index bacl to words(for understanding)
word_index=imdb.get_word_index()
#reverse the key to become value, and value to become key in the new dictionary
reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [50]:
# From tensorflow documentation, the first 3 indices are reserved for padding, start of sequence and unknown
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in sample_review])
decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert ? is an amazing actor and now the same being director ? father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for ? and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also ? to the two little boy's that played the ? of norman and paul they were just brilliant children are often left out of the ? list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you th

In [51]:
from tensorflow.keras.preprocessing import sequence

## Pad the sequences to a fixed length of 500 words each sentence
features_len=500

X_train=sequence.pad_sequences(X_train,maxlen=features_len)
X_test = sequence.pad_sequences(X_test, maxlen=features_len)
X_train

array([[   0,    0,    0, ...,   19,  178,   32],
       [   0,    0,    0, ...,   16,  145,   95],
       [   0,    0,    0, ...,    7,  129,  113],
       ...,
       [   0,    0,    0, ...,    4, 3586,    2],
       [   0,    0,    0, ...,   12,    9,   23],
       [   0,    0,    0, ...,  204,  131,    9]], dtype=int32)

In [52]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [None]:
from tensorflow.keras.layers import Dropout, BatchNormalization, GRU

## Train Simple RNN
model=Sequential()
dimensions=128 # Number of dimensions for the embedding layer,each word will be a vector of 128 dimensions
neurons=256 # Number of neurons in the RNN layer
# Add an Input layer to explicitly define the input shape, as array of 500 words as each input
model.add(Input(shape=(features_len,)))
# It may be more accurate to use gensim to train the word embeddings, and then load the embeddings into the model
model.add(Embedding(vocabulary_size,dimensions,input_length=features_len)) ## Embedding Layers

# return_sequences=True: returns the full sequence of outputs, allowing the second SimpleRNN layer to process the entire sequence
# return_sequences is not needed for the last RNN layer, or if we just need 1 RNN layer
model.add(SimpleRNN(neurons, activation='relu'))
model.add(Dropout(0.1)) # 10% of the neurons will be dropped out randomly during training, to avoid overfitting
# model.add(GRU(neurons, activation='relu'))  # GRU layer
model.add(BatchNormalization()) # This may improve the training speed and stability

model.add(Dense(1,activation="sigmoid")) # output layer

In [53]:
model.summary()

In [54]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [55]:
## Create an instance of EarlyStoppping Callback
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Within 5 epochs, if the validation loss does not improve, stop the training and restore the best weights
earlystopping=EarlyStopping(monitor='val_loss',patience=8,restore_best_weights=True)
earlystopping
# If the validation loss does not improve after 5 epochs, reduce the learning rate by 20%
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)


<keras.src.callbacks.early_stopping.EarlyStopping at 0x7f19d0ee7e90>

In [56]:
## Train the model with early sstopping
## Using each batch of 32 samples, and find the best weights for each batch. Total we have 25000 samples / 64 = 390 batches
## Using batch would reduce memory comsumption and avoid finding the wrong descent, 
## as there could be multiple of local minimums (local descent), and we only
## want to find the global minimum (global descent)
## validation_split=0.2, means 20% of the training data will be used as validation data
history=model.fit(
    X_train,y_train,epochs=20,batch_size=64, # increase the number of epochs or batch_size will not add more training inputs
    validation_split=0.2,
    callbacks=[earlystopping]
)

Epoch 1/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 49ms/step - accuracy: 0.6433 - loss: 9284893016064.0000 - val_accuracy: 0.5528 - val_loss: 0.7641
Epoch 2/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 47ms/step - accuracy: 0.5545 - loss: 0.7657 - val_accuracy: 0.5570 - val_loss: 0.7536
Epoch 3/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 47ms/step - accuracy: 0.5870 - loss: 0.6599 - val_accuracy: 0.5906 - val_loss: 0.6778
Epoch 4/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 46ms/step - accuracy: 0.6853 - loss: 0.5469 - val_accuracy: 0.6874 - val_loss: 0.6573
Epoch 5/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.6836 - loss: 0.5368 - val_accuracy: 0.6230 - val_loss: 0.6611
Epoch 6/20
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 43ms/step - accuracy: 0.7537 - loss: 0.4778 - val_accuracy: 0.6984 - val_loss: 0.6077
Epoc

In [59]:
model.get_weights()

[array([[ 0.2565163 , -0.57055885,  0.5363279 , ..., -0.18752316,
          0.6912675 ,  0.44828653],
        [-0.02807593, -0.00598582,  0.00481732, ..., -0.0573726 ,
         -0.04008389, -0.01020903],
        [ 0.03349014, -0.07547865,  0.06802489, ...,  0.0105071 ,
          0.01539172,  0.03550284],
        ...,
        [ 0.01798494,  0.01533521,  0.03510292, ..., -0.08273786,
          0.02954749, -0.06794201],
        [-0.26360333, -0.15151593, -0.19742312, ..., -0.13173084,
          0.19870712,  0.25914222],
        [-0.2636011 ,  0.08350825, -0.2180802 , ...,  0.27248892,
         -0.4151401 , -0.4099179 ]], dtype=float32),
 array([[-0.00640702, -0.11374757, -0.08311329, ..., -0.01042442,
          0.06875041,  0.00414912],
        [-0.05087542,  0.09958373,  0.06352261, ..., -0.01644837,
          0.07035819, -0.01857858],
        [-0.10265442,  0.09974295, -0.02662472, ..., -0.09273013,
          0.0544079 , -0.04918066],
        ...,
        [-0.12559566,  0.02282128, -0.0

In [60]:
## Save model file
model.save('simple_rnn_imdb.h5')

