## End to end Deep Learning Project Using Simple RNN

In [50]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,Dense, Input
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0' # disable onednn for Nvidia training only

In [51]:
import requests
import zipfile
import os

# Check if the file is extracted
if os.path.exists("glove.6B.100d.txt"):
    print("GloVe embeddings is already downloaded and extracted.")
else:
    print("Downloading GloVe 6B 100-dim. It may take 30 mins to an hour ...")
    # Download the GloVe embeddings
    url = "http://nlp.stanford.edu/data/glove.6B.zip"
    response = requests.get(url)

    # Save the zip file
    with open("glove.6B.zip", "wb") as f:
        f.write(response.content)

    # Unzip the file
    with zipfile.ZipFile("glove.6B.zip", "r") as zip_ref:
        zip_ref.extractall()


# Check if the file is extracted
if os.path.exists("glove.6B.100d.txt"):
    print("GloVe embeddings downloaded and extracted successfully.")
else:
    print("Failed to download or extract GloVe embeddings.")

# Load GloVe embeddings
embedding_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Determine the vocabulary size from GloVe
vocabulary_size = len(embedding_index)
vocabulary_size

GloVe embeddings is already downloaded and extracted.
GloVe embeddings downloaded and extracted successfully.


400000

In [52]:
## Load the imdb dataset
(X_train,y_train),(X_test,y_test)=imdb.load_data(num_words=vocabulary_size)

# Print the shape of the data
print(f'Training data shape: {X_train.shape}, Training labels shape: {y_train.shape}')
print(f'Testing data shape: {X_train.shape}, Testing labels shape: {y_test.shape}')

Training data shape: (25000,), Training labels shape: (25000,)
Testing data shape: (25000,), Testing labels shape: (25000,)


In [53]:
X_train[0],y_train[0]

([1,
  14,
  22,
  16,
  43,
  530,
  973,
  1622,
  1385,
  65,
  458,
  4468,
  66,
  3941,
  4,
  173,
  36,
  256,
  5,
  25,
  100,
  43,
  838,
  112,
  50,
  670,
  22665,
  9,
  35,
  480,
  284,
  5,
  150,
  4,
  172,
  112,
  167,
  21631,
  336,
  385,
  39,
  4,
  172,
  4536,
  1111,
  17,
  546,
  38,
  13,
  447,
  4,
  192,
  50,
  16,
  6,
  147,
  2025,
  19,
  14,
  22,
  4,
  1920,
  4613,
  469,
  4,
  22,
  71,
  87,
  12,
  16,
  43,
  530,
  38,
  76,
  15,
  13,
  1247,
  4,
  22,
  17,
  515,
  17,
  12,
  16,
  626,
  18,
  19193,
  5,
  62,
  386,
  12,
  8,
  316,
  8,
  106,
  5,
  4,
  2223,
  5244,
  16,
  480,
  66,
  3785,
  33,
  4,
  130,
  12,
  16,
  38,
  619,
  5,
  25,
  124,
  51,
  36,
  135,
  48,
  25,
  1415,
  33,
  6,
  22,
  12,
  215,
  28,
  77,
  52,
  5,
  14,
  407,
  16,
  82,
  10311,
  8,
  4,
  107,
  117,
  5952,
  15,
  256,
  4,
  31050,
  7,
  3766,
  5,
  723,
  36,
  71,
  43,
  530,
  476,
  26,
  400,
  317,
  46,
  7,


In [54]:
## Inspect a sample review and its label
sample_review=X_train[0]
sample_label=y_train[0]

print(f"Sample review (as integers):{sample_review}")
print(f'Sample label: {sample_label}')


Sample review (as integers):[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
Sample l

In [55]:
### MApping of words index bacl to words(for understanding)
word_index=imdb.get_word_index()
#reverse the key to become value, and value to become key in the new dictionary
reverse_word_index = {value: key for key, value in word_index.items()}
reverse_word_index

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [56]:
# From tensorflow documentation, the first 3 indices are reserved for padding, start of sequence and unknown
decoded_review = ' '.join([reverse_word_index.get(i - 3, '?') for i in sample_review])
decoded_review

"? this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert redford's is an amazing actor and now the same being director norman's father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for retail and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also congratulations to the two little boy's that played the part's of norman and paul they were just brilliant children are often left out of the praising list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should b

In [57]:
from tensorflow.keras.preprocessing import sequence

## Pad the sequences to a fixed length of 500 words each sentence
features_len=500

X_train=sequence.pad_sequences(X_train,maxlen=features_len)
X_test = sequence.pad_sequences(X_test, maxlen=features_len)
X_train

array([[    0,     0,     0, ...,    19,   178,    32],
       [    0,     0,     0, ...,    16,   145,    95],
       [    0,     0,     0, ...,     7,   129,   113],
       ...,
       [    0,     0,     0, ...,     4,  3586, 22459],
       [    0,     0,     0, ...,    12,     9,    23],
       [    0,     0,     0, ...,   204,   131,     9]], dtype=int32)

In [58]:
X_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [59]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, GRU, Dropout, BatchNormalization, Dense

# Parameters
neurons = 256  # Number of neurons in the RNN layer
embedding_dim = 100  # Dimension of the GloVe embeddings, exactly 100-dim as per 'glove.6B.100d.txt'

# Create an embedding matrix
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
word_index = {word: idx for idx, (word, _) in enumerate(embedding_index.items())}  # Create a word-to-index mapping

for word, i in word_index.items():
    if i < vocabulary_size:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Build the model
model = Sequential()
model.add(Input(shape=(features_len,)))
model.add(Embedding(vocabulary_size, embedding_dim, weights=[embedding_matrix], input_length=features_len, trainable=False))  # Embedding Layer with pre-trained GloVe embeddings

model.add(SimpleRNN(neurons, activation='relu'))
model.add(Dropout(0.1))  # 10% of the neurons will be dropped out randomly during training, to avoid overfitting
model.add(BatchNormalization())  # This may improve the training speed and stability

model.add(Dense(1, activation="sigmoid"))  # Output layer

In [60]:
model.summary()

In [61]:
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [62]:
## Create an instance of EarlyStoppping Callback
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Within 5 epochs, if the validation loss does not improve, stop the training and restore the best weights
earlystopping=EarlyStopping(monitor='val_loss',patience=8,restore_best_weights=True)
earlystopping
# If the validation loss does not improve after 5 epochs, reduce the learning rate by 20%
# reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.001)


<keras.src.callbacks.early_stopping.EarlyStopping at 0x78322c2fbe30>

In [63]:
## Train the model with early sstopping
## Using each batch of 32 samples, and find the best weights for each batch. Total we have 25000 samples / 64 = 390 batches
## Using batch would reduce memory comsumption and avoid finding the wrong descent, 
## as there could be multiple of local minimums (local descent), and we only
## want to find the global minimum (global descent)
## validation_split=0.2, means 20% of the training data will be used as validation data
history=model.fit(
    X_train,y_train,epochs=20,batch_size=128, # increase the number of epochs or batch_size will not add more training inputs
    validation_split=0.2,
    callbacks=[earlystopping]
)

Epoch 1/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.5355 - loss: 0.7778 - val_accuracy: 0.5336 - val_loss: 0.7059
Epoch 2/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.5962 - loss: 0.6644 - val_accuracy: 0.5446 - val_loss: 0.7267
Epoch 3/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.6522 - loss: 0.6139 - val_accuracy: 0.6258 - val_loss: 0.6313
Epoch 4/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 48ms/step - accuracy: 0.7024 - loss: 0.5581 - val_accuracy: 0.6402 - val_loss: 0.6247
Epoch 5/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 48ms/step - accuracy: 0.7326 - loss: 0.5218 - val_accuracy: 0.5660 - val_loss: 1.0005
Epoch 6/20
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 47ms/step - accuracy: 0.7718 - loss: 0.4719 - val_accuracy: 0.6754 - val_loss: 0.6373
Epoch 7/20
[1m157/15

In [64]:
model.get_weights()

[array([[-0.038194, -0.24487 ,  0.72812 , ..., -0.1459  ,  0.8278  ,
          0.27062 ],
        [-0.10767 ,  0.11053 ,  0.59812 , ..., -0.83155 ,  0.45293 ,
          0.082577],
        [-0.33979 ,  0.20941 ,  0.46348 , ..., -0.23394 ,  0.47298 ,
         -0.028803],
        ...,
        [ 0.36088 , -0.16919 , -0.32704 , ...,  0.27139 , -0.29188 ,
          0.16109 ],
        [-0.10461 , -0.5047  , -0.49331 , ...,  0.42527 , -0.5125  ,
         -0.17054 ],
        [ 0.28365 , -0.6263  , -0.44351 , ...,  0.43678 , -0.82607 ,
         -0.15701 ]], dtype=float32),
 array([[-0.02813263, -0.0177936 , -0.01759675, ..., -0.02882946,
          0.10573576,  0.09064171],
        [-0.0704998 , -0.02397074, -0.11444536, ...,  0.00479393,
          0.00966559,  0.04764821],
        [ 0.05483513,  0.10239789,  0.10863517, ..., -0.01021458,
          0.00336485,  0.09119852],
        ...,
        [-0.05855448, -0.01211986,  0.02046069, ..., -0.11447552,
          0.04384724,  0.05933198],
        [

In [65]:
## Save model file
model.save('glove_rnn_imdb.h5')

