# IMDB Dataset
It uses the IMDB dataset that contains the text of 50,000 movie reviews from the Internet Movie Database. These are split into 25,000 reviews for training and 25,000 reviews for testing. The training and testing sets are balanced, meaning they contain an equal number of positive and negative reviews.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, SimpleRNN

In [2]:
max_features = 10000 # vocabulary Size
(X_train,y_train), (X_test, y_test) = imdb.load_data(num_words=max_features) # Loading the data

In [3]:
print(" X_train Data: ", X_train.shape)
print(" y_train Data: ", y_train.shape)
print(" X_test Data: ", X_test.shape)
print(" y_test Data: ", y_test.shape)

 X_train Data:  (25000,)
 y_train Data:  (25000,)
 X_test Data:  (25000,)
 y_test Data:  (25000,)


In [4]:
y_train

array([1, 0, 0, ..., 0, 1, 0])

In [5]:
X_train

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1153, 194, 8255, 78, 228,

In [6]:
y_train[0]

1

In [7]:
# All the sentences should be of same length in order to train the model

max_length = 500
X_train = sequence.pad_sequences(X_train, maxlen=max_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_length)


In [8]:
X_train[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [9]:
model = Sequential()
model.add(Embedding(max_features, 128, input_length=max_length)) # Embedding Layers
model.add(SimpleRNN(32, activation='relu'))
model.add(Dense(1, activation='sigmoid')) # Output layer activation -> Sigmoid



In [10]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
early_stopping

<keras.src.callbacks.early_stopping.EarlyStopping at 0x7cfd12248670>

In [11]:
model.summary()

In [12]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [13]:
history = model.fit(X_train, y_train,
          epochs=10, batch_size=100,
          validation_split=0.2,
          callbacks=[early_stopping])

Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 282ms/step - accuracy: 0.5770 - loss: 0.6689 - val_accuracy: 0.7388 - val_loss: 0.5260
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 268ms/step - accuracy: 0.8183 - loss: 0.4026 - val_accuracy: 0.8250 - val_loss: 0.3897
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 265ms/step - accuracy: 0.8948 - loss: 0.2697 - val_accuracy: 0.8530 - val_loss: 0.4096
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 258ms/step - accuracy: 0.8586 - loss: 0.3374 - val_accuracy: 0.8230 - val_loss: 0.4019
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 253ms/step - accuracy: 0.9230 - loss: 0.1973 - val_accuracy: 0.8524 - val_loss: 0.3867
Epoch 6/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 236ms/step - accuracy: 0.9338 - loss: 0.1718 - val_accuracy: 0.8208 - val_loss: 0.4212
Epoch 7/10

# Prediction

In [14]:
# Model Weights
model.get_weights()[0]

array([[-0.03805197, -0.02235755,  0.02228696, ...,  0.02400256,
        -0.08912589, -0.02478707],
       [-0.15482818,  0.02299205, -0.02121468, ...,  0.09543838,
        -0.07670856, -0.02833601],
       [ 0.04224421, -0.00067454,  0.0068787 , ...,  0.01249277,
        -0.00277869, -0.00974933],
       ...,
       [ 0.02489753, -0.07018739,  0.09342954, ...,  0.02393487,
         0.02476777, -0.06971567],
       [ 0.02024143, -0.00492115,  0.00180738, ..., -0.05926957,
        -0.0120878 ,  0.03630434],
       [ 0.09818715, -0.07478099,  0.09463334, ..., -0.05697802,
         0.02453457, -0.04689755]], dtype=float32)

In [15]:
word_index = imdb.get_word_index()
reversed_words = {value: key for (key, value) in word_index.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [16]:
reversed_words

{34701: 'fawn',
 52006: 'tsukino',
 52007: 'nunnery',
 16816: 'sonja',
 63951: 'vani',
 1408: 'woods',
 16115: 'spiders',
 2345: 'hanging',
 2289: 'woody',
 52008: 'trawling',
 52009: "hold's",
 11307: 'comically',
 40830: 'localized',
 30568: 'disobeying',
 52010: "'royale",
 40831: "harpo's",
 52011: 'canet',
 19313: 'aileen',
 52012: 'acurately',
 52013: "diplomat's",
 25242: 'rickman',
 6746: 'arranged',
 52014: 'rumbustious',
 52015: 'familiarness',
 52016: "spider'",
 68804: 'hahahah',
 52017: "wood'",
 40833: 'transvestism',
 34702: "hangin'",
 2338: 'bringing',
 40834: 'seamier',
 34703: 'wooded',
 52018: 'bravora',
 16817: 'grueling',
 1636: 'wooden',
 16818: 'wednesday',
 52019: "'prix",
 34704: 'altagracia',
 52020: 'circuitry',
 11585: 'crotch',
 57766: 'busybody',
 52021: "tart'n'tangy",
 14129: 'burgade',
 52023: 'thrace',
 11038: "tom's",
 52025: 'snuggles',
 29114: 'francesco',
 52027: 'complainers',
 52125: 'templarios',
 40835: '272',
 52028: '273',
 52130: 'zaniacs',

In [17]:
def decoded_review(encoded_review):
    return ' '.join([reversed_words.get(i-3, '?') for i in encoded_review])

def preprocess_reviews(text:str):
  words = text.lower().split()
  encoded_reviews = [word_index.get(word, 2) + 3 for word in words]
  padded_reviews  = sequence.pad_sequences([encoded_reviews], maxlen=max_length)
  return padded_reviews


In [18]:
def predict_sentiment(review):
  sentiment = ''
  pre_processed_input = preprocess_reviews(review)
  prediction = model.predict(pre_processed_input)

  if prediction[0][0] > 0.5:
    sentiment = 'Review is positive'
  else:
    sentiment = 'Review is negative'

  return sentiment , prediction[0][0]

In [24]:
sample_review_for_testing = 'I love the movie.'
predict_sentiment(sample_review_for_testing)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


('Review is positive', 0.9372261)