# Importing the Library Files

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

pandas and numpy: Used for data manipulation and numerical operations.

tensorflow: Deep learning library used to build and train the RNN model.

sklearn: Used for splitting the dataset and calculating accuracy.

In [2]:
df=pd.read_csv('/content/IMDB Dataset.csv')

df: A Pandas DataFrame that contains the IMDB movie reviews and their corresponding sentiments (positive or negative).

# Data Cleaning & Understanding the data

In [3]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


Dataset is Balanced evenly.

# Preprocessing

In [6]:
sentences = df['review'].values
labels = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0).values

sentences: Extracts the text of the reviews.

labels: Converts the sentiment labels into binary format (1 for positive, 0 for negative).

In [9]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)

Tokenizer: Converts text into sequences of integers, where each integer represents a word in the vocabulary.

num_words=5000: Limits the tokenizer to the top 10,000 most frequent words.

fit_on_texts: Learns the vocabulary from the sentences.

texts_to_sequences: Transforms each review into a sequence of integers.

In [10]:
maxlen = 200
X = pad_sequences(sequences, maxlen=maxlen)
y = np.array(labels)

pad_sequences: Pads or truncates the sequences to ensure they all have the same length (maxlen=200), making them suitable for input into the RNN.

X: The padded sequences.

y: The corresponding labels.

#Spliting The Data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=21)

#Building The RNN Model

In [12]:
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128))
model.add(SimpleRNN(64, return_sequences=False,kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(32, activation='relu',kernel_regularizer=regularizers.l2(0.01)))
model.add(Dropout(0.7))
model.add(Dense(1, activation='sigmoid'))

Sequential(): Initializes a linear stack of layers for the model.

Embedding(input_dim=5000, output_dim=128): Maps 5,000 unique words to dense 128-dimensional vectors.

SimpleRNN(64, return_sequences=False): Adds a Simple RNN layer with 64 units and applies L2 regularization to reduce overfitting.

Dropout(0.7): Drops 70% of the neurons randomly during training to prevent overfitting.

Dense(32, activation='relu'): Adds a fully connected layer with 32 units, ReLU activation, and L2 regularization.

Dense(1, activation='sigmoid'): Adds an output layer with 1 unit for binary classification using sigmoid activation.

In [13]:
model.compile(optimizer=Adam(learning_rate=0.0001),loss='binary_crossentropy', metrics=['accuracy'])

optimizer=Adam(learning_rate=0.0001): Uses the Adam optimizer with a learning rate of 0.0001 for efficient gradient descent.

loss='binary_crossentropy': Loss function for binary classification.

metrics=['accuracy']: Tracks the accuracy during training and evaluation.

#Training The Model

In [14]:
early_stopping = EarlyStopping(monitor='val_loss',
                               patience=3, restore_best_weights=True)

history = model.fit(X_train, y_train,
                    epochs=10, batch_size=128,
                    validation_data=(X_test, y_test),
                    callbacks=[early_stopping])

Epoch 1/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 143ms/step - accuracy: 0.5013 - loss: 1.8844 - val_accuracy: 0.5124 - val_loss: 1.5947
Epoch 2/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 137ms/step - accuracy: 0.5059 - loss: 1.5252 - val_accuracy: 0.5229 - val_loss: 1.3132
Epoch 3/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 137ms/step - accuracy: 0.5124 - loss: 1.2585 - val_accuracy: 0.6225 - val_loss: 1.0900
Epoch 4/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 137ms/step - accuracy: 0.5412 - loss: 1.0686 - val_accuracy: 0.5361 - val_loss: 0.9944
Epoch 5/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 144ms/step - accuracy: 0.5362 - loss: 0.9705 - val_accuracy: 0.6238 - val_loss: 0.8975
Epoch 6/10
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 136ms/step - accuracy: 0.6755 - loss: 0.8398 - val_accuracy: 0.8066 - val_loss: 0.6390
Epoch 7/10

EarlyStopping(monitor='val_loss'): Monitors the validation loss during training and stops if it doesn't improve.

patience=3: Stops training if the validation loss doesn't improve for 3 consecutive epochs.

fit: Trains the model using the training data. The validation data is used to evaluate the model during training.

epochs=10: The model will go through the entire dataset 10 times.

batch_size=128: The number of samples processed before the model is updated.

#Evaluating the Model

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8544 - loss: 0.4305
Test Accuracy: 0.8565


evaluate: Computes the loss and accuracy on the test data.

#Making Prediction

In [17]:
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(f'Accuracy Score: {accuracy_score(y_test, y_pred):.4f}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step
Accuracy Score: 0.8565


predict: Generates predictions for the test data.

y_pred: Converts probabilities to binary predictions (0 or 1).

accuracy_score: Computes the accuracy between the true labels and the predicted labels.