<a href="https://colab.research.google.com/github/theTURYA/deep-learning/blob/main/DL_001_Senrtiment_Analysis_on_IMDB_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install kaggle



Importing the required Dependencies

In [3]:
import os
import json
from zipfile import ZipFile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

📑**Resource:** https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Data Collection - Kaggle API

In [5]:
# Load the Kaggle API credentials. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open("kaggle.json") as credentials_file:
  kaggle_dict = json.load(credentials_file)

In [6]:
# Show which fields the credentials file provides (keys only, so no secret values are displayed)
kaggle_dict.keys()

dict_keys(['username', 'key'])

In [7]:
# Export the Kaggle credentials as environment variables
for env_name, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
  os.environ[env_name] = kaggle_dict[field]

In [8]:
# Download the IMDB 50K movie-review archive with the Kaggle CLI
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 89% 23.0M/25.7M [00:00<00:00, 74.7MB/s]
100% 25.7M/25.7M [00:00<00:00, 77.4MB/s]


In [9]:
# Unzip the dataset file.
# The archive is referenced relative to the working directory, which is also where
# extractall() writes when no target path is given, so the cell does not depend on
# Colab's /content layout.
with ZipFile("imdb-dataset-of-50k-movie-reviews.zip") as zip_ref:
  zip_ref.extractall()


In [10]:
# List the working directory to confirm the CSV was extracted
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [11]:
# Load the extracted reviews into a DataFrame.
# The path is relative because extractall() wrote the CSV into the working directory;
# a hard-coded /content prefix only resolves on Colab.
data = pd.read_csv('IMDB Dataset.csv')
data

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [12]:
# (rows, columns) of the frame and its total number of cells
data.shape, data.size

((50000, 2), 100000)

In [13]:
# Preview the first rows of the raw data
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [14]:
# Preview the last rows of the raw data
data.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [15]:
# Check the class balance between the sentiment labels
data['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [16]:
# Encode the labels as integers: positive -> 1, negative -> 0.
# DataFrame.replace relied on implicit object -> int downcasting, which pandas 2.2
# deprecates (FutureWarning); mapping and casting explicitly guarantees an int64 column.
# .get(label, label) leaves already-encoded values untouched, so the cell can be re-run,
# and astype raises if an unexpected label is present instead of passing it through.
label_map = {'positive': 1, 'negative': 0}
data['sentiment'] = (
    data['sentiment']
    .map(lambda label: label_map.get(label, label))
    .astype('int64')
)

In [17]:
# Confirm the sentiment column now holds the encoded labels
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


Split the data into Training data and Test data


In [18]:
# Hold out 20% of the reviews for testing; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Verify the sizes of the two splits
train_data.shape, test_data.shape

((40000, 2), (10000, 2))

Data Pre-Processing

In [20]:
# Tokenize the text data: the vocabulary is fitted on the training reviews only,
# capped via num_words = 5000
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Convert each split to integer sequences, then bring every review to length 200
train_sequences = tokenizer.texts_to_sequences(train_data['review'])
test_sequences = tokenizer.texts_to_sequences(test_data['review'])
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [21]:
# Inspect the padded integer sequences for both splits
X_train, X_test

(array([[1935,    1, 1200, ...,  205,  351, 3856],
        [   3, 1651,  595, ...,   89,  103,    9],
        [   0,    0,    0, ...,    2,  710,   62],
        ...,
        [   0,    0,    0, ..., 1641,    2,  603],
        [   0,    0,    0, ...,  245,  103,  125],
        [   0,    0,    0, ...,   70,   73, 2062]], dtype=int32),
 array([[   0,    0,    0, ...,  995,  719,  155],
        [  12,  162,   59, ...,  380,    7,    7],
        [   0,    0,    0, ...,   50, 1088,   96],
        ...,
        [   0,    0,    0, ...,  125,  200, 3241],
        [   0,    0,    0, ..., 1066,    1, 2305],
        [   0,    0,    0, ...,    1,  332,   27]], dtype=int32))

In [22]:
# Targets: the sentiment column of each split
y_train, y_test = train_data['sentiment'], test_data['sentiment']

In [23]:
# Inspect the target labels for both splits
y_train, y_test

(39087    0
 30893    0
 45278    1
 16398    0
 13653    0
         ..
 11284    1
 44732    1
 38158    0
 860      1
 15795    1
 Name: sentiment, Length: 40000, dtype: int64,
 33553    1
 9427     1
 199      0
 12447    1
 39489    0
         ..
 28567    0
 25079    1
 18707    1
 15200    0
 5857     1
 Name: sentiment, Length: 10000, dtype: int64)

**LSTM : Long Short-Term Memory**

LSTM is a type of recurrent neural network (RNN) that uses gates to capture and retain information over multiple time steps.

🔗 **Resources:** https://www.tensorflow.org/api_docs/python/tf/keras/layers/LSTM


In [24]:
# Build the model

# Embedding -> LSTM -> single sigmoid unit for binary sentiment classification.
# `input_length` is intentionally not passed to Embedding: Keras 3 deprecates the
# argument (it only triggers a warning); the sequence length comes from the data
# passed to the model.
model = Sequential([
    # input_dim matches the num_words = 5000 cap used by the tokenizer
    Embedding(input_dim = 5000, output_dim = 128),
    LSTM(128, dropout = 0.2, recurrent_dropout = 0.2),
    Dense(1, activation = 'sigmoid'),
])



In [25]:
# Print the layer-by-layer overview of the model
model.summary()

In [26]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Training the model


In [27]:
# Train for 5 epochs in batches of 64, holding out 20% of the training data for validation
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 459ms/step - accuracy: 0.7233 - loss: 0.5328 - val_accuracy: 0.8240 - val_loss: 0.4067
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 447ms/step - accuracy: 0.8435 - loss: 0.3715 - val_accuracy: 0.8535 - val_loss: 0.3482
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 442ms/step - accuracy: 0.8714 - loss: 0.3189 - val_accuracy: 0.8621 - val_loss: 0.3319
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 446ms/step - accuracy: 0.8829 - loss: 0.2861 - val_accuracy: 0.8516 - val_loss: 0.3585
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 444ms/step - accuracy: 0.8991 - loss: 0.2607 - val_accuracy: 0.8712 - val_loss: 0.3269


<keras.src.callbacks.history.History at 0x7b4c90106920>

Model Evaluation

In [28]:
# Score the trained model on the held-out test split and report both metrics
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}', f"Test Accuracy: {accuracy}", sep="\n")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 102ms/step - accuracy: 0.8735 - loss: 0.3203
Test Loss: 0.3159571588039398
Test Accuracy: 0.8755999803543091


**Building a Predictive System**

In [41]:
def predict_sentiment(review, threshold = 0.5):
  """Classify a single review as 'positive' or 'negative'.

  Relies on the notebook-level `tokenizer` and `model` objects.

  Args:
    review: Raw review text.
    threshold: Score cut-off; the review is 'positive' only when the model's
      sigmoid output is strictly greater than this value. Defaults to 0.5.

  Returns:
    The string 'positive' or 'negative'.
  """
  # texts_to_sequences works on a batch, so wrap the single review in a list
  sequence = tokenizer.texts_to_sequences([review])
  # Use the same maxlen = 200 that was applied to the training sequences
  padded_sequence = pad_sequences(sequence, maxlen = 200)
  prediction = model.predict(padded_sequence)
  # One row per input and one output unit, so the score sits at [0][0]
  score = float(prediction[0][0])
  return 'positive' if score > threshold else 'negative'

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
The sentiment of the review is: negative


In [43]:
# Example usage - 1: short review containing a negation ("not that good")
new_review = "This movie was not that good."
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
The sentiment of the review is: negative


In [44]:
# Example usage - 2: mixed review ("ok but not that good")
new_review = "This movie was ok but not that good"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step
The sentiment of the review is: negative


In [45]:
# Example usage - 3: short favourable review
new_review = "Good to watch and enjoying"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
The sentiment of the review is: positive


In [47]:
# Example usage - 4: strongly favourable wording ("true masterpiece")
new_review = "Its called true masterpiece"
sentiment = predict_sentiment(new_review)
print(f"The sentiment of the review is: {sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
The sentiment of the review is: positive
