<a href="https://colab.research.google.com/github/zrtashi/Sentiment-Analysis-Using-LSTM/blob/main/IMDB_Review.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Movie Review Sentiment Analysis Using LSTM**

In [1]:
# Install the Kaggle command-line client; the dataset download cell further down invokes it as `!kaggle`
!pip install kaggle



In [None]:
# Import necessary libraries
import os  # For interacting with the operating system (e.g., file handling)
import json  # To handle JSON data
from zipfile import ZipFile  # To work with zip files (e.g., extracting data)
import pandas as pd  # To manipulate data in DataFrames (from the pandas library)

# Importing modules from Scikit-learn for splitting datasets
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets

# Importing modules from TensorFlow for building and training neural networks
from tensorflow.keras.models import Sequential  # Sequential model for building a linear stack of layers
from tensorflow.keras.layers import Dense, Embedding, LSTM  # Layers for neural networks: Dense (fully connected), Embedding (word embeddings), LSTM (for sequential data like text)

# Importing modules for preprocessing text data
from tensorflow.keras.preprocessing.text import Tokenizer  # Tokenizer to convert text into sequences of integers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # To pad sequences to the same length (ensuring uniform input size for the model)


In [73]:
# Create the hidden "~/.kaggle" directory (-p: no error if it already exists)
!mkdir -p ~/.kaggle

# Copy "kaggle.json" (the Kaggle API credentials) into that directory
!cp kaggle.json ~/.kaggle/

# Make the copied credentials readable/writable by the current user only.
# The Kaggle client warns when the key file is readable by other users and
# recommends exactly this chmod.
!chmod 600 ~/.kaggle/kaggle.json


In [74]:
# Load the Kaggle API credentials from 'kaggle.json' into a Python dictionary.
# The 'with' block closes the file handle as soon as parsing finishes instead of
# leaving it open until garbage collection.
with open('kaggle.json') as credentials_file:
    kaggle_dict = json.load(credentials_file)


In [75]:
kaggle_dict.keys()

dict_keys(['username', 'key'])

In [76]:
# Publish the credentials held in 'kaggle_dict' as the KAGGLE_USERNAME and
# KAGGLE_KEY environment variables of this process, so that commands started
# from the notebook inherit them.
os.environ.update(
    KAGGLE_USERNAME=kaggle_dict['username'],
    KAGGLE_KEY=kaggle_dict['key'],
)


In [77]:
# Download the IMDB 50K movie reviews dataset from Kaggle; it arrives as
# 'imdb-dataset-of-50k-movie-reviews.zip' in the current working directory
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 70% 18.0M/25.7M [00:00<00:00, 184MB/s]
100% 25.7M/25.7M [00:00<00:00, 188MB/s]


In [78]:
!ls

imdb-dataset-of-50k-movie-reviews.zip  kaggle.json  sample_data


# **Data Preprocessing**

In [79]:
# Import the zipfile module to work with zip files
import zipfile

# Open the downloaded archive in read mode and extract all of its contents into '/content'.
# The 'with' block closes the archive even if extraction raises, and the handle is
# named 'archive' so the built-in zip() is not shadowed for the rest of the notebook.
with zipfile.ZipFile('/content/imdb-dataset-of-50k-movie-reviews.zip', 'r') as archive:
    archive.extractall('/content')


In [80]:
!ls

'IMDB Dataset.csv'   imdb-dataset-of-50k-movie-reviews.zip   kaggle.json   sample_data


In [81]:
# Load the extracted CSV into a DataFrame (columns: 'review' text and 'sentiment' label)
df = pd.read_csv('/content/IMDB Dataset.csv')

In [82]:
df.shape

(50000, 2)

In [83]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [84]:
df.tail()

Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [85]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [86]:
# Encode the 'sentiment' labels as integers: 'positive' -> 1, 'negative' -> 0.
# The column is reassigned rather than mutated with inplace=True, and the result is
# cast to int64 explicitly so the label dtype does not depend on pandas silently
# downcasting the replaced object column (behaviour that pandas 2.2 deprecates).
# Re-running this cell is harmless: with no string labels left, replace() and
# astype() leave the column unchanged.
df["sentiment"] = (
    df["sentiment"]
    .replace({"positive": 1, "negative": 0})
    .astype("int64")
)


In [87]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [88]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [89]:
df["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
1,25000
0,25000


# **Model Construction**

In [90]:
# Hold out a test set: 20% of the rows of 'df' become 'test_data' and the other
# 80% become 'train_data'. The fixed seed (random_state=42) makes the split
# repeatable from run to run.
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)


In [91]:
train_data.shape

(40000, 2)

In [92]:
test_data.shape

(10000, 2)

In [93]:
# Tokenizer limited to the 5000 most frequent words; its word index is learned
# from the training reviews only
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Encode every review as a sequence of integer word indices
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Bring every encoded review to a fixed length of 200 so the model receives
# uniformly sized inputs
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)


In [94]:
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [95]:
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [96]:
# Target labels: the 'sentiment' column of each split, taken from the same
# DataFrames whose 'review' column produced X_train and X_test
y_train, y_test = train_data["sentiment"], test_data["sentiment"]


In [97]:
print(y_train)


39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


In [98]:
print(y_test)

33553    1
9427     1
199      0
12447    1
39489    0
        ..
28567    0
25079    1
18707    1
15200    0
5857     1
Name: sentiment, Length: 10000, dtype: int64


In [101]:
# Build a Sequential model (a linear stack of layers)
model = Sequential()

# Embedding layer: turns each word index into a dense vector of fixed size.
# 'input_dim=5000' is the vocabulary size (matches the tokenizer's num_words),
# 'output_dim=128' is the size of each word vector.
# The sequence length is deliberately not passed here: Keras 3 deprecates the
# 'input_length' argument (it only produces a warning), and the length of 200 is
# supplied through model.build() below.
model.add(Embedding(input_dim=5000, output_dim=128))

# LSTM (Long Short-Term Memory) layer with 128 units.
# 'dropout=0.2' drops 20% of the input units during training to reduce overfitting;
# 'recurrent_dropout=0.2' does the same for the recurrent connections.
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))

# Single-unit Dense layer with a sigmoid activation: outputs a value between 0 and 1
# for the binary (negative/positive) classification
model.add(Dense(1, activation='sigmoid'))

# Build the model explicitly so that summary() can report output shapes and parameter
# counts: batches of any size (None), each sequence 200 word indices long
model.build(input_shape=(None, 200))

# Print the architecture: each layer, its output shape, and its number of parameters
model.summary()


# **Training and Evaluation**

In [103]:
# Configure training:
#   - optimizer: Adam
#   - loss: binary cross-entropy, matching the single sigmoid output and 0/1 labels
#   - metrics: accuracy is reported alongside the loss during fit() and evaluate()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [None]:
# Fit the model on the training split: 5 passes over the data, 64 samples per
# gradient update.
# NOTE(review): the validation data is the test split itself, so the per-epoch
# val_* metrics and the later model.evaluate(X_test, y_test) call measure the
# same data.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m184s[0m 282ms/step - accuracy: 0.7452 - loss: 0.4974 - val_accuracy: 0.8676 - val_loss: 0.3265
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 281ms/step - accuracy: 0.8273 - loss: 0.3943 - val_accuracy: 0.7929 - val_loss: 0.4378
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 274ms/step - accuracy: 0.8476 - loss: 0.3549 - val_accuracy: 0.7064 - val_loss: 0.5533
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 258ms/step - accuracy: 0.8360 - loss: 0.3751 - val_accuracy: 0.8785 - val_loss: 0.3036
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 260ms/step - accuracy: 0.8849 - loss: 0.2906 - val_accuracy: 0.8838 - val_loss: 0.2879


<keras.src.callbacks.history.History at 0x79630515ad70>

In [105]:
# Score the trained model on the test split; evaluate() yields the loss followed
# by the compiled metric (accuracy)
test_metrics = model.evaluate(x=X_test, y=y_test)
loss, accuracy = test_metrics

# Report both numbers
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 88ms/step - accuracy: 0.8849 - loss: 0.2865
Test Loss: 0.28788819909095764, Test Accuracy: 0.8838000297546387


# **Prediction**

In [106]:
def predict_sentiment(review, threshold=0.5):
    """Classify a single movie review as "positive" or "negative".

    The review is encoded with the notebook-level ``tokenizer``, padded to the
    200-token input length, and scored by the notebook-level ``model``.

    Args:
        review: Raw review text (a single string).
        threshold: Score strictly above which the review is labelled
            "positive". Defaults to 0.5, the value previously hard-coded.

    Returns:
        "positive" if the model's score is greater than ``threshold``,
        otherwise "negative".
    """
    # The tokenizer works on a batch of texts, so wrap the single review in a list
    encoded = tokenizer.texts_to_sequences([review])

    # Pad to the same fixed length (maxlen=200) that was used for the training data
    padded = pad_sequences(encoded, maxlen=200)

    # predict() returns one row per input with a single score in it; pull that score
    # out as a plain float so the comparison below is scalar-vs-scalar instead of
    # relying on the truthiness of a one-element array.
    score = float(model.predict(padded)[0][0])

    return "positive" if score > threshold else "negative"



In [107]:
# Sample review with favourable wording, used to try out the classifier
new_review = "The cinematography was stunning, and the performances were top-notch!"

# Classify the sample with predict_sentiment and show the returned label
# ("positive" or "negative")
predicted_sentiment = predict_sentiment(new_review)
print(f"Predicted Sentiment: {predicted_sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 261ms/step
Predicted Sentiment: positive


In [108]:
# Sample review with unfavourable wording, used to try out the classifier
new_review = "The storyline was dull, and the characters were poorly developed."

# Classify the sample with predict_sentiment and show the returned label
# ("positive" or "negative")
predicted_sentiment = predict_sentiment(new_review)
print(f"Predicted Sentiment: {predicted_sentiment}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step
Predicted Sentiment: negative
