<a href="https://colab.research.google.com/github/whoisammmmar/Movie-Review-Sentiment-Analyzer/blob/main/Untitled9.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Review Sentiment Analyzer

In [1]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [3]:
import zipfile
import os

zip_file_path = '/content/IMDB Dataset.csv.zip'
extracted_dir_path = '/content/'

with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_dir_path)

# Verify the extracted file
extracted_file_path = os.path.join(extracted_dir_path, 'IMDB Dataset.csv')
if os.path.exists(extracted_file_path):
    print(f"File extracted successfully to: {extracted_file_path}")
else:
    print("File extraction failed.")

File extracted successfully to: /content/IMDB Dataset.csv


In [4]:
data = pd.read_csv('/content/IMDB Dataset.csv')
print(data.head())
data.head()
data.tail()

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Unnamed: 0,review,sentiment
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative
49999,No one expects the Star Trek movies to be high...,negative


In [5]:
def clean_text(text):
    text = text.lower()
    #remove HTML tags
    text = re.sub('<.*?>', '', text)
    #remove special characters
    text = re.sub('[^a-zA-Z]', ' ', text)
     #remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
data['clean_review'] = data['review'].apply(clean_text)

In [11]:
#Prepare data for LSTM
X = data['clean_review']
y = data['sentiment'].map({'positive': 1, 'negative': 0})

#Tokenization
max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [8]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))



In [9]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 85ms/step - accuracy: 0.5163 - loss: 0.6930
Test Loss: 0.6929
Test Accuracy: 0.5143
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 76ms/step
              precision    recall  f1-score   support

           0       0.51      0.39      0.44      4961
           1       0.51      0.64      0.57      5039

    accuracy                           0.51     10000
   macro avg       0.51      0.51      0.51     10000
weighted avg       0.51      0.51      0.51     10000



In [16]:
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 78ms/step
Accuracy: 0.5143
              precision    recall  f1-score   support

           0       0.51      0.39      0.44      4961
           1       0.51      0.64      0.57      5039

    accuracy                           0.51     10000
   macro avg       0.51      0.51      0.51     10000
weighted avg       0.51      0.51      0.51     10000



In [17]:
while True:
    user_input = input("Enter a movie review (or type 'exit' to quit):\n")
    if user_input.lower() == 'exit':
        break

    # Clean and preprocess input
    cleaned_input = clean_text(user_input)
    input_seq = tokenizer.texts_to_sequences([cleaned_input])
    input_pad = pad_sequences(input_seq, maxlen=max_len)

    # Predict
    prediction = model.predict(input_pad)[0][0]
    sentiment = "Positive" if prediction > 0.5 else "Negative"

    print(f"\nPredicted Sentiment: {sentiment}")
    print(f"Confidence: {prediction:.2f}\n")

Enter a movie review (or type 'exit' to quit):
exit


#We can see that our model is working fine making accurate predictions.