<a href="https://colab.research.google.com/github/vaish1024/Movie_Review_Sentiment_Analysis/blob/main/movie_review_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q nltk scikit-learn tensorflow tensorflow-datasets

import re
import string
import nltk
import numpy as np
import tensorflow_datasets as tfds

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def handle_negations(text):
    """Fuse each negation word with the token that follows it.

    The text is split on whitespace. Whenever a token is one of the
    negation words and another token follows, the pair is emitted as a
    single ``negation_follower`` token (``not good`` -> ``not_good``).

    Args:
        text: Whitespace-separated string to rewrite.

    Returns:
        The rewritten tokens joined by single spaces. A negation word in
        the final position is kept as-is, and a token that was consumed as
        a follower is never treated as a negation itself
        (``no not never`` -> ``no_not never``).
    """
    negators = ("not", "no", "never", "n't")
    tokens = text.split()
    total = len(tokens)

    merged = []
    pos = 0
    while pos < total:
        current = tokens[pos]
        has_follower = pos + 1 < total
        if has_follower and current in negators:
            # Consume two tokens at once: the negation and the word it modifies.
            merged.append(current + '_' + tokens[pos + 1])
            pos += 2
        else:
            merged.append(current)
            pos += 1
    return ' '.join(merged)

# Contractions whose stem changes when "n't" is split off ("can't" is not "ca" + "not").
_IRREGULAR_NEGATIONS = {"can't": "can not", "won't": "will not", "shan't": "shall not"}

# Filled on first use so the stop-word corpus is read once, not once per review.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return ``(stop_words, stemmer)``, building both on the first call only."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES["stemmer"] = PorterStemmer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["stemmer"]


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps, in order:
      1. Lower-case the text.
      2. Replace HTML tags with a space.
      3. Expand ``n't`` contractions into a separate ``not`` token.
      4. Replace every non-letter, non-whitespace character with a space.
      5. Join negation words to the following word via ``handle_negations``.
      6. Drop English stop words and Porter-stem what remains.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned tokens joined by single spaces; ``""`` if nothing survives.
    """
    # Treat the typographic apostrophe like the ASCII one so step 3 sees both.
    text = text.lower().replace("\u2019", "'")

    # A tag becomes a space, not "", so "great<br />movie" stays two words.
    text = re.sub(r"<.*?>", " ", text)

    # Expand contractions while the apostrophe still exists. Stripping
    # punctuation first would leave fragments such as "didn t", which contain
    # no negation word for handle_negations to attach to the next token.
    text = re.sub(
        r"\b(?:can't|won't|shan't)\b",
        lambda match: _IRREGULAR_NEGATIONS[match.group(0)],
        text,
    )
    text = re.sub(r"n't\b", " not", text)

    text = re.sub(r"[^a-zA-Z\s]", " ", text)
    text = handle_negations(text)

    stop_words, stemmer = _text_resources()
    return ' '.join(stemmer.stem(word) for word in text.split() if word not in stop_words)


In [None]:
# Load the IMDb reviews and clean every review in both splits.
def _collect_split(dataset):
    """Return ``(texts, labels)`` for one split, with each review preprocessed.

    Each record's text arrives as bytes, is decoded as UTF-8 and passed
    through ``preprocess_text``; labels are collected unchanged.
    """
    texts, labels = [], []
    for raw_text, raw_label in tfds.as_numpy(dataset):
        texts.append(preprocess_text(raw_text.decode("utf-8")))
        labels.append(raw_label)
    return texts, labels


train_ds, test_ds = tfds.load("imdb_reviews", split=["train", "test"], as_supervised=True)

train_texts, train_labels = _collect_split(train_ds)
test_texts, test_labels = _collect_split(test_ds)




Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MIPTKZ_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MIPTKZ_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.MIPTKZ_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
# Tunable settings for the feature extractor and the classifier.
MAX_TFIDF_FEATURES = 5000      # upper bound on the TF-IDF vocabulary size
MAX_SOLVER_ITERATIONS = 1000   # iteration cap passed to LogisticRegression

# Fit the vocabulary on the training reviews only, then reuse it for the test
# reviews so both matrices share the same columns.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# The fit call is left as the final expression so the cell displays the
# fitted estimator.
model = LogisticRegression(max_iter=MAX_SOLVER_ITERATIONS)
model.fit(X_train, train_labels)


In [None]:
# Predict labels for the test-split reviews and print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test)
report = classification_report(y_true=test_labels, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.88      0.88      0.88     12500
           1       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [None]:
# Classify a user input review

def predict_sentiment(review):
    """Classify a single raw review with the fitted vectorizer and model.

    Args:
        review: Raw review text; it is cleaned with ``preprocess_text``
            before being vectorised.

    Returns:
        ``"Positive 😊"`` when the model predicts label 1, otherwise
        ``"Negative 😞"``.
    """
    cleaned = preprocess_text(review)
    # transform() expects a collection of documents, hence the one-item list.
    features = vectorizer.transform([cleaned])
    label = model.predict(features)[0]
    if label == 1:
        return "Positive 😊"
    return "Negative 😞"

# Example usage: read one review from the user and report the predicted label.
user_review = input("Enter a movie review: ")
if user_review.strip():
    result = predict_sentiment(user_review)
    print(f"Sentiment: {result}")
else:
    # A blank review has no words to score, so skip the prediction.
    print("No review entered; nothing to classify.")

# Reviews to try, with the label noted for each. These are kept as comments:
# a bare string literal at the end of a cell is echoed as the cell's output.
#   It was a good movie          (positive)
#   It was not good              (negative)
#   Awesome movie                (positive)
#   Waste of time                (negative)
#   fantastic movie              (positive)
#   Worst movie ever             (negative)
#   Absolutely loved it          (positive)
#   I hated the movie so much    (negative)
#   Awesome....Loved it          (positive)



Enter a movie review: awesome....Loved it
Sentiment: Positive 😊


'\nExample: It was a good movie(positive)\nIt was not good(negative)\nAwesome movie(positive)\nWaste of time (negative)\nfantastic movie(positive)\nWorst movie ever(negative)\nAbsolutely loved it(positive)\nI hated the movie so much(negative)\n'