In [1]:
# Import basic libraries for data handling
import pandas as pd
import numpy as np

# Regular expression library for text cleaning
import re

# Libraries for model building and evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


In [3]:
# Load the dataset containing movie reviews
df = pd.read_csv('IMDB Dataset.csv')

# Display first few rows of the dataset
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Check dataset information such as column names and data types
df.info()

# Check number of positive and negative reviews
df['sentiment'].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [5]:
# Convert sentiment labels to numeric values
# positive -> 1
# negative -> 0
df['sentiment'] = df['sentiment'].map({
    'positive': 1,
    'negative': 0
})


In [6]:
# Function to clean text data
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)

    return text


In [7]:
# Apply text cleaning to all reviews
df['clean_review'] = df['review'].apply(clean_text)


In [8]:
# X contains cleaned text reviews
X = df['clean_review']

# y contains sentiment labels
y = df['sentiment']


In [9]:
# Split data into training and testing sets
# 80% data for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)


In [10]:
# TF-IDF converts text data into numerical vectors
# stop_words removes common words like 'the', 'is', etc.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Fit TF-IDF on training data and transform both train and test data
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [11]:
# Create Logistic Regression model
# max_iter is increased for better convergence
lr_model = LogisticRegression(max_iter=1000)

# Train the model using training data
lr_model.fit(X_train_tfidf, y_train)


In [12]:
# Predict sentiment for test data
y_pred = lr_model.predict(X_test_tfidf)

# Display confusion matrix
print(confusion_matrix(y_test, y_pred))

# Display precision, recall and F1-score
print(classification_report(y_test, y_pred))

[[4296  665]
 [ 500 4539]]
              precision    recall  f1-score   support

           0       0.90      0.87      0.88      4961
           1       0.87      0.90      0.89      5039

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [13]:
# Function to predict sentiment of custom text input
def predict_sentiment(text):
    cleaned_text = clean_text(text)
    vector = tfidf.transform([cleaned_text])
    prediction = lr_model.predict(vector)

    if prediction[0] == 1:
        return "Positive Review"
    else:
        return "Negative Review"


In [16]:
# Test the model on custom inputs

print(predict_sentiment("The movie was amazing and I really enjoyed it"))
print(predict_sentiment("The movie was boring and not worth watching"))


Positive Review
Negative Review


In [17]:
print(predict_sentiment("It was okay, not great but not terrible"))
print(predict_sentiment("I hated every minute of this movie"))
print(predict_sentiment("One of the best films I have seen"))


Negative Review
Negative Review
Positive Review
