In [37]:
#load librearies
import pandas as pd
#text preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
#feature extraction and selection
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
# classification
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [39]:
#load the dataset
df=pd.read_csv("IMDB Dataset.csv")

In [40]:
#sample of dataset
df.head(4)

Unnamed: 0,review,sentiment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,One of the other reviewers has mentioned that ...,positive,,,,,,,,,,,,
1,A wonderful little production. <br /><br />The...,positive,,,,,,,,,,,,
2,I thought this was a wonderful way to spend ti...,positive,,,,,,,,,,,,
3,Basically there's a family where a little boy ...,negative,,,,,,,,,,,,


In [43]:
# Dataset Overview: IMDB Movie Review Dataset
#working with a dataset that contains movie reviews along with their sentiment labels. 

#Column Name	Description

#review	This column contains the actual text of the movie review. It's usually unstructured (raw) text, and may include HTML tags, punctuation, mixed cases, etc.

#sentiment	This column indicates the sentiment label for the review — typically either "positive" or "negative".

# Use the review column as input and sentiment column as output:

#Text Preprocessing-->review

#Feature Extraction (TF-IDF)-->	review

#Feature Selection (Chi-Square)-->	review, sentiment

#Classification (Naive Bayes)--> review, sentiment

#Evaluation (Precision, Recall) --> Model predictions vs sentiment

In [45]:
#columns of dataset
df.columns

Index(['review', 'sentiment', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')

In [47]:
#check for null values
df.isnull().sum()

review             0
sentiment          0
Unnamed: 2     49998
Unnamed: 3     49998
Unnamed: 4     49998
Unnamed: 5     49998
Unnamed: 6     49998
Unnamed: 7     49998
Unnamed: 8     49998
Unnamed: 9     49998
Unnamed: 10    49998
Unnamed: 11    49998
Unnamed: 12    49998
Unnamed: 13    49998
dtype: int64

In [49]:
#check the shape of dataset
df.shape

(49999, 14)

In [51]:
#information of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49999 entries, 0 to 49998
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review       49999 non-null  object
 1   sentiment    49999 non-null  object
 2   Unnamed: 2   1 non-null      object
 3   Unnamed: 3   1 non-null      object
 4   Unnamed: 4   1 non-null      object
 5   Unnamed: 5   1 non-null      object
 6   Unnamed: 6   1 non-null      object
 7   Unnamed: 7   1 non-null      object
 8   Unnamed: 8   1 non-null      object
 9   Unnamed: 9   1 non-null      object
 10  Unnamed: 10  1 non-null      object
 11  Unnamed: 11  1 non-null      object
 12  Unnamed: 12  1 non-null      object
 13  Unnamed: 13  1 non-null      object
dtypes: object(14)
memory usage: 5.3+ MB


In [53]:
#description of dataset
df.describe()

Unnamed: 0,review,sentiment,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
count,49999,49999,1,1,1,1,1,1,1,1,1,1,1,1
unique,49581,3,1,1,1,1,1,1,1,1,1,1,1,1
top,Loved today's show!!! It was a variety and not...,positive,Jerry and David Zucker cast one of its stars ...,a glorious take-off of old U.S. detective sho...,America's answer to 'Inspector Clouseau'. It ...,unnoticed absurdities,and recurring characters such as Johnny the s...,but I think it was about right. The concept c...,'Police Squad' made a successful transfer to ...,when the first of the 'Naked Gun' trilogy was...,Jerry,David,"and Leslie had the last laugh.""",positive
freq,5,24999,1,1,1,1,1,1,1,1,1,1,1,1


In [55]:
#step 3 :text processing

In [57]:
# Initialize stemmer and stopwords
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

In [59]:
# Define a preprocessing function
def preprocess(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize
    words = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    filtered_words = [stemmer.stem(word) for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)

In [None]:
# Apply the preprocessing function to the review column
df['clean_review'] = df['review'].apply(preprocess)

In [None]:
# Display the first few preprocessed reviews
print(df[['review', 'clean_review']].head(4))

In [27]:
# Step 4-->Feature extraction using Tf-idf(term frequency inverse document)

In [28]:
# Convert sentiment (target) to binary labels: 0 = negative, 1 = positive
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['sentiment'])  # e.g., ['positive', 'negative'] → [1, 0]

In [29]:
# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)  # You can adjust max_features if needed
X_tfidf = tfidf.fit_transform(df['clean_review'])

In [30]:
print("Original TF-IDF shape:", X_tfidf.shape)

Original TF-IDF shape: (49999, 5000)


In [None]:
# Step 5-->Feature section using chi-square test

In [None]:
# Chi-square Feature Selection: select top 1000 features
selector = SelectKBest(score_func=chi2, k=1000)
X_selected = selector.fit_transform(X_tfidf, y)

In [None]:
print("Selected features shape:", X_selected.shape)

In [None]:
# Step 6-->Classification Using Naive Bayes
print("Split the data into training and testing sets")

print("Train the MultinomialNB model")

print("Predict sentiment labels for the test set")

In [None]:
# Split the data: 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [None]:
# Initialize and train the Multinomial Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

In [None]:
# Predict on test data
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))


In [None]:
# Step 7--> Evaluate Using Precision and Recall

In [None]:
from sklearn.metrics import precision_score, recall_score
# Calculate precision and recall
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("Precision (Positive class):", precision)
print("Recall (Positive class):", recall)

In [None]:
# Result Analysis-->

print("Precision (86.3%): Most predicted positive reviews are truly positive.")

print("Recall (87.3%): The model correctly identifies most actual positive reviews.")

print("Conclusion: The model performs well with balanced precision and recall, making it reliable for sentiment analysis.")