In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [2]:
# Assuming you have the IMDb dataset in a CSV file named "imdb_dataset.csv"
data = pd.read_csv("imdb.csv")

# Check the structure of the dataset
print(data.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [3]:
# Drop any rows with missing values (optional)
data.dropna(inplace=True)

# Convert text to lowercase
data['review'] = data['review'].apply(lambda x: x.lower())

# Remove punctuation
data['review'] = data['review'].str.replace('[^\w\s]', '')

# Tokenization (using NLTK)
nltk.download('punkt')
data['tokens'] = data['review'].apply(lambda x: word_tokenize(x))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ruhal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [4]:
# Remove stopwords (common words like 'the', 'is', etc.)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
data['tokens'] = data['tokens'].apply(lambda x: [word for word in x if word not in stop_words])

# Join the tokens back to form sentences
data['cleaned_review'] = data['tokens'].apply(lambda x: ' '.join(x))

# Split the data into training and testing sets
x = data['cleaned_review']
y = data['sentiment']  # Assuming you have a 'sentiment' column with labels (positive/negative)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ruhal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [5]:
# Vectorize the text data
vectorizer = CountVectorizer()
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

# Create and train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(x_train_vectorized, y_train)

# Make predictions on the test set
y_pred = classifier.predict(x_test_vectorized)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 85.87%

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.88      0.86      4961
    positive       0.87      0.84      0.86      5039

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000


Confusion Matrix:
 [[4349  612]
 [ 801 4238]]
