In [None]:
# 📌 Step 1: Imports
import numpy as np
from nltk import download
from nltk.corpus import movie_reviews, stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

# Fetch the NLTK resources this notebook reads: the movie review corpus
# and the stopword lists
for nltk_resource in ('movie_reviews', 'stopwords'):
    download(nltk_resource)

In [None]:
# 📌 Step 2: Prepare Data

# NLTK's English stopword list, held as a set for fast membership checks
stop_words = {*stopwords.words('english')}

# Build one cleaned, space-joined document string per corpus file id
def get_clean_text(fileids):
    """Return the cleaned text of each review named in ``fileids``.

    A token of a review is kept only if it is purely alphabetic and its
    lowercase form is not in the module-level ``stop_words``. Surviving
    tokens are lowercased and joined with single spaces.

    Args:
        fileids: Iterable of ``movie_reviews`` file ids.

    Returns:
        list[str]: One cleaned document per file id, in input order.
    """
    def clean_one(fileid):
        # Lowercase once per alphabetic token, then drop stopwords
        lowered = (tok.lower() for tok in movie_reviews.words(fileid) if tok.isalpha())
        return " ".join(tok for tok in lowered if tok not in stop_words)

    return [clean_one(fileid) for fileid in fileids]

# Train/Test split (same split as before)
# Number of reviews per class used for training; every file after this
# index in each class's file list is held out for testing. Kept as a single
# constant so the four slices below cannot drift apart.
N_TRAIN_PER_CLASS = 800

pos_files = movie_reviews.fileids('pos')
neg_files = movie_reviews.fileids('neg')

train_pos, test_pos = pos_files[:N_TRAIN_PER_CLASS], pos_files[N_TRAIN_PER_CLASS:]
train_neg, test_neg = neg_files[:N_TRAIN_PER_CLASS], neg_files[N_TRAIN_PER_CLASS:]

# An empty train or test slice would make the reported accuracy meaningless
assert train_pos and train_neg and test_pos and test_neg, (
    "Train/test split produced an empty subset; check N_TRAIN_PER_CLASS "
    "against the number of reviews per class."
)

# Positive files come first in both lists; the labels below follow that order
train_files = train_pos + train_neg
test_files = test_pos + test_neg

# 1.0 marks a positive review, 0.0 a negative one
train_labels = np.concatenate([np.ones(len(train_pos)), np.zeros(len(train_neg))])
test_labels = np.concatenate([np.ones(len(test_pos)), np.zeros(len(test_neg))])

# Clean text data
train_texts = get_clean_text(train_files)
test_texts = get_clean_text(test_files)


In [None]:
# 📌 Step 3: Vectorize using TF-IDF

# The vectorizer is fitted on the training reviews only; the test reviews
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(raw_documents=train_texts)
X_test = vectorizer.transform(raw_documents=test_texts)

# Shape is (number of training reviews, vocabulary size)
print(f"TF-IDF matrix shape: {X_train.shape}")


In [None]:
# 📌 Step 4: Train Naive Bayes with TF-IDF features

# Fit multinomial Naive Bayes on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be chained)
model = MultinomialNB().fit(X_train, train_labels)

# Score the classifier on the held-out reviews
preds = model.predict(X_test)
acc = accuracy_score(y_true=test_labels, y_pred=preds)

print(f"TF-IDF Naive Bayes Accuracy: {acc:.4f}")


In [None]:
# 📌 TF-IDF Vectorizer with unigrams + bigrams
# Optional step: add bigrams to the TF-IDF features. min_df=2 raises the
# minimum document frequency from the default of 1, so n-grams that occur
# in only one training review are dropped from the vocabulary.
# NOTE: this cell rebinds vectorizer, X_train, X_test, model, preds and acc.

vectorizer = TfidfVectorizer(min_df=2, ngram_range=(1, 2))

# Vocabulary and IDF weights come from the training reviews only
X_train = vectorizer.fit_transform(raw_documents=train_texts)
X_test = vectorizer.transform(raw_documents=test_texts)

print(f"TF-IDF matrix shape (with bigrams): {X_train.shape}")

# Same Naive Bayes classifier as before, now on unigram + bigram features
model = MultinomialNB().fit(X_train, train_labels)

preds = model.predict(X_test)
acc = accuracy_score(y_true=test_labels, y_pred=preds)

print(f"TF-IDF + Bigrams Naive Bayes Accuracy: {acc:.4f}")


In [None]:
# 📌 Train a Logistic Regression classifier on the TF-IDF features
# An additional step that demonstrates a different classifier on the
# unigram + bigram features built by the bigram TF-IDF cell.
# (LogisticRegression is imported with the other imports at the top.)

# The labels printed below say "Bigrams", so fail fast if the current
# vectorizer does not include bigrams (for example, the optional bigram
# cell was skipped or cells were run out of order).
assert vectorizer.ngram_range[1] >= 2, (
    "Run the unigram + bigram TF-IDF cell before this one: "
    "X_train / X_test must contain bigram features."
)

# 📌 Logistic Regression classifier
# max_iter is raised from scikit-learn's default of 100, presumably so the
# solver converges on this feature matrix without warnings.
clf = LogisticRegression(max_iter=1000)

# Train it
clf.fit(X_train, train_labels)

# Predict on test data
preds = clf.predict(X_test)

# Compute accuracy
acc = accuracy_score(test_labels, preds)

print(f"TF-IDF matrix shape (with bigrams): {X_train.shape}")
print(f"TF-IDF + Bigrams Logistic Regression Accuracy: {acc:.4f}")
