In [17]:
# Step 1: Import necessary libraries
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
import string

# Download required NLTK data
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
# Step 2: Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords and apply stemming
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    processed_tokens = [stemmer.stem(token) for token in tokens if token not in stop_words]

    return processed_tokens


In [11]:
# Step 3: Prepare dataset (Movie Reviews corpus)
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))


In [19]:
# Step 4: Feature extraction
def extract_features(words):
    return dict([(word, True) for word in words])

# Create feature sets
featuresets = [(extract_features(words), category) for (words, category) in documents]

# Step 5: Split data into train/test sets (80/20 split)
train_size = int(len(featuresets) * 0.8)
train_set = featuresets[:train_size]
test_set = featuresets[train_size:]

In [20]:
# Step 6: Train the classifier
classifier = NaiveBayesClassifier.train(train_set)


In [21]:
# Step 7: Evaluate the classifier
accuracy = nltk_accuracy(classifier, test_set)
print(f"Classifier Accuracy: {accuracy:.2%}")


Classifier Accuracy: 96.50%


In [22]:
# Step 8: Test with custom input
def predict_sentiment(text):
    processed_text = preprocess_text(text)
    features = extract_features(processed_text)
    return classifier.classify(features)

In [24]:
# Test examples
test_reviews = [
    "This movie is absolutely wonderful! Great acting and fantastic story.",
    "Terrible film with bad acting and boring plot.",
    "It was an okay movie, nothing special."
]

print("\nPredictions:")
for review in test_reviews:
    sentiment = predict_sentiment(review)
    print(f"Review: {review}\nSentiment: {sentiment}\n")


Predictions:
Review: This movie is absolutely wonderful! Great acting and fantastic story.
Sentiment: neg

Review: Terrible film with bad acting and boring plot.
Sentiment: neg

Review: It was an okay movie, nothing special.
Sentiment: neg

