In [4]:
import nltk
from nltk.corpus import movie_reviews
import random
import re
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Download NLTK resources
nltk.download('movie_reviews')
nltk.download('stopwords')

# Get all words from the movie reviews and create a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Get the 2000 most common words as in the original implementation
words_features_raw = list(all_words)[:2000]

print("Original first 20 words:")
print(words_features_raw[:20])

# Step 1: Remove punctuation using regex
words_features_no_punct = []
for word in words_features_raw:
    # Remove punctuation from each word
    cleaned_word = re.sub(r'[^\w\s]', '', word)
    if cleaned_word:  # Add only if the word is not empty after cleaning
        words_features_no_punct.append(cleaned_word)

print("\nAfter removing punctuation (first 20):")
print(words_features_no_punct[:20])

# Step 2: Apply stemming
porter = PorterStemmer()
words_features_stemmed = [porter.stem(word) for word in words_features_no_punct]

print("\nAfter stemming (first 20):")
print(words_features_stemmed[:20])

# Step 3: Remove stop words
stop = stopwords.words('english')
words_features = [word for word in words_features_stemmed if word not in stop]

print("\nAfter removing stop words (first 20):")
print(words_features[:20])

# Now continue with the original procedure as in LN5

# Function to extract features from documents
def document_features(document):
    document_words = set(document)
    features = {}
    for word in words_features:
        features[word] = (word in document_words)
    return features

# Prepare documents as in the original notebook
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

# Shuffle documents
random.shuffle(documents)

# Create feature sets
featuresets = [(document_features(d), c) for (d, c) in documents]

# Split into training and test sets as in the original
train_set, test_set = featuresets[100:], featuresets[:100]

# Train the classifier
model = nltk.NaiveBayesClassifier.train(train_set)

# Print classification result for the first test document
print("\nClassification of first test document:")
print(f"Predicted: {model.classify(test_set[0][0])}")
print(f"Actual: {test_set[0][1]}")

# Calculate and print accuracy
accuracy = nltk.classify.accuracy(model, test_set)
print(f"\nAccuracy: {accuracy:.4f}")

# Show most informative features
print("\nMost informative features:")
model.show_most_informative_features(5)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original first 20 words:
[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for']

After removing punctuation (first 20):
['the', 'a', 'and', 'of', 'to', 'is', 'in', 's', 'it', 'that', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on']

After stemming (first 20):
['the', 'a', 'and', 'of', 'to', 'is', 'in', 's', 'it', 'that', 'as', 'with', 'for', 'hi', 'thi', 'film', 'i', 'he', 'but', 'on']

After removing stop words (first 20):
['hi', 'thi', 'film', 'one', 'movi', 'wa', 'ha', 'like', 'even', 'onli', 'good', 'time', 'stori', 'would', 'much', 'charact', 'also', 'get', 'two', 'well']

Classification of first test document:
Predicted: pos
Actual: pos

Accuracy: 0.6900

Most informative features:
Most Informative Features
                  seagal = True              neg : pos    =     11.7 : 1.0
                   mulan = True              pos : neg    =      8.3 : 1.0
                   damon = True             