# NLP Sentiment Analysis

Perform sentiment analysis using Python's NLTK (Natural Language Toolkit) library.

Use the `movie_reviews` corpus, which contains 2,000 movie reviews pre-labeled as either "positive" or "negative."

Build a **Naive Bayes classifier** - a common and effective model for text classification.

### Data Preprocessing and Feature Extraction

In [1]:
# Import modules
import nltk
import random
from nltk.corpus import stopwords
from nltk import bigrams
import string

# Download data and tokenizer
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the movie_reviews corpus
from nltk.corpus import movie_reviews

# Create a list of (review, sentiment) tuples
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # Add a tuple of (list_of_words, category)
        documents.append((list(movie_reviews.words(fileid)), category))

# Shuffle for better training and testing
random.shuffle(documents)

# Process documents to include both words and bigrams
# Bigrams are helpful for cases like "not good"
processed_documents = []
stop_words = set(stopwords.words('english'))

for w_list, category in documents:
    # Clean words: lowercase, remove stopwords and punctuation
    clean_words = [w.lower() for w in w_list 
                   if w.lower() not in stop_words 
                   and w.lower() not in string.punctuation]
    
    # Create bigrams for this document
    doc_bigrams = [f"{w1}_{w2}" for w1, w2 in bigrams(clean_words)]
    
    # Combine words and bigrams for this document
    processed_documents.append((clean_words + doc_bigrams, category))

# Display results
print(f"Successfully loaded {len(documents)} documents.")
print(f"Processed {len(processed_documents)} documents with words and bigrams.")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Successfully loaded 2000 documents.
Processed 2000 documents with words and bigrams.


In [2]:
# Build feature list from processed documents (which include both words and bigrams)
all_features = []
for doc, category in processed_documents:
    all_features.extend(doc)

# Get the frequency distribution of all features
all_features_freq = nltk.FreqDist(all_features)

# Select top features - use top 5000 most common features
word_features = [word for word, count in all_features_freq.most_common(5000)]

# Print the 20 most common features
print("20 most common features:", all_features_freq.most_common(20))
print(f"Total unique features: {len(all_features_freq)}")
print(f"Selected features for classification: {len(word_features)}")

20 most common features: [('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906), ('characters', 1859), ('first', 1836), ('--', 1815), ('see', 1749), ('way', 1693)]
Total unique features: 570174
Selected features for classification: 5000


In [3]:
# Create a feature dictionary
def find_features(document_words):
    """
    Takes a list of words from a review and returns a dictionary
    of features indicating which of the top words are present.
    """
    words_in_doc = set(document_words)
    features = {}
    for w in word_features:
        features[w] = (w in words_in_doc)
    return features

# Create feature sets using processed documents (which include bigrams)
featuresets = [(find_features(rev), category) for (rev, category) in processed_documents]

print(f"Created {len(featuresets)} feature sets for training and testing.")

Created 2000 feature sets for training and testing.


### Model Training

In [4]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(featuresets, test_size=0.20, random_state=113)

# Train a Naive Bayes classifier
clf = nltk.NaiveBayesClassifier.train(data_train)

### Model Evaluation

In [5]:
# Evaluate accuracy, precision, and recall using sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score
y_test = [label for (features, label) in data_test]
y_pred = [clf.classify(features) for (features, label) in data_test]
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Precision: {precision_score(y_test, y_pred, pos_label='pos') * 100:.1f}%")
print(f"Recall: {recall_score(y_test, y_pred, pos_label='pos') * 100:.1f}%")

Accuracy: 81.2%
Precision: 88.6%
Recall: 73.9%


In [6]:
# Show the most informative features - words most indicative of a positive/negative label
clf.show_most_informative_features(20)

Most Informative Features
                   sucks = True              neg : pos    =     14.6 : 1.0
               one_worst = True              neg : pos    =     13.8 : 1.0
              one_better = True              pos : neg    =     12.7 : 1.0
               ludicrous = True              neg : pos    =     11.5 : 1.0
                 idiotic = True              neg : pos    =     10.7 : 1.0
            breathtaking = True              pos : neg    =     10.1 : 1.0
             magnificent = True              pos : neg    =     10.1 : 1.0
             outstanding = True              pos : neg    =      9.8 : 1.0
                  seagal = True              neg : pos    =      9.4 : 1.0
                  turkey = True              neg : pos    =      9.1 : 1.0
          extremely_well = True              pos : neg    =      8.8 : 1.0
               marvelous = True              pos : neg    =      8.8 : 1.0
                  finest = True              pos : neg    =      8.3 : 1.0