# NLP Sentiment Analysis

Perform sentiment analysis using Python's NLTK (Natural Language Toolkit) library.

Use the `movie_reviews` corpus, which contains 2,000 movie reviews pre-labeled as either "positive" or "negative."

Build a **Naive Bayes classifier** - a common and effective model for text classification.

### Data Preprocessing and Feature Extraction

In [20]:
# Import modules
import nltk
from nltk.corpus import stopwords
from nltk import bigrams, trigrams
from nltk.stem import WordNetLemmatizer
import string

# # Download data and tokenizer, if needed
# nltk.download('movie_reviews')
# nltk.download('stopwords')
# nltk.download('punkt_tab')
# nltk.download('wordnet')

# Load the movie_reviews corpus
from nltk.corpus import movie_reviews

# Create 'documents' as a list of (review, sentiment) tuples
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # Add a tuple of (list_of_words, category)
        documents.append((list(movie_reviews.words(fileid)), category))

# Process documents to include words, bigrams, and trigrams
# Bigrams (2-word phrases) and trigrams (3-word phrases) capture context
# Example: "not good" as bigram, "not very good" as trigram
processed_documents = []
stop_words = set(stopwords.words('english'))

# Keep sentiment-critical stop words (e.g. negation), removing them from stop_words set
sentiment_words = {
    'not', 'no', 'nor', 'never', 'neither', 'none', 
    'nobody', 'nothing','but', 'however', 'yet', 'although',
    'though', 'very', 'too', 'so', 'such', 'quite'
}
stop_words = stop_words - sentiment_words

# Initialize lemmatizer to reduce words to their base form
# Lemmatization groups related words: "amazing", "amazingly", "amazed" â†’ "amaze"
# This reduces feature sparsity and improves generalization
lemmatizer = WordNetLemmatizer()

# Loop through each document to process words, bigrams, and trigrams
for w_list, category in documents:
    # Convert to lowercase, lemmatize, remove stopwords and punctuation
    # Lemmatization happens AFTER lowercasing for consistency
    clean_words = [lemmatizer.lemmatize(w.lower()) for w in w_list 
                   if w.lower() not in stop_words 
                   and w.lower() not in string.punctuation]
    
    # Create bigrams (2-word combinations) to capture phrases like "not_good"
    # Bigrams help the model understand that "not good" has negative sentiment
    doc_bigrams = [f"{w1}_{w2}" for w1, w2 in bigrams(clean_words)]
    
    # Create trigrams (3-word combinations) to capture longer phrases like "not_very_good"
    # Trigrams provide even more context than bigrams for nuanced sentiment
    doc_trigrams = [f"{w1}_{w2}_{w3}" for w1, w2, w3 in trigrams(clean_words)]
    
    # Combine individual words, bigrams, and trigrams into a single feature list
    processed_documents.append((clean_words + doc_bigrams + doc_trigrams, category))

# Confirm documents were processed successfully
print(f"Successfully processed {len(processed_documents)} documents with words, bigrams, and trigrams.")

# Display an example processed document
print("Example processed document (first 20 tokens):")
print(processed_documents[0][0][:20], processed_documents[0][1])

Successfully processed 2000 documents with words, bigrams, and trigrams.
Example processed document (first 20 tokens):
['plot', 'two', 'teen', 'couple', 'go', 'church', 'party', 'drink', 'drive', 'get', 'accident', 'one', 'guy', 'dy', 'but', 'girlfriend', 'continues', 'see', 'life', 'nightmare'] neg


In [35]:
# Build a features list from processed_documents
all_features = []
for doc, category in processed_documents:
    all_features.extend(doc)

# Get the frequency distribution of all features
all_features_freq = nltk.FreqDist(all_features)

# Select the most frequent features to use for training (dimensionality reduction)
word_features = [word for word, count in all_features_freq.items() if count >= 5]

# Check results
print(f"Total unique features: {len(all_features_freq)}")
print(f"Selected features for classification: {len(word_features)}")
print("20 most common features:", all_features_freq.most_common(20))

Total unique features: 1254495
Selected features for classification: 27007
20 most common features: [('film', 11053), ('but', 8634), ('movie', 6977), ('one', 6028), ('not', 5577), ('character', 3879), ('like', 3789), ('so', 3683), ('time', 2979), ('get', 2814), ('scene', 2671), ('make', 2634), ('even', 2568), ('no', 2472), ('good', 2429), ('story', 2345), ('would', 2109), ('much', 2049), ('also', 1967), ('well', 1921)]


In [36]:
# Create a feature dictionary
def find_features(document_words):
    """
    Takes a list of words from a review and returns a dictionary
    of features indicating which of the top words are present.
    """
    # Convert document_words to a set for faster performance
    words_in_doc = set(document_words)

    # Initialize features dictionary
    features = {}

    # Loop through all the selected features (words and bigrams)
    for w in word_features:
        # If the word/bigram is in the document, set True, else False
        features[w] = (w in words_in_doc)
    
    # Return the complete features dictionary
    return features

# Call the find_features function to create feature sets for processed_documents
featuresets = [(find_features(rev), category) for (rev, category) in processed_documents]

# Confirm feature sets were created successfully
print(f"Created {len(featuresets)} feature sets for training and testing.")

# Display an example feature set
print("Example feature set (first 10 features):")
example_features, example_category = featuresets[0]
print({k: example_features[k] for k in list(example_features)[:10]}, example_category)

Created 2000 feature sets for training and testing.
Example feature set (first 10 features):
{'plot': True, 'two': True, 'teen': True, 'couple': True, 'go': True, 'church': True, 'party': True, 'drink': True, 'drive': True, 'get': True} neg


### Model Training

In [37]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(featuresets, test_size=0.20, random_state=113)

# Train a Naive Bayes classifier
clf = nltk.NaiveBayesClassifier.train(data_train)

### Model Evaluation

In [38]:
# Evaluate accuracy, precision, and recall using sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score
y_test = [label for (features, label) in data_test]
y_pred = [clf.classify(features) for (features, label) in data_test]
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.1f}%")
print(f"Precision: {precision_score(y_test, y_pred, pos_label='pos') * 100:.1f}%")
print(f"Recall: {recall_score(y_test, y_pred, pos_label='pos') * 100:.1f}%")

Accuracy: 83.0%
Precision: 85.8%
Recall: 78.9%


In [40]:
# Show the most informative features - words most indicative of a positive/negative label
clf.show_most_informative_features(20)

Most Informative Features
                  symbol = True              pos : neg    =     14.3 : 1.0
              video_game = True              neg : pos    =     13.7 : 1.0
                 idiotic = True              neg : pos    =     11.8 : 1.0
                  regard = True              pos : neg    =     11.0 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                  avoids = True              pos : neg    =     10.3 : 1.0
              fairy_tale = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                  hatred = True              pos : neg    =     10.3 : 1.0
             outstanding = True              pos : neg    =     10.3 : 1.0
               strongest = True              pos : neg    =     10.3 : 1.0
               stupidity = True              neg : pos    =     10.2 : 1.0
               atrocious = True              neg : pos    =      9.7 : 1.0