# NLP Sentiment Analysis

Perform sentiment analysis using Python's NLTK (Natural Language Toolkit) library.

Use the `movie_reviews` corpus, which contains 2,000 movie reviews pre-labeled as either "positive" or "negative."

Build a **Naive Bayes classifier** - a common and effective model for text classification.

### Data Preprocessing and Feature Extraction

In [1]:
# Import modules
import nltk
import random

# Download data and tokenizer
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the movie_reviews corpus
from nltk.corpus import movie_reviews

# Create a list of (review, sentiment) tuples
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        # Add a tuple of (list_of_words, category)
        documents.append((list(movie_reviews.words(fileid)), category))

# Shuffle for better training and testing
random.shuffle(documents)

# Display results
print(f"Successfully loaded {len(documents)} documents.")

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/tereuter/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Successfully loaded 2000 documents.


In [2]:
# Clean and tokenize words
from nltk.corpus import stopwords
import string

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Get all words from all reviews, lowercase, and remove stopwords/punctuation
all_words = []
for w_list, category in documents:
    for w in w_list:
        if w.lower() not in stop_words and w.lower() not in string.punctuation:
            all_words.append(w.lower())

# Get the frequency distribution of all words
all_words_freq = nltk.FreqDist(all_words)

# Print the 20 most common words
print("Most common words:", all_words_freq.most_common(20))

# Use the most common words as features
word_features = [item[0] for item in all_words_freq.most_common(20000)]

Most common words: [('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('time', 2411), ('good', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906), ('characters', 1859), ('first', 1836), ('--', 1815), ('see', 1749), ('way', 1693)]


In [3]:
# Create a feature dictionary
def find_features(document_words):
    """
    Takes a list of words from a review and returns a dictionary
    of features indicating which of the top words are present.
    """
    words_in_doc = set(document_words)
    features = {}
    for w in word_features:
        features[w] = (w in words_in_doc)
    return features

# Create feature sets for all documents
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Display example feature set
print("Example Feature Set:")
print(featuresets[0][0])
print("Sentiment:", featuresets[0][1])

Example Feature Set:
Sentiment: neg


### Model Training

In [4]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(featuresets, test_size=0.20, random_state=113)

# Train the Naive Bayes classifier
classifier = nltk.NaiveBayesClassifier.train(data_train)

### Model Evaluation

In [5]:
# Evaluate the classifier, using unseen test data
accuracy = nltk.classify.accuracy(classifier, data_test) * 100
print(f"\nClassifier Accuracy: {accuracy:.2f}%")

# Show the most informative features
# These are the words the model found most indicative of a positive/negative label
classifier.show_most_informative_features(20)


Classifier Accuracy: 81.75%
Most Informative Features
               insulting = True              neg : pos    =     15.7 : 1.0
             outstanding = True              pos : neg    =     15.3 : 1.0
                  avoids = True              pos : neg    =     11.0 : 1.0
             mcconaughey = True              pos : neg    =     11.0 : 1.0
                    3000 = True              neg : pos    =     10.3 : 1.0
               animators = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
              schumacher = True              neg : pos    =     10.3 : 1.0
                    slip = True              pos : neg    =     10.3 : 1.0
               addresses = True              pos : neg    =      9.7 : 1.0
                  hudson = True              neg : pos    =      9.7 : 1.0
               ludicrous = True              neg : pos    =      9.6 : 1.0
               marvelous = True              

### Classify input text

In [6]:
from nltk.tokenize import word_tokenize
def classify_sentiment(text):
    """
    Classifies input text.
    """
    # Tokenize text
    words = word_tokenize(text, language="English", preserve_line=False)
    
    # Clean words (lowercase, remove stopwords/punctuation)
    clean_words = []
    for w in words:
        if w.lower() not in stop_words and w.lower() not in string.punctuation:
            clean_words.append(w.lower())
    
    # Extract features with find_features function
    features = find_features(clean_words)
    
    # Classify text
    return classifier.classify(features)

In [7]:
# See what the classifier predicts given some example sentences
test_sentence_1 = "This was an amazing movie! I loved it."
print(f"'{test_sentence_1}' -> {classify_sentiment(test_sentence_1)}")

test_sentence_2 = "I was so bored. The whole thing was slow and predictable."
print(f"'{test_sentence_2}' -> {classify_sentiment(test_sentence_2)}")

test_sentence_3 = "The movie was okay, not great but not terrible."
print(f"'{test_sentence_3}' -> {classify_sentiment(test_sentence_3)}")

'This was an amazing movie! I loved it.' -> neg
'I was so bored. The whole thing was slow and predictable.' -> neg
'The movie was okay, not great but not terrible.' -> neg
