# NLP Sentiment Analysis

Perform sentiment analysis using Python's NLTK (Natural Language Toolkit) library.

Use the `movie_reviews` corpus, which contains 2,000 movie reviews pre-labeled as either positive (`pos`) or negative (`neg`).

Build a **Naive Bayes classifier** - a common and effective model for text classification.

### Data Preprocessing and Feature Extraction

In [None]:
# Import modules
import nltk
import random

# Download the corpus, the stopword list, and the tokenizer data
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Load the movie_reviews corpus
from nltk.corpus import movie_reviews

# Build one (list_of_words, category) tuple per review file
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Shuffle so the two categories are interleaved. A dedicated, seeded RNG keeps
# the document order identical on every run; with an unseeded shuffle, the
# frequency-tie ordering of the vocabulary and the contents of the
# train/test split (which uses a fixed random_state) would change per run.
SHUFFLE_SEED = 113
random.Random(SHUFFLE_SEED).shuffle(documents)

# Display results
print(f"Successfully loaded {len(documents)} documents.")

In [22]:
# Clean and tokenize words
from nltk.corpus import stopwords
from nltk import bigrams
import string

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Keep sentiment-critical words for bigrams (cases like "not good")
sentiment_words = {'not', 'no', 'never', 'nor', 'very', 'too', 'but', 'however', 'although'}
stop_words = stop_words - sentiment_words

# Set of single punctuation characters. Testing `token in string.punctuation`
# is a substring check, so multi-character tokens such as "--" or "..." would
# not be recognised as punctuation; checking every character fixes that.
punctuation_chars = set(string.punctuation)


def clean_tokens(words):
    """
    Lowercase the tokens of one review and drop stopwords and punctuation.

    A token is treated as punctuation when every character in it is a
    punctuation character (this also discards empty tokens).
    """
    cleaned = []
    for raw in words:
        token = raw.lower()
        if token in stop_words:
            continue
        if all(ch in punctuation_chars for ch in token):
            continue
        cleaned.append(token)
    return cleaned


# Collect cleaned words and their bigrams in a single pass over the reviews
# (bigrams capture sentiments like "not good")
all_words = []
all_bigrams = []
for w_list, category in documents:
    clean_words = clean_tokens(w_list)
    all_words.extend(clean_words)
    all_bigrams.extend(f"{w1}_{w2}" for w1, w2 in bigrams(clean_words))

# Combined unigram + bigram frequencies. The top entries are kept under their
# own name: they used to be assigned to `word_features` and were then
# overwritten by the unigram vocabulary below without ever being used.
all_features = all_words + all_bigrams
all_features_freq = nltk.FreqDist(all_features)
ngram_features = [item[0] for item in all_features_freq.most_common(3000)]

# Get the frequency distribution of all words
all_words_freq = nltk.FreqDist(all_words)

# Print the 20 most common words
print("20 most common words:", all_words_freq.most_common(20))

# Use the most common words as features (single tokens only)
word_features = [item[0] for item in all_words_freq.most_common(20000)]

20 most common words: [('film', 9517), ('but', 8634), ('one', 5852), ('movie', 5771), ('not', 5577), ('like', 3690), ('even', 2565), ('no', 2472), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906), ('very', 1863), ('characters', 1859)]


In [3]:
# Create a feature dictionary
def find_features(document_words, vocabulary=None):
    """
    Map every vocabulary word to whether it occurs in one review.

    Args:
        document_words: Iterable of word tokens from a single review.
        vocabulary: Optional iterable of lowercase feature words. When
            omitted, the module-level ``word_features`` list is used.

    Returns:
        A dict with one key per vocabulary word; the value is True when the
        word appears in ``document_words`` (compared case-insensitively)
        and False otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lowercased tokens, so lowercase the
    # review's tokens too; otherwise "Good" would not match "good".
    words_in_doc = {w.lower() for w in document_words}
    return {w: (w in words_in_doc) for w in vocabulary}

# Turn every (words, category) document into a (feature dict, category) pair
featuresets = []
for review_words, review_label in documents:
    featuresets.append((find_features(review_words), review_label))

Example Feature Set:
Sentiment: neg


### Model Training

In [23]:
# Hold out 20% of the feature sets for testing; the rest is used for training
from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(
    featuresets,
    test_size=0.20,
    random_state=113,  # fixed seed for the split
)

# Fit a Naive Bayes classifier on the training portion
clf = nltk.NaiveBayesClassifier.train(data_train)

### Model Evaluation

In [24]:
# Score the classifier on the held-out set with sklearn's metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score

# True labels and the classifier's predictions, in the same order
y_test = [true_label for _, true_label in data_test]
y_pred = clf.classify_many([feats for feats, _ in data_test])

# Precision and recall are reported for the 'pos' label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.1f}%")
precision = precision_score(y_test, y_pred, pos_label='pos')
print(f"Precision: {precision * 100:.1f}%")
recall = recall_score(y_test, y_pred, pos_label='pos')
print(f"Recall: {recall * 100:.1f}%")

Accuracy: 79.2%
Precision: 84.0%
Recall: 73.8%


In [18]:
# List the 20 features most indicative of a positive/negative label
clf.show_most_informative_features(n=20)

Most Informative Features
               stupidity = True              neg : pos    =     16.7 : 1.0
           unintentional = True              neg : pos    =     12.1 : 1.0
              astounding = True              pos : neg    =     11.2 : 1.0
               uplifting = True              pos : neg    =     10.5 : 1.0
             outstanding = True              pos : neg    =     10.3 : 1.0
                     sat = True              neg : pos    =     10.0 : 1.0
                    slip = True              pos : neg    =      9.8 : 1.0
                    3000 = True              neg : pos    =      9.5 : 1.0
                  seagal = True              neg : pos    =      9.5 : 1.0
               ludicrous = True              neg : pos    =      9.1 : 1.0
               affecting = True              pos : neg    =      9.1 : 1.0
             fascination = True              pos : neg    =      9.1 : 1.0
            manipulation = True              pos : neg    =      9.1 : 1.0