# Sentiment Analysis

## Import Libraries

In [5]:
import nltk

# simple analyzer
from nltk.sentiment import SentimentIntensityAnalyzer

# advanced analyzer
from nltk.corpus import movie_reviews
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
import random

## Simple Analyzer
- This example uses an existing pre-trained model from NLTK
- `vader_lexicon` labels words as positive or negative
- `sia` uses the Vader lexicon and calculates a total polarity score for the text

In [6]:
nltk.download('vader_lexicon')

def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment = sia.polarity_scores(text)
    print(sentiment)

analyze_sentiment('NLTK is a great library for Natural Language Processing!')

{'neg': 0.0, 'neu': 0.465, 'pos': 0.535, 'compound': 0.784}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Me\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


- `neg` is negative score
- `neu` is neutral score
- `pos` is positive score
- `compound` is the net score for the whole text

## Advanced Analyzer

### Preprocess Dataset
- Cons to using pre-trained model
    - Can be biased
    - May be computationally expensive to train and use
    - May not be well-suited to the domain of the task
- We can create our own analyzer
- We use NLTK's `movie_reviews` dataset
- Then we create a list of document tuples
    - List of words
    - Category
- Finally, we shuffle the documents to limit bias

In [7]:
nltk.download('movie_reviews')

documents = [
    (list(movie_reviews.words(file_id)), category)
    for category in movie_reviews.categories()
    for file_id in movie_reviews.fileids(category)
]

random.shuffle(documents)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Me\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


### Feature Extractor
- We find the 2,000 most common words among all the movie reviews
- To extract features, we indicate if that word is in the corpus or not

In [8]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]

def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features


### Train Classifier

In [9]:
feature_sets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = feature_sets[100:], feature_sets[:100]
classifier = NaiveBayesClassifier.train(train_set)

### Evaluate Classifier
- Print the final accuracy
- Show top 5 most important features

In [10]:
print(accuracy(classifier, test_set))
classifier.show_most_informative_features(5)

0.82
Most Informative Features
   contains(outstanding) = True              pos : neg    =     11.1 : 1.0
        contains(seagal) = True              neg : pos    =      7.4 : 1.0
   contains(wonderfully) = True              pos : neg    =      7.3 : 1.0
         contains(mulan) = True              pos : neg    =      7.0 : 1.0
         contains(damon) = True              pos : neg    =      5.9 : 1.0


- The word `outstanding` indicates a high chance that the sentence is positive
- On the other hand, `seagal` most likely indicates a negative sentence 