# Imports

In [1]:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from random import shuffle
from statistics import mean

# NLTK Basics
Following tutorial at https://realpython.com/python-nltk-sentiment-analysis/

In [2]:
# Downloading sample data from NLTK library
nltk.download([
    "names",
    "stopwords",
    "state_union",
    "twitter_samples",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt"])

[nltk_data] Downloading package names to /Users/dylan/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/dylan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package state_union to
[nltk_data]     /Users/dylan/nltk_data...
[nltk_data]   Package state_union is already up-to-date!
[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/dylan/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/dylan/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/dylan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/dylan/nltk_data...
[nltk_data]   Packag

True

## _Compiling Data_

In [3]:
# Load corpus
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()] # Filter only for words that comprise letters
print("corpus:", words[:10])

# Load stopwords
stopwords = nltk.corpus.stopwords.words("english")
print("stopwords:", stopwords[:10])

# Remove stopwords
words = [w for w in words if w.lower() not in stopwords]

corpus: ['PRESIDENT', 'HARRY', 'S', 'TRUMAN', 'S', 'ADDRESS', 'BEFORE', 'A', 'JOINT', 'SESSION']
stopwords: ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [9]:
# Tokenisation through NLTK
text = "This is a simple text."
simple_words = nltk.word_tokenize(text)
print("tokenisation:", simple_words)

tokenisation: ['This', 'is', 'a', 'simple', 'text', '.']


## _Frequency Distributions_

In [13]:
# Build a frequency distribution
# words: list[str] = nltk.word_tokenize(nltk.corpus.state_union.words())
fd = nltk.FreqDist(words)
print("frequency distribution:", fd)
print("most common words:", fd.most_common(5))
print("tabulated:")
fd.tabulate(5)

frequency distribution: <FreqDist with 13810 samples and 180589 outcomes>
most common words: [('must', 1568), ('people', 1291), ('world', 1128), ('year', 1097), ('America', 1076)]
tabulated:
   must  people   world    year America 
   1568    1291    1128    1097    1076 


In [17]:
# Frequency distributions can be queried by word for their frequency
print("frequency of 'america':", fd["america"])
print("frequency of 'America':", fd["America"])
print("frequency of 'AMERICA':", fd["AMERICA"]) # Case sensitive

# Normalise the frequency distribution to lowercase
lower_fd = nltk.FreqDist(w.lower() for w in words)
print("frequency of 'america':", lower_fd["america"])
print("frequency of 'America':", lower_fd["America"])
print("frequency of 'AMERICA':", lower_fd["AMERICA"]) # Case sensitive

frequency of 'america': 0
frequency of 'America': 1076
frequency of 'AMERICA': 3
frequency of 'america': 1079
frequency of 'America': 0
frequency of 'AMERICA': 0


## _Extracting Concordance and Collocations_

**Concordance**: Collection of word locations along with their context

In [23]:
# Build new word list including stopwords
# .concordance() uses nltk.Text class, also can be used with word list
text = nltk.Text(nltk.corpus.state_union.words())
print("concordance (straight to console):")
text.concordance("america", lines=5) # case insensitive

# Use .concordance_list() to get the results as a list
concordance_list = text.concordance_list("america", lines=5)
print("\nconcordance_list (as list):")
for entry in concordance_list:
    print(entry.line)

concordance (straight to console):
Displaying 5 of 1079 matches:
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 

concordance_list (as list):
 would want us to do . That is what America will do . So much blood has already
ay , the entire world is looking to America for enlightened leadership to peace
beyond any shadow of a doubt , that America will continue the fight for freedom
 to make complete victory certain , America will never become a party to any pl
nly in law and in justice . Here in America , we have labored long and hard to 


In [27]:
# ntlk.Text.vocab() returns a frequency distribution, similar to nltk.FreqDist()
words : list[str] = nltk.word_tokenize("This is a sample sample text.")
text = nltk.Text(words)
fd = text.vocab() # same as nltk.FreqDist(words)
fd.tabulate(5)

sample   This     is      a   text 
     2      1      1      1      1 


**Collocations**: Series of words that frequently appear together in a given text
Bigrams, Trigrams, Quadgrams

In [30]:
# Finding n-grams
words = [w for w in nltk.corpus.state_union.words() if w.isalpha()]

print("\nbigrams:")
bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(words)
bigram_finder.ngram_fd.tabulate(3) # Using ngram_fd to obtain fd of n-grams

print("\ntrigrams:")
trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
trigram_finder.ngram_fd.tabulate(3)

print("\nquadgrams:")
quadgram_finder = nltk.collocations.QuadgramCollocationFinder.from_words(words)
quadgram_finder.ngram_fd.tabulate(3)



bigrams:
('of', 'the') ('in', 'the') ('to', 'the') 
         2599          1851          1143 

trigrams:
  ('the', 'United', 'States') ('the', 'American', 'people')        ('of', 'the', 'world') 
                          294                           185                           154 

quadgrams:
('of', 'the', 'United', 'States')         ('I', 'ask', 'you', 'to')   ('State', 'of', 'the', 'Union') 
                              110                                69                                58 


# NLTK Sentiment Analysis
**VADER**: NLTK's built-in, pretrained sentiment analyser

## _Basic Usage of VADER_

In [34]:
# Breakdown of VADER scores
# neg + neu + pos = 1
# -1 < compound < 0: negative
# compound = 0: neutral
# 0 < compound < 1: positive
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("This is a good example.")

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [37]:
# Using VADER on Twitter samples
tweets = [t.replace("://", "//") for t in nltk.corpus.twitter_samples.strings()] # raw tweets obtained as strings

def is_positive(tweet: str) -> bool:
    return sia.polarity_scores(tweet)["compound"] > 0

shuffle(tweets)
for tweet in tweets[:10]:
    print(f"{is_positive(tweet)}: {tweet}")

False: RT @faisalislam: Bingo: “@AndrewSparrow: Only 6% of #bbcqt viewers said debate might change their mind - Clegg got most switchers - http//…
True: It's not like it hurt my feelings or anything right?? :))))))))))))
False: 15 Days ago Danny took my wig and put it onto mark's head I want to go back there @thescript @TheScript_Danny :( ♥ https//t.co/9ojA3FPxKF
False: RT @joel_pearce: To all lefties voting Green, SNP, Lib Dem, Plaid etc etc: can you really deal with 5 more years of Tory gvt? I can't. #GE2…
False: The economic genius that is Margaret Farrier has made it to the Daily Record.

http//t.co/7cvSJDKKib http//t.co/CHTsLlmsUi
False: RT @mrmarksteel: Tomorrow Miliband will say 'let me tell you this, I'm not even voting for myself in case I do a deal with the SNP'.
False: A don't get how someone can be Scottish and not support SNP
True: 🌞🌞🌞 - :)))))))) stay perfect girly
True: @DaveHShaw Hi Dave, please contact our in-App support chat so that we can ensure you get it on time! :

In [42]:
# Using VADER on movie reviews
pos_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
neg_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
all_review_ids = pos_review_ids + neg_review_ids

In [39]:
# Rewrite is_positive() to obtain movie review by ID then split into sentences before using VADER to rate
def is_positive(review_id: str) -> bool:
    review = nltk.corpus.movie_reviews.raw(review_id)
    sentences = nltk.sent_tokenize(review)
    scores = [sia.polarity_scores(sentence)["compound"] for sentence in sentences]
    return mean(scores) > 0

In [43]:
# Test accuracy of VADER using is_positive() on movie reviews
shuffle(all_review_ids)
correct = 0
for review_id in all_review_ids:
    if is_positive(review_id):
        if review_id in pos_review_ids:
            correct += 1
    else:
        if review_id in neg_review_ids:
            correct += 1
accuracy = correct / len(all_review_ids)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 64.00%


## _Customising NLTK's Sentiment Analysis_
**Features** as data properties used for classification

In [50]:
# Selecting useful features
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()]) # Add names to stopwords

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"): # exclude nounds
        return False
    return True

# pos_tag() returns a list of tuples, each containing a word and its part of speech
pos_words = [word for word, tag in filter(
    skip_unwanted, nltk.pos_tag(nltk.corpus.movie_reviews.words(categories = ["pos"])) 
)]

neg_words = [word for word, tag in filter(
    skip_unwanted, nltk.pos_tag(nltk.corpus.movie_reviews.words(categories = ["neg"]))
)]

In [74]:
# Create fd for custom features
pos_fd = nltk.FreqDist(pos_words)
neg_fd = nltk.FreqDist(neg_words)

common_set = set(pos_fd).intersection(neg_fd)

for word in common_set:
    del pos_fd[word]
    del neg_fd[word]

top_pos = {word for word, count in pos_fd.most_common(100)}
top_neg = {word for word, count in neg_fd.most_common(100)}

print("top_pos:", top_pos)
print("top_neg:", top_neg)

top_pos: {'exhilarating', 'sobbing', 'textured', 'deft', 'horned', 'weaves', 'fei', 'falter', 'propelled', 'unrestrained', 'belgian', 'masterfully', 'shrek', 'organizing', 'balancing', 'sparks', 'tibbs', 'spacey', 'pun', 'ulee', 'amistad', 'lumumba', 'curdled', 'elegantly', 'addresses', 'understatement', 'embeth', 'argento', 'tale', 'vividly', 'tibetan', 'uncut', 'soviet', 'hanks', 'benefit', 'seahaven', 'kimble', 'galactic', 'ghost', 'societal', 'valjean', 'uncompromising', 'attentive', 'apostle', 'sweetback', 'broadcast', 'forceful', 'unassuming', 'claiborne', 'profile', 'en', 'perceived', 'weir', 'melancholy', 'indistinguishable', 'safely', 'unquestionably', 'kudos', 'superficially', 'unnerving', 'conveys', 'powerfully', 'lovingly', 'vertical', 'maximus', 'taxing', 'notoriously', 'motta', 'methodical', 'flynt', 'fa', 'rico', 'unzipped', 'biased', 'donkey', 'niccol', 'matches', 'pink', 'supreme', 'jedi', 'legally', 'funnest', 'brisk', 'redefines', 'danish', 'radio', 'freed', 'shangha

In [52]:
# Find bigrams for features
pos_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"]) if w.isalpha() and w not in unwanted
])
neg_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"]) if w.isalpha() and w not in unwanted
])

print("\npositive bigrams:")
print(pos_bigram_finder.ngram_fd.tabulate(3)) 

print("\nnegative bigrams:")
print(neg_bigram_finder.ngram_fd.tabulate(3))


positive bigrams:
('special', 'effects')        ('new', 'york')     ('even', 'though') 
                   179                    131                    120 
None

negative bigrams:
('special', 'effects')        ('new', 'york')     ('even', 'though') 
                   208                    118                    102 
None


## _Training and Using a Classifer_

In [54]:
# Extract features from a given piece of data
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    pos_scores = list()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_pos:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(word)["compound"])
        pos_scores.append(sia.polarity_scores(word)["pos"])

    features["mean_compound"] = mean(compound_scores)
    features["mean_pos"] = mean(pos_scores)
    features["wordcount"] = wordcount

    return features

features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in pos_review_ids
]

features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in neg_review_ids
])

In [63]:
# Training a classifier
train_count = len(features) // 4 # 25% of data for testing
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)
nltk.classify.accuracy(classifier, features[train_count:]) # Test accuracy

Most Informative Features
               wordcount = 4                 pos : neg    =      4.9 : 1.0
               wordcount = 5                 pos : neg    =      4.3 : 1.0
               wordcount = 2                 pos : neg    =      3.3 : 1.0
               wordcount = 0                 neg : pos    =      1.8 : 1.0
               wordcount = 1                 pos : neg    =      1.4 : 1.0
           mean_compound = 0.0               pos : neg    =      1.0 : 1.0
                mean_pos = 0.0               pos : neg    =      1.0 : 1.0


0.6646666666666666

In [69]:
# Test on new data
new_pos_review = "'Kingdom of the Planet of the Apes' emerges as a formidable successor to the legacy left by Caesar, \
    the venerable leader whose memory still looms large over this burgeoning saga. Situated approximately three centuries \
    after the tumultuous events of 'War for the Planet of the Apes,' this standalone entry serves as both an homage to its \
    predecessors and a bold new beginning for the franchise. In the absence of Caesar, Noa steps into the spotlight as the \
    central protagonist, carrying the weight of a new era on his shoulders. Noa's character is intricately crafted, embodying \
    a complex blend of reverence for Caesar's teachings and an audacious willingness to challenge them. This nuanced portrayal \
    of Noa is pivotal to the narrative, as his reinterpretation of Caesar's iconic mantra—shifting from 'Apes together strong' \
    to the more inclusive 'No, together strong,' with humanity included—sets the philosophical and ideological groundwork for \
    the trilogy's future trajectory. The film delves deep into the fraught dynamics between humans and apes, refusing to simplify\
    the multifaceted layers of conflict and coexistence that define their interactions. This unflinching commitment to maintaining \
    the tension provides a rich soil for character growth and development, ensuring that the audience remains invested in the journey \
    of both species. Visually, 'Kingdom of the Planet of the Apes' is a testament to the continued technical prowess that the series \
    is known for. The seamless blend of motion-capture performances and cutting-edge visual effects not only brings the apes to life \
    with astonishing realism but also serves to elevate the emotional resonance of their struggles. The narrative is further enriched \
    by themes that explore leadership, identity, and the quest for a shared future, all of which resonate deeply within the current \
    social context. The film's ability to engage with these themes while delivering a gripping and entertaining story is a testament \
    to its thoughtful script and adept direction. In conclusion, 'Kingdom of the Planet of the Apes' stands as a compelling and \
    significant chapter in the storied franchise. It is a film that honors its roots while fearlessly forging ahead, offering \
    audiences a profound and thrilling experience that not only entertains but also invites reflection on our own world. With \
    Noa at the helm, the future of this new trilogy is ripe with potential, promising a cinematic journey that will continue \
    to captivate and challenge viewers for years to come."

new_neg_review = "I was really looking forward to watching this movie but was sadly disappointed. \
    The story line was boring and so were the characters.  I couldn't feel any emotional connection \
    with the characters unlike the previous movies.  The whole script was bland. I tried to convince \
    myself it was OK but realistically it's as entertaining as watching paint dry. Sadly for me however, \
    I was not entertained by the storyline or characters. Not many other people I spoke to liked it either."

new_pos_features = extract_features(new_pos_review)
new_neg_features = extract_features(new_neg_review)

print(new_pos_features)
print(classifier.classify(new_pos_features))

print(new_neg_features)
print(classifier.classify(new_neg_features))


{'mean_compound': 0.0, 'mean_pos': 0.0, 'wordcount': 1}
pos
{'mean_compound': 0.0, 'mean_pos': 0.0, 'wordcount': 0}
neg


In [73]:
for sentence in nltk.sent_tokenize(new_pos_review):
    for word in nltk.word_tokenize(sentence):
        if word.lower() in top_pos:
            print(word)

for sentence in nltk.sent_tokenize(new_neg_review):
    for word in nltk.word_tokenize(sentence):
        if word.lower() in top_neg:
            print(word)

audacious


# Comparing Additional Classifiers