# Sentiment Analysis Using the Natural Language Toolkit (NLTK)

In [26]:
# Dependencies
import re, string
import random

import nltk
# nltk.download('stopwords')
# nltk.download('twitter_samples')
# nltk.download('punkt') # helps you tokenize words and sentences
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
from nltk import FreqDist
from nltk.corpus import twitter_samples
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
# English stop-word list from NLTK's stopwords corpus; passed to remove_noise() when the tweets are cleaned
stop_words = stopwords.words('english')


In [4]:
# Taking Samples: load the three files of NLTK's twitter_samples corpus via .strings().
# NOTE(review): `positive_tweets`, `negative_tweets` and `text` are not referenced again in
# the visible notebook (later cells call twitter_samples.tokenized(...) instead) — confirm
# they are still needed before keeping these loads.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

In [5]:
# Tokenized positive tweets; the exploration cells below inspect tweet_tokens[0]
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')

In [6]:
# Show the tokens of the first positive tweet
print(tweet_tokens[0])


['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


#  Normalizing the Data

 Normalization in NLP is the process of converting a word to its canonical form.

Normalization helps group together words with the same meaning but different forms. Without normalization, “ran”, “runs”, and “running” would be treated as different words, even though you may want them to be treated as the same word. Stemming and lemmatization are two popular normalization techniques; this section uses lemmatization.

In [12]:
# Tag each token of the first tweet; pos_tag yields (token, tag) pairs
print(pos_tag(tweet_tokens[0]))

[('#FollowFriday', 'JJ'), ('@France_Inte', 'NNP'), ('@PKuchly57', 'NNP'), ('@Milipol_Paris', 'NNP'), ('for', 'IN'), ('being', 'VBG'), ('top', 'JJ'), ('engaged', 'VBN'), ('members', 'NNS'), ('in', 'IN'), ('my', 'PRP$'), ('community', 'NN'), ('this', 'DT'), ('week', 'NN'), (':)', 'NN')]


From the list of tags, here is the list of the most common items and their meaning:

* NNP: Noun, proper, singular
* NN: Noun, common, singular or mass
* IN: Preposition or conjunction, subordinating
* VBG: Verb, gerund or present participle
* VBN: Verb, past participle


In [15]:
def lemmatize_sentence(tokens):
    """Lemmatize every token of one tokenized sentence.

    Each token is tagged with ``pos_tag``. Tags starting with ``NN`` are
    lemmatized as nouns (``'n'``), tags starting with ``VB`` as verbs
    (``'v'``) and every other tag as adjectives (``'a'``).

    Args:
        tokens: the tokens of a single sentence/tweet.

    Returns:
        A list holding one lemmatized token per input token, in input order.
    """
    wordnet = WordNetLemmatizer()

    def wordnet_pos(tag):
        # Collapse the tag produced by pos_tag into the three POS codes used here.
        if tag.startswith('NN'):
            return 'n'
        return 'v' if tag.startswith('VB') else 'a'

    return [wordnet.lemmatize(word, wordnet_pos(tag))
            for word, tag in pos_tag(tokens)]



['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'be', 'top', 'engage', 'member', 'in', 'my', 'community', 'this', 'week', ':)']


In [19]:
# Print every original token of the first tweet next to its lemma
for lemma, original in zip(lemmatize_sentence(tweet_tokens[0]), tweet_tokens[0]):
    print(f'{original:15} --> {lemma:10} ')

#FollowFriday   --> #FollowFriday 
@France_Inte    --> @France_Inte 
@PKuchly57      --> @PKuchly57 
@Milipol_Paris  --> @Milipol_Paris 
for             --> for        
being           --> be         
top             --> top        
engaged         --> engage     
members         --> member     
in              --> in         
my              --> my         
community       --> community  
this            --> this       
week            --> week       
:)              --> :)         


# Removing Noise from the Data 

Remove noise from the dataset. Noise is any part of the text that does not add meaning or information to data.

**Removing Hyperlinks/Twitter handles/Punctuations/Special characters/Stopwords**

Noise is specific to each project, so what constitutes noise in one project may not be in a different project. For instance, the most common words in a language are called stop words. Some examples of stop words are “is”, “the”, and “a”. They are generally irrelevant when processing language, unless a specific use case warrants their inclusion.

In [22]:

def remove_noise(tweet_tokens, stop_words=()):
    """Clean one tokenized tweet and return its normalized tokens.

    For every token, in order:

    1. hyperlinks and ``@handles`` are stripped from the token,
    2. the remainder is lemmatized according to its POS tag
       (``NN*`` -> noun, ``VB*`` -> verb, anything else -> adjective),
    3. the token is dropped if it is empty, occurs as a substring of
       ``string.punctuation``, or its lower-cased form is in ``stop_words``,
    4. surviving tokens are lower-cased.

    Args:
        tweet_tokens: the tokens of a single tweet.
        stop_words: collection of lower-case words to discard (default: none).

    Returns:
        A list of cleaned, lower-cased tokens in their original order.
    """
    # Raw strings: the previous non-raw literal contained "\(" and "\)", which are
    # invalid string escapes (a warning on current Python). The pattern text is unchanged.
    url_pattern = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                   r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    handle_pattern = r'(@[A-Za-z0-9_]+)'

    # Loop-invariant: build the lemmatizer once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    cleaned_tokens = []

    # Tagging runs on the untouched tokens, before links/handles are stripped.
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub(url_pattern, '', token)
        token = re.sub(handle_pattern, '', token)
        if not token:
            # Token was only a link or a handle: nothing left to lemmatize or keep.
            continue

        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)

        # `in string.punctuation` is a substring test, so besides single punctuation
        # characters it also drops runs that appear contiguously in that string (e.g. "()").
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [25]:
# Clean the first positive tweet, discarding the English stop words
print(remove_noise(tweet_tokens[0], stop_words))

['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']


In [30]:
# Cleaning the tweets

In [28]:
# Tokenize both labelled corpora, then run every tweet through remove_noise
positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

positive_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in positive_tweet_tokens
]
negative_cleaned_tokens_list = [
    remove_noise(tweet, stop_words) for tweet in negative_tweet_tokens
]


Look at the Difference

In [31]:
# Same tweet before and after cleaning
print(positive_tweet_tokens[500])
print(positive_cleaned_tokens_list[500])

['Dang', 'that', 'is', 'some', 'rad', '@AbzuGame', '#fanart', '!', ':D', 'https://t.co/bI8k8tb9ht']
['dang', 'rad', '#fanart', ':d']


#  Determining Word Density

A basic form of analysis on textual data is computing word frequencies. A single tweet is too small to reveal a meaningful distribution of words, so the frequency analysis is done over all positive tweets.



In [33]:

def get_all_words(cleaned_tokens_list):
    """Lazily flatten a list of token lists into one stream of tokens.

    Args:
        cleaned_tokens_list: iterable of token lists, one per tweet.

    Yields:
        Every token of every tweet, in order of appearance.
    """
    for tweet in cleaned_tokens_list:
        yield from tweet

# Materialize the words: a bare generator is exhausted after its first use, so
# re-running the frequency cell would silently count nothing.
all_pos_words = list(get_all_words(positive_cleaned_tokens_list))

In [34]:
# Count how often each cleaned token occurs in the positive tweets and show the 10 most common
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


# Preparing Data for the Model

In [47]:
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each cleaned tweet into a ``{token: True}`` feature dict.

    Args:
        cleaned_tokens_list: iterable of token lists, one per tweet.

    Yields:
        One dict per tweet that maps each distinct token to ``True``.
    """
    for cleaned_tweet in cleaned_tokens_list:
        yield dict.fromkeys(cleaned_tweet, True)

# Materialize the feature dicts: a generator can only be consumed once, so with bare
# generators re-running the dataset cell would silently build empty datasets.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

In [48]:

# Attach the class label to every feature dict
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]

dataset = positive_dataset + negative_dataset

# Shuffle with a fixed seed (private RNG, global `random` state untouched) so the
# train/test split and the reported accuracy are reproducible across runs.
SEED = 42
random.Random(SEED).shuffle(dataset)

TRAIN_SIZE = 7000  # examples used for training; the remainder is held out for testing

# Fail loudly instead of training on an empty or truncated dataset (e.g. if the
# feature iterables above were already consumed by an earlier run of this cell).
assert len(dataset) > TRAIN_SIZE, f"expected more than {TRAIN_SIZE} examples, got {len(dataset)}"

train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

# Building and Testing the Model

In [51]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Train on the training split, then score on the held-out split
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints its table itself and returns None, so wrapping
# it in print() only appended a stray "None" line to the output.
classifier.show_most_informative_features(10)

Accuracy is: 0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2043.4 : 1.0
                      :) = True           Positi : Negati =   1006.2 : 1.0
                     sad = True           Negati : Positi =     24.6 : 1.0
                     bam = True           Positi : Negati =     24.0 : 1.0
                followed = True           Negati : Positi =     23.3 : 1.0
                follower = True           Positi : Negati =     22.2 : 1.0
                    sick = True           Negati : Positi =     20.7 : 1.0
                     x15 = True           Negati : Positi =     17.4 : 1.0
                    glad = True           Positi : Negati =     17.3 : 1.0
                   enjoy = True           Positi : Negati =     15.2 : 1.0
None


Test

In [59]:
from nltk.tokenize import word_tokenize

custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."
# custom_tweet = 'Congrats #SportStar on your 7th best goal from last season winning goal of the year :) #Baller #Topbin #oneofmanyworldies'

# Pass stop_words so the custom tweet is cleaned the same way as the training tweets were
custom_tokens = remove_noise(word_tokenize(custom_tweet), stop_words)

# Build the same {token: True} feature dict the classifier was trained on
print(classifier.classify(dict([token, True] for token in custom_tokens)))

Negative


# Save the model

In [60]:
import os
import pickle

MODEL_PATH = 'saved_model/nltk_sentiment_classifier.pickle'

# Create the target directory if it is missing; open() would otherwise raise FileNotFoundError
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# The with-block closes the file even if pickle.dump raises
with open(MODEL_PATH, 'wb') as f:
    pickle.dump(classifier, f)

In [56]:
# # Loading model (only unpickle files you created yourself: pickle.load can execute arbitrary code)
# f = open('saved_model/nltk_sentiment_classifier.pickle', 'rb')
# classifier = pickle.load(f)
# f.close()