In [1]:
#TOKENIZING

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

TEXT = "Doing sentiment Analysis starting from the basics. But will it help me ? I sure think it will, why wouldn't it ?"
print(sent_tokenize(TEXT))

['Doing sentiment Analysis starting from the basics.', 'But will it help me ?', "I sure think it will, why wouldn't it ?"]


In [3]:
print(word_tokenize(TEXT))

['Doing', 'sentiment', 'Analysis', 'starting', 'from', 'the', 'basics', '.', 'But', 'will', 'it', 'help', 'me', '?', 'I', 'sure', 'think', 'it', 'will', ',', 'why', 'would', "n't", 'it', '?']


In [4]:
#STOP WORDS

In [5]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)

{'should', 'they', "wasn't", 'won', 'does', 'don', 'do', 'not', 'what', 'that', 'into', "you'd", 'where', 'having', 'we', 'who', 'theirs', 'himself', 'ourselves', 'at', "doesn't", 'doesn', 'but', 'above', "couldn't", "hadn't", 'some', 'up', "that'll", 'there', "wouldn't", 'once', 'when', 'how', 'aren', 'while', "don't", 'too', 'wouldn', 'she', 'have', 'whom', 'or', 'here', "hasn't", 'shouldn', "aren't", 'yourselves', 'itself', 'for', 'under', 'his', "mightn't", "mustn't", "shouldn't", 'ma', 'off', 'until', 'haven', 'couldn', 'most', 's', 'nor', 'these', "shan't", 'with', 'through', 'against', 'myself', 'during', 'in', 'further', 'other', 'o', "haven't", 'her', 'i', 'he', "she's", 'was', 'being', 'will', 'hers', 'am', 'been', 've', 'so', 'all', 'after', "it's", "weren't", "you've", 'if', 'of', 'out', 'll', 'weren', "didn't", 'needn', 'ain', 'can', 'yours', 'just', 'mightn', 'a', 'doing', 'didn', 'between', 'y', 'those', 'be', 'no', 'hadn', 'wasn', 'on', 'again', 'isn', "needn't", 'an', 

In [6]:
filtered_sentence = [w for w in word_tokenize(TEXT) if not w in stop_words]

In [7]:
print(filtered_sentence)

['Doing', 'sentiment', 'Analysis', 'starting', 'basics', '.', 'But', 'help', '?', 'I', 'sure', 'think', ',', 'would', "n't", '?']


In [8]:
#STEMMING

In [9]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [10]:
words = ['ran','run','running','runner','runned']
for w in words:
    print(ps.stem(w))

ran
run
run
runner
run


In [11]:
#part of speech tagging
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer
train = state_union.raw('2005-GWBush.txt')
test = state_union.raw('2006-GWBush.txt')

In [12]:
sent_tokenizer = PunktSentenceTokenizer(train)

In [13]:
tokenized = sent_tokenizer.tokenize(test)

In [14]:
tokenized

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.",
 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.',
 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.',
 '(Applause.)',
 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.',
 '31, 2006.',
 "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.",
 'We have gathered under this Capitol dome in moments of national mourning and national ach

In [15]:
import nltk
for i in tokenized[:3]:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print(tagged)

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [16]:
#NAMED ENTITY RECOGNITION

In [17]:
import nltk
for i in tokenized[:3]:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    #print(tagged)
    namedEnt = nltk.ne_chunk(tagged, binary=True)
    namedEnt.draw()

In [18]:
#LEMMATIZING

In [19]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [20]:
print(lemmatizer.lemmatize("python"))
print(lemmatizer.lemmatize("better", pos="a"))

python
good


In [21]:
#WORDNET synonyms, antonyms and word similarities

In [22]:
from nltk.corpus import wordnet

In [23]:
syns = wordnet.synsets('program')

In [24]:
print(syns[0].lemmas()[0].name())

plan


In [25]:
print(syns[1].examples())

['he proposed an elaborate program of public works', 'working mothers rely on the day care program']


In [26]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())

print(synonyms)
print(antonyms)

['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good']
['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


In [27]:
w1 = wordnet.synset('boat.n.01')
w2 = wordnet.synset('ship.n.01')
print(w1.wup_similarity(w2))

0.9090909090909091


In [28]:
#Sentiment Analysis

In [29]:
import random
from nltk.corpus import movie_reviews

In [39]:
documents = [(list(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [41]:
documents[1]

(['you',
  'may',
  'not',
  'have',
  'heard',
  'of',
  'lucas',
  ',',
  'or',
  'if',
  'you',
  'have',
  ',',
  'you',
  'might',
  'not',
  'think',
  'much',
  'of',
  'it',
  '.',
  'probably',
  'because',
  'it',
  'stars',
  'corey',
  'haim',
  ',',
  'or',
  'maybe',
  'because',
  'it',
  "'",
  's',
  'about',
  'a',
  'little',
  'geek',
  'who',
  'collects',
  'insects',
  ',',
  'or',
  'maybe',
  'because',
  'it',
  'doesn',
  "'",
  't',
  'feature',
  'slick',
  'one',
  '-',
  'liners',
  'or',
  'the',
  'chart',
  'topping',
  'soundtrack',
  'albums',
  'that',
  'were',
  'almost',
  'a',
  'requirement',
  'for',
  'teen',
  'films',
  'in',
  'the',
  '80',
  "'",
  's',
  '.',
  'however',
  ',',
  'you',
  'have',
  'been',
  'missing',
  'out',
  'on',
  'a',
  'true',
  'masterpeice',
  ',',
  'the',
  'best',
  'film',
  'of',
  '1986',
  ',',
  'and',
  'the',
  'best',
  'film',
  'about',
  'adolescent',
  'life',
  'ever',
  'made',
  '.',
  'cor

In [42]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [43]:
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [40]:
random.shuffle(documents)

In [44]:
word_features = list(all_words.keys())[:3000]

In [45]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features


In [46]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))




In [47]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [49]:
featuresets[1]

({'plot': False,
  ':': False,
  'two': True,
  'teen': True,
  'couples': False,
  'go': False,
  'to': True,
  'a': True,
  'church': False,
  'party': False,
  ',': True,
  'drink': False,
  'and': True,
  'then': False,
  'drive': False,
  '.': True,
  'they': True,
  'get': True,
  'into': True,
  'an': True,
  'accident': False,
  'one': True,
  'of': True,
  'the': True,
  'guys': False,
  'dies': False,
  'but': True,
  'his': True,
  'girlfriend': True,
  'continues': False,
  'see': False,
  'him': True,
  'in': True,
  'her': True,
  'life': True,
  'has': True,
  'nightmares': False,
  'what': True,
  "'": True,
  's': True,
  'deal': False,
  '?': False,
  'watch': False,
  'movie': False,
  '"': False,
  'sorta': False,
  'find': True,
  'out': True,
  'critique': False,
  'mind': False,
  '-': True,
  'fuck': False,
  'for': True,
  'generation': False,
  'that': True,
  'touches': False,
  'on': True,
  'very': True,
  'cool': False,
  'idea': True,
  'presents': False,

In [50]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [51]:
classifier = nltk.NaiveBayesClassifier.train(training_set)


In [52]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)

Classifier accuracy percent: 81.0


In [53]:
#saving the model using pickle
import pickle
save_classifier = open("naivebayes.pickle","wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [54]:
#you can load it using the following code when needed
# classifier_f = open("naivebayes.pickle", "rb")
# classifier = pickle.load(classifier_f)
# classifier_f.close()

In [55]:
#SklearnClassifiers
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

In [56]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier, testing_set))

BNB_classifier = SklearnClassifier(BernoulliNB())
BNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BNB_classifier, testing_set))

MultinomialNB accuracy percent: 0.82
BernoulliNB accuracy percent: 0.81


In [57]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [58]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 81.0
Most Informative Features
                   sucks = True              neg : pos    =     10.1 : 1.0
                  annual = True              pos : neg    =      9.7 : 1.0
                  justin = True              neg : pos    =      8.9 : 1.0
                 frances = True              pos : neg    =      8.4 : 1.0
           unimaginative = True              neg : pos    =      8.3 : 1.0
                  turkey = True              neg : pos    =      7.8 : 1.0
             silverstone = True              neg : pos    =      7.6 : 1.0
                 idiotic = True              neg : pos    =      7.4 : 1.0
              schumacher = True              neg : pos    =      7.3 : 1.0
                  regard = True              pos : neg    =      7.1 : 1.0
                    mena = True              neg : pos    =      6.9 : 1.0
                  shoddy = True              neg : pos    =      6.9 : 1.0
                  suvari 



LogisticRegression_classifier accuracy percent: 84.0
SGDClassifier_classifier accuracy percent: 81.0




SVC_classifier accuracy percent: 78.0
LinearSVC_classifier accuracy percent: 83.0




NuSVC_classifier accuracy percent: 83.0


In [61]:
import collections

In [62]:
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

In [64]:
for i, (feats, label) in enumerate(testing_set):
    refsets[label].add(i)
    observed = NuSVC_classifier.classify(feats)
    testsets[observed].add(i)

In [74]:
from sklearn.metrics import classification_report


In [94]:
from nltk import precision
print( 'Precision:', precision(refsets['pos'], testsets['pos']) )
p = precision(refsets['pos'], testsets['pos'])
from nltk import recall
print( 'Recall:', recall(refsets['pos'], testsets['pos']) )
r = recall(refsets['pos'], testsets['pos'])
f1pos = 2 * (p * r) / (p + r)

Precision: 0.7966101694915254
Recall: 0.8703703703703703


In [89]:
print('f1 pos:',2 * (p * r) / (p + r))

f1 pos: 0.8318584070796461


In [90]:
len(refsets['pos'])

54

In [91]:
len(refsets['neg'])

46

In [92]:
print( 'Precision:', precision(refsets['neg'], testsets['neg']) )
p = precision(refsets['neg'], testsets['neg'])
from nltk import recall
print( 'Recall:', recall(refsets['neg'], testsets['neg']) )
r = recall(refsets['neg'], testsets['neg'])
f1neg = 2 * (p * r) / (p + r)

Precision: 0.7843137254901961
Recall: 0.8695652173913043


In [95]:
f1_score = (f1pos *(len(refsets['pos'])/(len(refsets['pos'])+len(refsets['neg'])))) + (f1neg *(len(refsets['neg'])/(len(refsets['pos'])+len(refsets['neg'])))) 

In [96]:
print('f1_score is:',f1_score)

f1_score is: 0.828584983121978
