In [1]:
import pickle
import nltk
import random

from nltk.classify.scikitlearn import SklearnClassifier
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import ClassifierI
from nltk.corpus import movie_reviews

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from statistics import mode

In [2]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [3]:
short_pos = open("short_reviews_dataset/positive.txt", "r").read()
short_neg = open("short_reviews_dataset/negative.txt", "r").read()

In [4]:
documents = []
all_words = []

# allowing only specific types of words using nltk's POS tagger
# j is adject, r is adverb, and v is verb

# allowed_word_types = ["J","R","V"]
allowed_word_types = ["J"]

for p in short_pos.split('\n'):
    documents.append( (p, "pos") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

    
for p in short_neg.split('\n'):
    documents.append( (p, "neg") )
    words = word_tokenize(p)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

            
            
# for r in short_pos.split('\n'):
#     documents.append((r, "pos"))
    
# for r in short_neg.split('\n'):
#     documents.append((r, "neg"))

# short_pos_words = word_tokenize(short_pos)
# short_neg_words = word_tokenize(short_neg)

# for w in short_pos_words:
#     all_words.append(w.lower())
    
# for w in short_neg_words:
#     all_words.append(w.lower())
    
all_words = nltk.FreqDist(all_words)

In [5]:
word_features = list(all_words.keys())[:5000]

In [6]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
        
    return features

In [7]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

random.shuffle(featuresets)

In [8]:
training_set = featuresets[:10000]
testing_set = featuresets[10000:]

In [9]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Accuracy: ", (nltk.classify.accuracy(classifier, testing_set)*100))

Naive Bayes Accuracy:  70.33132530120481


In [10]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier Accuracy: ", (nltk.classify.accuracy(MNB_classifier, testing_set)*100))

MNB_classifier Accuracy:  71.23493975903614


In [11]:
# BernoulliNB
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier Accuracy: ", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set)*100))

BernoulliNB_classifier Accuracy:  70.18072289156626


In [12]:
# LogisticRegression
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier Accuracy: ", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set)*100))

LogisticRegression_classifier Accuracy:  70.33132530120481


In [13]:
# SGDClassifier
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier Accuracy: ", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set)*100))

SGDClassifier_classifier Accuracy:  69.87951807228916


In [14]:
# SVC
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier Accuracy: ", (nltk.classify.accuracy(SVC_classifier, testing_set)*100))

SVC_classifier Accuracy:  70.48192771084338


In [15]:
# LinearSVC
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier Accuracy: ", (nltk.classify.accuracy(LinearSVC_classifier, testing_set)*100))

LinearSVC_classifier Accuracy:  68.22289156626506


In [16]:
# # NuSVC, taking way too long to train
# NuSVC_classifier = SklearnClassifier(NuSVC())
# NuSVC_classifier.train(training_set)
# print("NuSVC_classifier Accuracy: ", (nltk.classify.accuracy(NuSVC_classifier, testing_set)*100))

In [19]:
voted_classifier = VoteClassifier(classifier,
                                  LinearSVC_classifier, 
                                  MNB_classifier, 
                                  BernoulliNB_classifier, 
                                  LogisticRegression_classifier)
print("voted_classifier Accuracy: ", (nltk.classify.accuracy(voted_classifier, testing_set)*100))
print("Classification: ", voted_classifier.classify(testing_set[0][0]), "Confidence: ", voted_classifier.confidence(testing_set[0][0]))

voted_classifier Accuracy:  70.33132530120481
Classification:  pos Confidence:  1.0


# Pickling to make classification fast and making a module out of our classifier


Things to pickle

- documents
- all_words
- word_features
- featuresets
- all the classifiers

Leave

- voted_classifier

In [27]:
p = open('pickled_files/documents.pickle', 'wb')
pickle.dump(documents, p)
p.close()

p = open('pickled_files/all_words.pickle', 'wb')
pickle.dump(all_words, p)
p.close()

p = open('pickled_files/word_features5k.pickle', 'wb')
pickle.dump(word_features, p)
p.close()

p = open('pickled_files/featuresets.pickle', 'wb')
pickle.dump(featuresets, p)
p.close()

# pickling the classifiers
p = open('pickled_files/originalnaivebayes5k.pickle', 'wb')
pickle.dump(classifier, p)
p.close()

p = open('pickled_files/MNB_classifier5k.pickle', 'wb')
pickle.dump(MNB_classifier, p)
p.close()

p = open('pickled_files/BernoulliNB_classifier5k.pickle', 'wb')
pickle.dump(BernoulliNB_classifier, p)
p.close()

p = open('pickled_files/LogisticRegression_classifier5k.pickle', 'wb')
pickle.dump(LogisticRegression_classifier, p)
p.close()

p = open('pickled_files/LinearSVC_classifier5k.pickle', 'wb')
pickle.dump(LinearSVC_classifier, p)
p.close()

p = open('pickled_files/SGDC_classifier5k.pickle', 'wb')
pickle.dump(SGDClassifier_classifier, p)
p.close()

p = open('pickled_files/SVC_classifier5k.pickle', 'wb')
pickle.dump(SVC_classifier, p)
p.close()

# Using the information above to create a module

- see "sentiment_analysis" python file

# Using the sentiment_analysis module

In [28]:
import sentiment_analysis as s

In [29]:
print(s.sentiment("This movie was awesome! The acting was great, plot was wonderful, and there were pythons...so yea!"))
print(s.sentiment("This movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10"))

('pos', 1.0)
('neg', 1.0)
