In [1]:
import nltk
import random

from nltk.corpus import movie_reviews

import pickle

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from nltk.classify import ClassifierI
from statistics import mode 

In [2]:
class VoteClassifier(ClassifierI):
    
    def __init__(self, *classifiers):
        self._classifiers = classifiers
    
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [3]:
documents = [(list(movie_reviews.words(fileid)),category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [4]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)

In [5]:
word_features = list(all_words.keys())[:3000]

In [6]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words) 
    return features

In [7]:
#print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

In [8]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [9]:
training_set = featuresets[:1950] 
testing_set = featuresets[1950:]

In [10]:
#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle","rb")
classifier = pickle.load(classifier_f)
classifier_f.close()
print("Original Naive Bayes Algo Accuracy : ",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

Original Naive Bayes Algo Accuracy :  86.0
Most Informative Features
                   sucks = True              neg : pos    =     10.3 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
           unimaginative = True              neg : pos    =      8.4 : 1.0
                 frances = True              pos : neg    =      8.2 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                  alicia = True              neg : pos    =      7.1 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0
              schumacher = True              neg : pos    =      6.7 : 1.0
                  turkey = True              neg : pos    =      6.6 : 1.0
                 idiotic = True

In [11]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_Classifier Algo Accuracy : ",(nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB_Classifier Algo Accuracy :  76.0


In [12]:
#GaussianNB_classifier = SklearnClassifier(GaussianNB())
#GaussianNB_classifier.train(training_set)
#print("GaussianNB_classifier Algo Accuracy : ",(nltk.classify.accuracy(GaussianNB_classifier, testing_set))*100)

In [13]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier Algo Accuracy : ",(nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB_classifier Algo Accuracy :  76.0


In [14]:
#LogisticRegression, SGDClassifier
#SVC, LinearSVC, NuSVC

In [15]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier Algo Accuracy : ",(nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier Algo Accuracy :  78.0


In [16]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier Algo Accuracy : ",(nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SGDClassifier_classifier Algo Accuracy :  82.0


In [17]:
#SVC_classifier = SklearnClassifier(SVC())
#SVC_classifier.train(training_set)
#print("SVC_classifier Algo Accuracy : ",(nltk.classify.accuracy(SVC_classifier, testing_set))*100)

In [18]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier Algo Accuracy : ",(nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC_classifier Algo Accuracy :  80.0


In [19]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier Algo Accuracy : ",(nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC_classifier Algo Accuracy :  82.0


In [21]:
voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,
                                  LogisticRegression_classifier,
                                  SGDClassifier_classifier,
                                  LinearSVC_classifier,
                                  NuSVC_classifier)
print("voted_classifier Algo Accuracy : ",(nltk.classify.accuracy(voted_classifier, testing_set))*100)

print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:", voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:", voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:", voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:", voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:", voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:", voted_classifier.confidence(testing_set[5][0])*100)



voted_classifier Algo Accuracy :  84.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: neg Confidence %: 100.0
Classification: pos Confidence %: 100.0
