# Scikit-Learn Sklearn with NLTK

In [1]:
import nltk 
import random
from nltk.corpus import movie_reviews
import pickle

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []

for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

word_features = list(all_words.keys())[:3000]

In [3]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [4]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [5]:
# set that we'll train our classifier with
training_set = featuresets[:1900]

# set that we'll test against.
testing_set = featuresets[1900:]

In [6]:
#classifier = nltk.NaiveBayesClassifier.train(training_set)
classifier_f = open("naivebayes.pickle", "rb")
classifier = pickle.load(classifier_f)
classifier_f.close()

In [7]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier,testing_set))*100)

Classifier accuracy percent: 65.0


We've seen by now how easy it can be to use classifiers out of the box, and now we want to try some more! The best module for Python to do this with is the Scikit-learn (sklearn) module.

Luckily for us, the people behind NLTK forsaw the value of incorporating the sklearn module into the NLTK classifier methodology. As such, they created the SklearnClassifier API of sorts. To use that, you just need to import it like:





In [8]:
from nltk.classify.scikitlearn import SklearnClassifier

From here, you can use just about any of the sklearn classifiers. For example, lets bring in a couple more variations of the Naive Bayes algorithm:

In [9]:
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

With this, how might we use them? It turns out, this is very simple:

# MultinomialNB

In [10]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MultinomialNB accuracy percent:",nltk.classify.accuracy(MNB_classifier,testing_set))

MultinomialNB accuracy percent: 0.73


# BernoulliNB

In [11]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB accuracy percent:",nltk.classify.accuracy(BernoulliNB_classifier,testing_set))

BernoulliNB accuracy percent: 0.69


It is as simple as that. Let's bring in some more:



In [12]:
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

Now, all of our classifiers should look something like:

In [13]:
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)

MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)

SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

Original Naive Bayes Algo accuracy percent: 65.0
MNB_classifier accuracy percent: 73.0
BernoulliNB_classifier accuracy percent: 69.0
LogisticRegression_classifier accuracy percent: 77.0
SGDClassifier_classifier accuracy percent: 69.0
SVC_classifier accuracy percent: 46.0
LinearSVC_classifier accuracy percent: 74.0
NuSVC_classifier accuracy percent: 73.0


So, we can see SVC is wrong more often than it is right right out of the gate, so we should probably dump that one. But then what? The next thing we can try is to use all of these algorithms at once. An algo of algos! To do this, we can create another classifier, and make the result of that classifier based on what the other algorithms said. Sort of like a voting system, so we'll just need an odd number of algorithms.

In [15]:
testing_set

[({'browbeats': False,
   'midler': False,
   'chidduck': False,
   'pain': False,
   '_don': False,
   'byron': False,
   'ney': False,
   'oldest': False,
   'window': False,
   'beside': False,
   'goody': False,
   'solvable': False,
   'installing': False,
   'clue': False,
   '_everybody_': False,
   'schnazzy': False,
   'kentucky': False,
   'compardre': False,
   'lionel': False,
   'banner': False,
   'irradiation': False,
   'neccessary': False,
   'piaf': False,
   'overtones': False,
   'apathy': False,
   'contents': False,
   'stretches': False,
   'surpassing': False,
   'brainard': False,
   'symbolised': False,
   'posessed': False,
   'obtain': False,
   'droney': False,
   'rahul': False,
   'serviceable': False,
   'reasoning': False,
   'pidgeonhole': False,
   'sensuous': False,
   'vander': False,
   'brevity': False,
   'hostility': False,
   'corso': False,
   'styled': False,
   'frame': False,
   'jedda': False,
   'breakups': False,
   'cameraderie': False,