# 7.6 Lab Session: Classification in NLTK
Classify Sentiment given document
Week 7 · Discourse and Dialogue

In this exercise we try different feature vector sizes and compare accuracy of models from each.

In [3]:
### classify documents based on keywords
from nltk.corpus import movie_reviews
import nltk
import random
# for each document in movie_reviews, get its words and category (positive/negative)
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)
## use words from all documents to define the word vector for features
# get all words from all movie_reviews and put into a frequency distribution
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print("Unique count of all words in documents: {:d}".format(len(all_words)))


Unique count of all words in documents: 39768


In [7]:
# define features (keywords) of a document
# each feature is 'contains(keyword)' and is true or false depending
# on whether that keyword is in the document
def document_features(document, word_features):
	document_words = set(document)
	features = {}
	for word in word_features:
		features['V_%s' % word] = (word in document_words)
	return features

## Model with Features: Top 2000 Terms

In [18]:
# get the 2000 most frequently appearing keywords in the corpus
n = 2000
word_items = all_words.most_common(n)
word_features = [word for (word, freq) in word_items]
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]
# training using naive Baysian classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# evaluate the accuracy of the classifier
print ("\nAccuracy of fv_size = {:d}: {:.2f}".format(n,  nltk.classify.accuracy(classifier, test_set)))


Accuracy of fv_size = 2000: 0.79


## Features: Top 3000 Terms

In [19]:
# get the 3000 most frequently appearing keywords in the corpus
n = 3000
word_items = all_words.most_common(n)
word_features = [word for (word, freq) in word_items]
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]
# training using naive Baysian classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# evaluate the accuracy of the classifier
print ("\nAccuracy of fv_size = {:d}: {:.2f}".format(n,  nltk.classify.accuracy(classifier, test_set)))


Accuracy of fv_size = 3000: 0.79


## Features: Top 1000 Terms

In [20]:
# get the 1000 most frequently appearing keywords in the corpus
n = 1000
word_items = all_words.most_common(n)
word_features = [word for (word, freq) in word_items]
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]
# training using naive Baysian classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# evaluate the accuracy of the classifier
print ("\nAccuracy of fv_size = {:d}: {:.2f}".format(n,  nltk.classify.accuracy(classifier, test_set)))


Accuracy of fv_size = 1000: 0.77


## Features: Top 4000 Terms

In [21]:
# get the 4000 most frequently appearing keywords in the corpus
n = 4000
word_items = all_words.most_common(n)
word_features = [word for (word, freq) in word_items]
# get features sets for a document, including keyword features and category feature
featuresets = [(document_features(d, word_features), c) for (d,c) in documents]
# training using naive Baysian classifier with a 95/5 split
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
# evaluate the accuracy of the classifier
print ("\nAccuracy of fv_size = {:d}: {:.2f}".format(n,  nltk.classify.accuracy(classifier, test_set)))


Accuracy of fv_size = 4000: 0.81
