In [1]:
import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

def evaluate_classifier(featx):
	negids = movie_reviews.fileids('neg')
	posids = movie_reviews.fileids('pos')

	negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
	posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

	negcutoff = len(negfeats)*3/4
	poscutoff = len(posfeats)*3/4

	trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
	testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

	classifier = NaiveBayesClassifier.train(trainfeats)
	refsets = collections.defaultdict(set)
	testsets = collections.defaultdict(set)

	for i, (feats, label) in enumerate(testfeats):
			refsets[label].add(i)
			observed = classifier.classify(feats)
			testsets[observed].add(i)

	print 'accuracy:', nltk.classify.util.accuracy(classifier, testfeats)
	print 'pos precision:', nltk.metrics.precision(refsets['pos'], testsets['pos'])
	print 'pos recall:', nltk.metrics.recall(refsets['pos'], testsets['pos'])
	print 'neg precision:', nltk.metrics.precision(refsets['neg'], testsets['neg'])
	print 'neg recall:', nltk.metrics.recall(refsets['neg'], testsets['neg'])
	classifier.show_most_informative_features()

def word_feats(words):
	return dict([(word, True) for word in words])

print 'evaluating single word features'
evaluate_classifier(word_feats)

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for word in movie_reviews.words(categories=['pos']):
	word_fd.inc(word.lower())
	label_word_fd['pos'].inc(word.lower())

for word in movie_reviews.words(categories=['neg']):
	word_fd.inc(word.lower())
	label_word_fd['neg'].inc(word.lower())

# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()

pos_word_count = label_word_fd['pos'].N()
neg_word_count = label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

word_scores = {}

for word, freq in word_fd.iteritems():
	pos_score = BigramAssocMeasures.chi_sq(label_word_fd['pos'][word],
		(freq, pos_word_count), total_word_count)
	neg_score = BigramAssocMeasures.chi_sq(label_word_fd['neg'][word],
		(freq, neg_word_count), total_word_count)
	word_scores[word] = pos_score + neg_score

best = sorted(word_scores.iteritems(), key=lambda (w,s): s, reverse=True)[:10000]
bestwords = set([w for w, s in best])

def best_word_feats(words):
	return dict([(word, True) for word in words if word in bestwords])

print 'evaluating best word features'
evaluate_classifier(best_word_feats)

def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
	bigram_finder = BigramCollocationFinder.from_words(words)
	bigrams = bigram_finder.nbest(score_fn, n)
	d = dict([(bigram, True) for bigram in bigrams])
	d.update(best_word_feats(words))
	return d

print 'evaluating best words + bigram chi_sq word features'
evaluate_classifier(best_bigram_word_feats)

evaluating single word features
accuracy: 0.728
pos precision: 0.651595744681
pos recall: 0.98
neg precision: 0.959677419355
neg recall: 0.476
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


AttributeError: 'FreqDist' object has no attribute 'inc'

In [2]:
movie_reviews

<CategorizedPlaintextCorpusReader in u'C:\\Users\\Wotashu\\AppData\\Roaming\\nltk_data\\corpora\\movie_reviews'>

In [4]:
from textblob import TextBlob

In [5]:
text = '''
The titular threat of The Blob has always struck me as the ultimate movie
monster: an insatiably hungry, amoeba-like mass able to penetrate
virtually any safeguard, capable of--as a doomed doctor chillingly
describes it--"assimilating flesh on contact.
Snide comparisons to gelatin be damned, it's a concept with the most
devastating of potential consequences, not unlike the grey goo scenario
proposed by technological theorists fearful of
artificial intelligence run rampant.
'''

In [6]:
blob = TextBlob(text)

In [7]:
blob.tags           

[(u'The', u'DT'),
 (u'titular', u'JJ'),
 (u'threat', u'NN'),
 (u'of', u'IN'),
 (u'The', u'DT'),
 (u'Blob', u'NNP'),
 (u'has', u'VBZ'),
 (u'always', u'RB'),
 (u'struck', u'VBD'),
 (u'me', u'PRP'),
 (u'as', u'IN'),
 (u'the', u'DT'),
 (u'ultimate', u'JJ'),
 (u'movie', u'NN'),
 (u'monster', u'NN'),
 (u'an', u'DT'),
 (u'insatiably', u'RB'),
 (u'hungry', u'JJ'),
 (u'amoeba-like', u'JJ'),
 (u'mass', u'NN'),
 (u'able', u'JJ'),
 (u'to', u'TO'),
 (u'penetrate', u'VB'),
 (u'virtually', u'RB'),
 (u'any', u'DT'),
 (u'safeguard', u'VB'),
 (u'capable', u'JJ'),
 (u'of--as', u'JJ'),
 (u'a', u'DT'),
 (u'doomed', u'VBN'),
 (u'doctor', u'NN'),
 (u'chillingly', u'RB'),
 (u'describes', u'VBZ'),
 (u'it', u'PRP'),
 (u'assimilating', u'VBG'),
 (u'flesh', u'NN'),
 (u'on', u'IN'),
 (u'contact', u'NN'),
 (u'Snide', u'NNP'),
 (u'comparisons', u'NNS'),
 (u'to', u'TO'),
 (u'gelatin', u'NN'),
 (u'be', u'VB'),
 (u'damned', u'JJ'),
 (u'it', u'PRP'),
 (u"'", u'POS'),
 (u's', u'PRP'),
 (u'a', u'DT'),
 (u'concept', u'NN')

In [8]:
blob.noun_phrases

WordList([u'titular threat', 'blob', u'ultimate movie monster', u'amoeba-like mass', 'snide', u'potential consequences', u'grey goo scenario', u'technological theorists fearful', u'artificial intelligence run rampant'])

In [9]:
for sentence in blob.sentences:
    print(sentence.sentiment.polarity)

0.06
-0.341666666667
