In [43]:
# Code from http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
import collections
import nltk.metrics
from nltk import scores
import random
from sklearn.cross_validation import train_test_split

In [35]:
# Seed Python's global `random` generator so later draws from it are repeatable.
random.seed(88)

In [80]:
# Inspect the corpus reader's `words` accessor; it is not called here, so the
# cell just displays the bound method.
movie_reviews.words

<bound method CategorizedPlaintextCorpusReader.words of <CategorizedPlaintextCorpusReader in '/home/yuri/nltk_data/corpora/movie_reviews'>>

In [116]:
def word_feats(words):
    """Return a bag-of-words feature dict mapping each distinct token to True."""
    return dict.fromkeys(words, True)

In [117]:
# File ids of the negative and the positive reviews in the corpus.
negids, posids = movie_reviews.fileids('neg'), movie_reviews.fileids('pos')

In [95]:
# One (feature dict, label) pair per review file, built separately per class.
negfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'neg')
    for fileid in negids
]
posfeats = [
    (word_feats(movie_reviews.words(fileids=[fileid])), 'pos')
    for fileid in posids
]

In [118]:
# Show the first three positive file ids to see the naming scheme.
posids[:3]

['pos/cv000_29590.txt', 'pos/cv001_18431.txt', 'pos/cv002_15918.txt']

In [119]:
# Number of tokens in the first positive review.
len(movie_reviews.words(fileids=['pos/cv000_29590.txt']))

862

In [121]:
# Ten consecutive tokens (positions 20-29) of the same review; punctuation
# marks show up as tokens of their own.
movie_reviews.words(fileids=['pos/cv000_29590.txt'])[20:30]

['superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(']

In [132]:
# Evaluate the fifth negative (features, label) pair; the trailing semicolon
# suppresses the cell output.
negfeats[4];

In [127]:
# The first three quarters of each class go to training, the rest to testing.
negcutoff = len(negfeats) * 3 // 4
poscutoff = len(posfeats) * 3 // 4

trainfeats = [*negfeats[:negcutoff], *posfeats[:poscutoff]]
testfeats = [*negfeats[negcutoff:], *posfeats[poscutoff:]]
print('train on %d instances, test on %d instances' % (len(trainfeats), len(testfeats)))

train on 1500 instances, test on 500 instances


In [128]:
# Fit Naive Bayes on the training split and report accuracy on the held-out split.
classifier = NaiveBayesClassifier.train(trainfeats)
print(
    'accuracy: ',
    nltk.classify.util.accuracy(classifier, testfeats),
    sep='',
)

accuracy: 0.728


In [129]:
# Print the classifier's most informative features with their pos/neg likelihood ratios.
classifier.show_most_informative_features()

Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


In [108]:
# List the names exposed by nltk.metrics; the trailing semicolon suppresses the output.
dir(nltk.metrics);

In [130]:
# Precision, recall and F-measure per label.
# refsets[label]  -> indices of test instances whose true label is `label`
# testsets[label] -> indices of test instances the classifier assigned to `label`
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testfeats):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)

print('pos precision:', scores.precision(refsets['pos'], testsets['pos']), sep='')
print('pos recall:', scores.recall(refsets['pos'], testsets['pos']), sep='')
print('pos F-measure:', scores.f_measure(refsets['pos'], testsets['pos']), sep='')
print('neg precision:', scores.precision(refsets['neg'], testsets['neg']), sep='')
print('neg recall:', scores.recall(refsets['neg'], testsets['neg']), sep='')
print('neg F-measure:', scores.f_measure(refsets['neg'], testsets['neg']), sep='')

pos precision:0.651595744680851
pos recall:0.98
pos F-measure:0.7827476038338657
neg precision:0.9596774193548387
neg recall:0.476
neg F-measure:0.6363636363636364


In [11]:
# Function to evaluate features
def evaluate_classifier(featx):
    """Train and evaluate a Naive Bayes sentiment classifier on movie_reviews.

    Every review is turned into a feature dict with ``featx``. The first three
    quarters of each class are used for training and the remaining quarter for
    testing. Accuracy, per-class precision and recall, and the classifier's
    most informative features are printed.

    Args:
        featx: Callable that takes the word sequence of one review and returns
            the feature dict handed to the classifier.

    Returns:
        None. All results are printed.
    """
    negids = movie_reviews.fileids('neg')
    posids = movie_reviews.fileids('pos')

    negfeats = [(featx(movie_reviews.words(fileids=[f])), 'neg') for f in negids]
    posfeats = [(featx(movie_reviews.words(fileids=[f])), 'pos') for f in posids]

    # Each class is split at three quarters of its OWN size, so the split stays
    # correct when the two classes contain different numbers of reviews.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:]

    classifier = NaiveBayesClassifier.train(trainfeats)

    # refsets maps each true label to the indices of test instances carrying it;
    # testsets maps each predicted label to the indices assigned to it.
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    print('accuracy:' + str(nltk.classify.util.accuracy(classifier, testfeats)))
    print('pos precision:' + str(scores.precision(refsets['pos'], testsets['pos'])))
    print('pos recall:' + str(scores.recall(refsets['pos'], testsets['pos'])))
    print('neg precision:' + str(scores.precision(refsets['neg'], testsets['neg'])))
    print('neg recall:' + str(scores.recall(refsets['neg'], testsets['neg'])))
    classifier.show_most_informative_features()

In [12]:
# Evaluate the module-level `refsets` entry for 'neg' (the one inside
# evaluate_classifier is local to that function); the semicolon hides the output.
refsets['neg'];

In [13]:
# Baseline run: plain bag-of-words features.
evaluate_classifier(word_feats)

accuracy:0.728
pos precision:0.651595744680851
pos recall:0.98
neg precision:0.9596774193548387
neg recall:0.476
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


# Stopword filtering

In [14]:
from nltk.corpus import stopwords
# NLTK's English stopword list as a set, used for membership tests by the
# stopword-filtering feature extractor.
stopset = set(stopwords.words('english'))

In [16]:
# Evaluate the stopword set; the trailing semicolon suppresses the output.
stopset;

In [17]:
def stopword_filtered_word_feats(words):
    """Return bag-of-words features, skipping tokens found in the global `stopset`."""
    return {word: True for word in words if word not in stopset}
 
# Same evaluation with stopwords removed from the features.
evaluate_classifier(stopword_filtered_word_feats)

accuracy:0.724
pos precision:0.6473684210526316
pos recall:0.984
neg precision:0.9666666666666667
neg recall:0.464
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


# Bigram Collocations

In [113]:
import itertools
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

In [114]:
def bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return features for every word plus the `n` best-scoring bigrams.

    Bigrams are ranked with `score_fn` by a BigramCollocationFinder built from
    `words`. Each word and each selected bigram tuple maps to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    feats = dict.fromkeys(words, True)
    feats.update(dict.fromkeys(top_bigrams, True))
    return feats
 
# Evaluate words plus top chi-square bigrams as features.
evaluate_classifier(bigram_word_feats)

accuracy:0.816
pos precision:0.7532051282051282
pos recall:0.94
neg precision:0.9202127659574468
neg recall:0.692
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
       ('matt', 'damon') = True              pos : neg    =     12.3 : 1.0
          ('give', 'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
    ('absolutely', 'no') = True              neg : pos    =     10.6 : 1.0


# Eliminate Low Information Features

In [78]:
# http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/
import collections, itertools
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews, stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

In [21]:
# word_fd counts every lower-cased token over the whole corpus;
# label_word_fd keeps the same counts separately per sentiment label.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

for sentiment in ('pos', 'neg'):
    for word in movie_reviews.words(categories=[sentiment]):
        lowered = word.lower()
        word_fd[lowered] += 1
        label_word_fd[sentiment][lowered] += 1

In [22]:
# Spot check for one word: its count in positive reviews, its overall count,
# the positive token total, and the token total over all labels.
(
    label_word_fd['pos']['magnificent'],
    word_fd['magnificent'],
    label_word_fd['pos'].N(),
    label_word_fd.N(),
)

(33, 37, 832564, 1583820)

In [23]:
# Token totals per label and overall; these are passed to chi_sq when scoring words.
pos_word_count, neg_word_count = label_word_fd['pos'].N(), label_word_fd['neg'].N()
total_word_count = pos_word_count + neg_word_count

(pos_word_count, neg_word_count, total_word_count)

(832564, 751256, 1583820)

In [24]:
# word_fd.items() yields (word, count) pairs; evaluated here with the output suppressed.
word_fd.items();

# Overall, positive and negative counts for a single word.
(
    word_fd['magnificent'],
    label_word_fd['pos']['magnificent'],
    label_word_fd['neg']['magnificent'],
)

(37, 33, 4)

In [25]:
# Show the signature of chi_sq: (n_ii, (n_ix, n_xi), n_xx).
help(BigramAssocMeasures.chi_sq)

Help on method chi_sq in module nltk.metrics.association:

chi_sq(n_ii, n_ix_xi_tuple, n_xx) method of builtins.type instance
    Scores bigrams using chi-square, i.e. phi-sq multiplied by the number
    of bigrams, as in Manning and Schutze 5.3.3.



In [26]:
# Score each word by how strongly its occurrence is associated with a label:
# the chi-square statistic is computed once against each label and the two are summed.
word_scores = {}
pos_fd, neg_fd = label_word_fd['pos'], label_word_fd['neg']
chi_sq = BigramAssocMeasures.chi_sq

for word, freq in word_fd.items():
    # Arguments: count of the word within the label, (count of the word
    # overall, token total of the label), token total of the corpus.
    pos_score = chi_sq(pos_fd[word], (freq, pos_word_count), total_word_count)
    neg_score = chi_sq(neg_fd[word], (freq, neg_word_count), total_word_count)
    word_scores[word] = pos_score + neg_score

In [27]:
# Evaluate the (word, score) pairs; the trailing semicolon suppresses the output.
word_scores.items();

In [135]:
# Keep the 10000 highest-scoring (word, score) pairs and collect their words
# into a set for fast membership tests.
best = sorted(word_scores.items(), key=lambda pair: pair[1], reverse=True)[:10000]
bestwords = {w for w, s in best}

# Neither expression below produces output: the first is not the last
# statement of the cell, and the second is silenced by the semicolon.
best[:15]
bestwords;

In [29]:
def best_word_feats(words):
    """Return bag-of-words features restricted to tokens in the global `bestwords`."""
    return {word: True for word in words if word in bestwords}
 
# Evaluate features restricted to the high-information words in `bestwords`.
print('evaluating best word features')
evaluate_classifier(best_word_feats)

evaluating best word features
accuracy:0.932
pos precision:0.8941605839416058
pos recall:0.98
neg precision:0.9778761061946902
neg recall:0.884
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
             fascination = True              pos : neg    =     10.3 : 1.0
              astounding = True              pos : neg    =     10.3 : 1.0
                 idiotic = True              neg : pos    =      9.8 : 1.0


In [30]:
def best_bigram_word_feats(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return the `n` best-scoring bigrams plus the high-information words.

    Bigrams are ranked with `score_fn` by a BigramCollocationFinder built from
    `words`. Single-word features come from best_word_feats. Every feature
    maps to True.
    """
    finder = BigramCollocationFinder.from_words(words)
    feats = dict.fromkeys(finder.nbest(score_fn, n), True)
    feats.update(best_word_feats(words))
    return feats
 
# Evaluate high-information words combined with the top chi-square bigrams.
print('evaluating best words + bigram chi_sq word features')
evaluate_classifier(best_bigram_word_feats)

evaluating best words + bigram chi_sq word features
accuracy:0.92
pos precision:0.9133858267716536
pos recall:0.928
neg precision:0.926829268292683
neg recall:0.912
Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
       ('matt', 'damon') = True              pos : neg    =     12.3 : 1.0
          ('give', 'us') = True              neg : pos    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0
             uninvolving = True              neg : pos    =     11.7 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
    ('absolutely', 'no') = True              neg : pos    =     10.6 : 1.0


# NLTK + Scikit-Learn