In [1]:
import nltk

def format_sentence(sent):
    return({word: True for word in nltk.word_tokenize(sent)})

print(format_sentence("The cat is very cute"))

{'The': True, 'cat': True, 'is': True, 'very': True, 'cute': True}


In [2]:
pos = []
with open("./data/pos_reviews_5k.txt") as f:
    for i in f:
        pos.append([format_sentence(i), 'pos'])
        
neg = []
with open("./data/neg_reviews_5k.txt") as f:
    for i in f:
        neg.append([format_sentence(i), 'neg'])
        
training_set_size = .8
pos_training_cutoff = int((training_set_size)*len(pos))
neg_training_cutoff = int((training_set_size)*len(neg))
        
training_set = pos[:pos_training_cutoff] + neg[:neg_training_cutoff]
test_set = pos[pos_training_cutoff:] + neg[neg_training_cutoff:]
print('train on %d instances, test on %d instances' % (len(training_set), len(test_set)))

train on 8000 instances, test on 2000 instances


In [3]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features()

Most Informative Features
               redeeming = True              neg : pos    =     24.3 : 1.0
                   Avoid = True              neg : pos    =     23.0 : 1.0
                   Payne = True              neg : pos    =     22.3 : 1.0
                   WORST = True              neg : pos    =     22.3 : 1.0
                    GTA3 = True              neg : pos    =     21.0 : 1.0
                  Batman = True              neg : pos    =     21.0 : 1.0
                    Zerg = True              pos : neg    =     19.0 : 1.0
               atrocious = True              neg : pos    =     18.3 : 1.0
                   lousy = True              neg : pos    =     17.8 : 1.0
                Splinter = True              neg : pos    =     16.3 : 1.0


In [4]:
testPos = "Dogs are awesome!"
testNeg = "This game is trash"
testDoubleNeg = "not bad"

print(classifier.classify(format_sentence(testPos)))
print(classifier.classify(format_sentence(testNeg)))
print(classifier.classify(format_sentence(testDoubleNeg)))



pos
neg
neg


In [5]:
import collections
from nltk.metrics.scores import (precision, recall, f_measure)

ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)
    
print('Positive precision: ' + str(precision(ref_sets['pos'], test_sets['pos'])))
print('Positive recall: ' + str(recall(ref_sets['pos'], test_sets['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets['pos'], test_sets['pos'])))

print('Negative precision: ' + str(precision(ref_sets['neg'], test_sets['neg'])))
print('Negative recall: ' + str(recall(ref_sets['neg'], test_sets['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets['neg'], test_sets['neg'])))

Positive precision: 0.9670658682634731
Positive recall: 0.646
Positive F-Measure: 0.7745803357314149
Negative precision: 0.7342342342342343
Negative recall: 0.978
Negative F-Measure: 0.8387650085763293


In [6]:
# Training on a medium-sized set
        
training_set_size2 = .6
pos_training_cutoff2 = int((training_set_size2)*len(pos))
neg_training_cutoff2 = int((training_set_size2)*len(neg))
        
training_set2 = pos[:pos_training_cutoff2] + neg[:neg_training_cutoff2]
test_set2 = pos[pos_training_cutoff2:] + neg[neg_training_cutoff2:]
print('train on %d instances, test on %d instances' % (len(training_set2), len(test_set2)))

train on 6000 instances, test on 4000 instances


In [7]:
classifier2 = NaiveBayesClassifier.train(training_set2)

In [8]:
ref_sets2 = collections.defaultdict(set)
test_sets2 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set2):
    ref_sets2[label].add(i)
    observed2 = classifier2.classify(feats)
    test_sets2[observed2].add(i)
    
print('Positive precision: ' + str(precision(ref_sets2['pos'], test_sets2['pos'])))
print('Positive recall: ' + str(recall(ref_sets2['pos'], test_sets2['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets2['pos'], test_sets2['pos'])))

print('Negative precision: ' + str(precision(ref_sets2['neg'], test_sets2['neg'])))
print('Negative recall: ' + str(recall(ref_sets2['neg'], test_sets2['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets2['neg'], test_sets2['neg'])))

Positive precision: 0.9434250764525994
Positive recall: 0.617
Positive F-Measure: 0.7460701330108828
Negative precision: 0.7154531946508172
Negative recall: 0.963
Negative F-Measure: 0.8209718670076727


In [9]:
# Training on a small set
        
training_set_size3 = .4
pos_training_cutoff3 = int((training_set_size3)*len(pos))
neg_training_cutoff3 = int((training_set_size3)*len(neg))
        
training_set3 = pos[:pos_training_cutoff3] + neg[:neg_training_cutoff3]
test_set3 = pos[pos_training_cutoff3:] + neg[neg_training_cutoff3:]
print('train on %d instances, test on %d instances' % (len(training_set3), len(test_set3)))

train on 4000 instances, test on 6000 instances


In [10]:
classifier3 = NaiveBayesClassifier.train(training_set3)

In [11]:
ref_sets3 = collections.defaultdict(set)
test_sets3 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set3):
    ref_sets3[label].add(i)
    observed3 = classifier3.classify(feats)
    test_sets3[observed3].add(i)
    
print('Positive precision: ' + str(precision(ref_sets3['pos'], test_sets3['pos'])))
print('Positive recall: ' + str(recall(ref_sets3['pos'], test_sets3['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets3['pos'], test_sets3['pos'])))

print('Negative precision: ' + str(precision(ref_sets3['neg'], test_sets3['neg'])))
print('Negative recall: ' + str(recall(ref_sets3['neg'], test_sets3['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets3['neg'], test_sets3['neg'])))

Positive precision: 0.9548458149779736
Positive recall: 0.289
Positive F-Measure: 0.4437052200614125
Negative precision: 0.5811076197957581
Negative recall: 0.9863333333333333
Negative F-Measure: 0.731339594661394


In [12]:
# Training on a very large set
        
training_set_size4 = .9
pos_training_cutoff4 = int((training_set_size4)*len(pos))
neg_training_cutoff4 = int((training_set_size4)*len(neg))
        
training_set4 = pos[:pos_training_cutoff4] + neg[:neg_training_cutoff4]
test_set4 = pos[pos_training_cutoff4:] + neg[neg_training_cutoff4:]
print('train on %d instances, test on %d instances' % (len(training_set4), len(test_set4)))

train on 9000 instances, test on 1000 instances


In [13]:
classifier4 = NaiveBayesClassifier.train(training_set4)

In [14]:
ref_sets4 = collections.defaultdict(set)
test_sets4 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set4):
    ref_sets4[label].add(i)
    observed4 = classifier4.classify(feats)
    test_sets4[observed4].add(i)
    
print('Positive precision: ' + str(precision(ref_sets4['pos'], test_sets4['pos'])))
print('Positive recall: ' + str(recall(ref_sets4['pos'], test_sets4['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets4['pos'], test_sets4['pos'])))

print('Negative precision: ' + str(precision(ref_sets4['neg'], test_sets3['neg'])))
print('Negative recall: ' + str(recall(ref_sets4['neg'], test_sets4['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets4['neg'], test_sets4['neg'])))

Positive precision: 0.9708454810495627
Positive recall: 0.666
Positive F-Measure: 0.7900355871886121
Negative precision: 0.08032207384131972
Negative recall: 0.98
Negative F-Measure: 0.8470181503889369
