In [1]:
import nltk

def format_sentence(sent):
    return({word: True for word in nltk.word_tokenize(sent)})

print(format_sentence("The cat is very cute"))

{'The': True, 'cat': True, 'is': True, 'very': True, 'cute': True}


In [2]:
pos = []
with open("./data/pos_reviews.txt") as f:
    for i in f:
        pos.append([format_sentence(i), 'pos'])
        
neg = []
with open("./data/neg_reviews.txt") as f:
    for i in f:
        neg.append([format_sentence(i), 'neg'])
        
training_set_size = .8
pos_training_cutoff = int((training_set_size)*len(pos))
neg_training_cutoff = int((training_set_size)*len(neg))
        
training_set = pos[:pos_training_cutoff] + neg[:neg_training_cutoff]
test_set = pos[pos_training_cutoff:] + neg[neg_training_cutoff:]
print('train on %d instances, test on %d instances' % (len(training_set), len(test_set)))

train on 7226 instances, test on 1808 instances


In [3]:
from nltk.classify import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_set)
classifier.show_most_informative_features()

Most Informative Features
                Superman = True              neg : pos    =     46.5 : 1.0
                   AVOID = True              neg : pos    =     40.3 : 1.0
            unacceptable = True              neg : pos    =     40.3 : 1.0
                   Titus = True              neg : pos    =     34.1 : 1.0
              IMPOSSIBLE = True              neg : pos    =     34.1 : 1.0
                     WTF = True              neg : pos    =     34.1 : 1.0
                   WORST = True              neg : pos    =     31.6 : 1.0
                   Regis = True              neg : pos    =     27.9 : 1.0
                tag-team = True              neg : pos    =     27.9 : 1.0
             uninstalled = True              neg : pos    =     27.9 : 1.0


In [4]:
testPos = "Dogs are awesome!"
testNeg = "This game is trash"
testDoubleNeg = "not bad"

print(classifier.classify(format_sentence(testPos)))
print(classifier.classify(format_sentence(testNeg)))
print(classifier.classify(format_sentence(testDoubleNeg)))



pos
neg
pos


In [13]:
import collections
from nltk.metrics.scores import (precision, recall, f_measure)

ref_sets = collections.defaultdict(set)
test_sets = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set):
    ref_sets[label].add(i)
    observed = classifier.classify(feats)
    test_sets[observed].add(i)
    
print('Positive precision: ' + str(precision(ref_sets['pos'], test_sets['pos'])))
print('Positive recall: ' + str(recall(ref_sets['pos'], test_sets['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets['pos'], test_sets['pos'])))

print('Negative precision: ' + str(precision(ref_sets['neg'], test_sets['neg'])))
print('Negative recall: ' + str(recall(ref_sets['neg'], test_sets['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets['neg'], test_sets['neg'])))

Positive precision: 0.984304932735426
Positive recall: 0.5379901960784313
Positive F-Measure: 0.6957210776545166
Negative precision: 0.17685589519650655
Negative recall: 0.9204545454545454
Negative F-Measure: 0.2967032967032967


In [14]:
# Training on a medium-sized set
        
training_set_size2 = .6
pos_training_cutoff2 = int((training_set_size2)*len(pos))
neg_training_cutoff2 = int((training_set_size2)*len(neg))
        
training_set2 = pos[:pos_training_cutoff2] + neg[:neg_training_cutoff2]
test_set2 = pos[pos_training_cutoff2:] + neg[neg_training_cutoff2:]
print('train on %d instances, test on %d instances' % (len(training_set2), len(test_set2)))

train on 5420 instances, test on 3614 instances


In [16]:
classifier2 = NaiveBayesClassifier.train(training_set2)

In [20]:
ref_sets2 = collections.defaultdict(set)
test_sets2 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set2):
    ref_sets2[label].add(i)
    observed2 = classifier2.classify(feats)
    test_sets2[observed2].add(i)
    
print('Positive precision: ' + str(precision(ref_sets2['pos'], test_sets2['pos'])))
print('Positive recall: ' + str(recall(ref_sets2['pos'], test_sets2['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets2['pos'], test_sets2['pos'])))

print('Negative precision: ' + str(precision(ref_sets2['neg'], test_sets2['neg'])))
print('Negative recall: ' + str(recall(ref_sets2['neg'], test_sets2['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets2['neg'], test_sets2['neg'])))

Positive precision: 0.9816922315685305
Positive recall: 0.6080294207784248
Positive F-Measure: 0.7509462528387586
Negative precision: 0.19711236660389203
Negative recall: 0.8945868945868946
Negative F-Measure: 0.3230452674897119


In [19]:
# Training on a small set
        
training_set_size3 = .4
pos_training_cutoff3 = int((training_set_size3)*len(pos))
neg_training_cutoff3 = int((training_set_size3)*len(neg))
        
training_set3 = pos[:pos_training_cutoff3] + neg[:neg_training_cutoff3]
test_set3 = pos[pos_training_cutoff3:] + neg[neg_training_cutoff3:]
print('train on %d instances, test on %d instances' % (len(training_set3), len(test_set3)))

train on 3612 instances, test on 5422 instances


In [21]:
classifier3 = NaiveBayesClassifier.train(training_set3)

In [22]:
ref_sets3 = collections.defaultdict(set)
test_sets3 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set3):
    ref_sets3[label].add(i)
    observed3 = classifier3.classify(feats)
    test_sets3[observed3].add(i)
    
print('Positive precision: ' + str(precision(ref_sets3['pos'], test_sets3['pos'])))
print('Positive recall: ' + str(recall(ref_sets3['pos'], test_sets3['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets3['pos'], test_sets3['pos'])))

print('Negative precision: ' + str(precision(ref_sets3['neg'], test_sets3['neg'])))
print('Negative recall: ' + str(recall(ref_sets3['neg'], test_sets3['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets3['neg'], test_sets3['neg'])))

Positive precision: 0.9877149877149877
Positive recall: 0.41062308478038817
Positive F-Measure: 0.5800865800865801
Negative precision: 0.14821375848833776
Negative recall: 0.952561669829222
Negative F-Measure: 0.2565150740929995


In [23]:
# Training on a very large set
        
training_set_size4 = .9
pos_training_cutoff4 = int((training_set_size4)*len(pos))
neg_training_cutoff4 = int((training_set_size4)*len(neg))
        
training_set4 = pos[:pos_training_cutoff4] + neg[:neg_training_cutoff4]
test_set4 = pos[pos_training_cutoff4:] + neg[neg_training_cutoff4:]
print('train on %d instances, test on %d instances' % (len(training_set4), len(test_set4)))

train on 8130 instances, test on 904 instances


In [24]:
classifier4 = NaiveBayesClassifier.train(training_set4)

In [25]:
ref_sets4 = collections.defaultdict(set)
test_sets4 = collections.defaultdict(set)

for i, (feats, label) in enumerate(test_set4):
    ref_sets4[label].add(i)
    observed4 = classifier4.classify(feats)
    test_sets4[observed4].add(i)
    
print('Positive precision: ' + str(precision(ref_sets4['pos'], test_sets4['pos'])))
print('Positive recall: ' + str(recall(ref_sets4['pos'], test_sets4['pos'])))
print('Positive F-Measure: ' + str(f_measure(ref_sets4['pos'], test_sets4['pos'])))

print('Negative precision: ' + str(precision(ref_sets4['neg'], test_sets3['neg'])))
print('Negative recall: ' + str(recall(ref_sets4['neg'], test_sets4['neg'])))
print('Negative F-Measure: ' + str(f_measure(ref_sets4['neg'], test_sets4['neg'])))

Positive precision: 0.9879227053140096
Positive recall: 0.5012254901960784
Positive F-Measure: 0.6650406504065041
Negative precision: 0.020076764098021848
Negative recall: 0.9431818181818182
Negative F-Measure: 0.28719723183391005
