In [18]:
import nltk
from nltk.corpus import sentence_polarity
import random

In [19]:
documents = [(sent, cat) for cat in sentence_polarity.categories() 
    for sent in sentence_polarity.sents(categories=cat)]

random.shuffle(documents)

In [20]:
all_words_list = [word for (sent,cat) in documents for word in sent]
all_words = nltk.FreqDist(all_words_list)
word_items = all_words.most_common(1500)
word_features = [word for (word, freq) in word_items]

In [21]:
def document_features(document, word_features):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    return features

In [22]:
featuresets = [(document_features(d,word_features), c) for (d,c) in documents]
len(featuresets)

10662

In [23]:
train_set, test_set = featuresets[1000:], featuresets[:1000]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.749


In [24]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [25]:
print(all_words_list[:50]
finder = BigramCollocationFinder.from_words(all_words_list)

['beautiful', ',', 'cold', ',', 'oddly', 'colorful', 'and', 'just', 'plain', 'otherworldly', ',', 'a', 'freaky', 'bit', 'of', 'art', "that's", 'there', 'to', 'scare', 'while', 'we', 'delight', 'in', 'the', 'images', '.', 'what', 'enlivens', 'this', 'film', ',', 'beyond', 'the', 'astute', 'direction', 'of', 'cardoso', 'and', 'beautifully', 'detailed', 'performances', 'by', 'all', 'of', 'the', 'actors', ',', 'is', 'a']


In [58]:
bigram_features = finder.nbest(bigram_measures.chi_sq, 500)
print(bigram_features[:50])
print(bigram_features[1:][:1])

[("''independent", "film''"), ("'60s-homage", 'pokepie'), ("'[the", 'cockettes]'), ("'ace", "ventura'"), ("'alternate", "reality'"), ("'aunque", 'recurre'), ("'black", "culture'"), ("'blue", "crush'"), ("'chan", "moment'"), ("'chick", "flicks'"), ("'date", "movie'"), ("'ethnic", 'cleansing'), ("'face", "value'"), ("'fully", "experienced'"), ("'jason", "x'"), ("'juvenile", "delinquent'"), ("'laugh", "therapy'"), ("'masterpiece", "theatre'"), ("'nicholas", "nickleby'"), ("'old", "neighborhood'"), ("'opening", "up'"), ("'rare", "birds'"), ("'sacre", 'bleu'), ("'science", "fiction'"), ("'shindler's", "list'"), ("'snow", "dogs'"), ("'some", "body'"), ("'special", "effects'"), ("'terrible", "filmmaking'"), ("'time", "waster'"), ("'true", "story'"), ("'unfaithful'", 'cheats'), ("'very", "sneaky'"), ("'we're", '-doing-it-for'), ("'who's", "who'"), ('-after', 'spangle'), ('-as-it-', 'thinks-it-is'), ('-as-nasty', '-as-it-'), ('-doing-it-for', "-the-cash'"), ('10-course', 'banquet'), ('10-year',

In [27]:
sent = ['Arthur','carefully','rode','the','brown','horse','around','the','castle']
sentbigrams = list(nltk.bigrams(sent))
print(sentbigrams)

[('Arthur', 'carefully'), ('carefully', 'rode'), ('rode', 'the'), ('the', 'brown'), ('brown', 'horse'), ('horse', 'around'), ('around', 'the'), ('the', 'castle')]


In [28]:
bigram = ('brown','horse')
print(bigram in sentbigrams)
print('B_{}_{}'.format(bigram[0], bigram[1]))

True
B_brown_horse


In [29]:
def bigram_document_features(document, word_features, bigram_features):
    document_words = set(document)
    document_bigrams = nltk.bigrams(document)
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    for bigram in bigram_features:
        features['B_{}_{}'.format(bigram[0], bigram[1])] = (bigram in document_bigrams)    
    return features

In [30]:
bigram_featuresets = [(bigram_document_features(d, word_features, bigram_features), c) for (d, c) in documents]
print(len(bigram_featuresets[0][0].keys()))
print(bigram_featuresets[0][0])

2000
{'V_.': True, 'V_the': True, 'V_,': True, 'V_a': True, 'V_and': True, 'V_of': True, 'V_to': True, 'V_is': False, 'V_in': True, 'V_that': False, 'V_it': False, 'V_as': False, 'V_but': False, 'V_with': False, 'V_film': False, 'V_this': False, 'V_for': False, 'V_its': False, 'V_an': False, 'V_movie': False, "V_it's": False, 'V_be': False, 'V_on': False, 'V_you': False, 'V_not': False, 'V_by': False, 'V_about': False, 'V_one': False, 'V_more': False, 'V_like': False, 'V_has': False, 'V_are': False, 'V_at': False, 'V_from': False, 'V_than': False, 'V_"': False, 'V_all': False, 'V_--': False, 'V_his': False, 'V_have': False, 'V_so': False, 'V_if': False, 'V_or': False, 'V_story': False, 'V_i': False, 'V_too': False, 'V_just': True, 'V_who': False, 'V_into': False, 'V_what': False, 'V_most': False, 'V_out': False, 'V_no': False, 'V_much': False, 'V_even': False, 'V_good': False, 'V_up': False, 'V_will': False, 'V_comedy': False, 'V_time': False, 'V_can': False, 'V_some': False, 'V_charac

In [31]:
train_set2, test_set2 = bigram_featuresets[1000:], bigram_featuresets[:1000]
classifier2 = nltk.NaiveBayesClassifier.train(train_set2)
nltk.classify.accuracy(classifier2, test_set2)

0.749

In [40]:
print(sent)
print(nltk.pos_tag(sent))

['Arthur', 'carefully', 'rode', 'the', 'brown', 'horse', 'around', 'the', 'castle']
[('Arthur', 'NNP'), ('carefully', 'RB'), ('rode', 'VBD'), ('the', 'DT'), ('brown', 'JJ'), ('horse', 'NN'), ('around', 'IN'), ('the', 'DT'), ('castle', 'NN')]


In [41]:
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [42]:
POS_featuresets = [(POS_features(d, word_features), c) for (d, c) in documents]
print(len(POS_featuresets[0][0].keys()))

1504


In [43]:
print(documents[0])
print('num nouns', POS_featuresets[0][0]['nouns'])
print('num verbs', POS_featuresets[0][0]['verbs'])
print('num adjectives', POS_featuresets[0][0]['adjectives'])
print('num adverbs', POS_featuresets[0][0]['adverbs'])

(['beautiful', ',', 'cold', ',', 'oddly', 'colorful', 'and', 'just', 'plain', 'otherworldly', ',', 'a', 'freaky', 'bit', 'of', 'art', "that's", 'there', 'to', 'scare', 'while', 'we', 'delight', 'in', 'the', 'images', '.'], 'pos')
num nouns 6
num verbs 3
num adjectives 2
num adverbs 4


In [44]:
train_set3, test_set3 = POS_featuresets[1000:], POS_featuresets[:1000]
classifier3 = nltk.NaiveBayesClassifier.train(train_set3)
nltk.classify.accuracy(classifier3, test_set3)

0.747

In [45]:
def cross_validation_accuracy(num_folds, featuresets):
    subset_size = int(len(featuresets)/num_folds)
    print('Each fold size:', subset_size)
    accuracy_list = []
    # iterate over the folds
    for i in range(num_folds):
        test_this_round = featuresets[(i*subset_size):][:subset_size]
        train_this_round = featuresets[:(i*subset_size)] + featuresets[((i+1)*subset_size):]
        # train using train_this_round
        classifier4 = nltk.NaiveBayesClassifier.train(train_this_round)
        # evaluate against test_this_round and save accuracy
        accuracy_this_round = nltk.classify.accuracy(classifier4, test_this_round)
        print (i, accuracy_this_round)
        accuracy_list.append(accuracy_this_round)
    # find mean accuracy over all rounds
    print ('mean accuracy', sum(accuracy_list) / num_folds)

In [47]:
cross_validation_accuracy(10, featuresets)

Each fold size: 1066
0 0.7467166979362101
1 0.7420262664165104
2 0.7317073170731707
3 0.7091932457786116
4 0.7485928705440901
5 0.7410881801125704
6 0.7439024390243902
7 0.7307692307692307
8 0.7476547842401501
9 0.7223264540337712
mean accuracy 0.7363977485928705


In [48]:
cross_validation_accuracy(5, featuresets)

Each fold size: 2132
0 0.7476547842401501
1 0.7166979362101313
2 0.7382739212007504
3 0.7363977485928705
4 0.7415572232645403
mean accuracy 0.7361163227016886


In [49]:
goldlist = []
predictedlist = []
for (features, label) in test_set:
    goldlist.append(label)
    predictedlist.append(classifier.classify(features))

In [50]:
print(goldlist[:30])
print(predictedlist[:30])

['pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'pos']
['pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg']


In [51]:
cm = nltk.ConfusionMatrix(goldlist, predictedlist)
print(cm.pretty_format(sort_by_count=True, truncate=9))

    |   n   p |
    |   e   o |
    |   g   s |
----+---------+
neg |<377>131 |
pos | 120<372>|
----+---------+
(row = reference; col = test)



In [52]:
def eval_measures(gold, predicted):
    # get a list of labels
    labels = list(set(gold))
    # these lists have values for each label 
    recall_list = []
    precision_list = []
    F1_list = []
    for lab in labels:
        # for each label, compare gold and predicted lists and compute values
        TP = FP = FN = TN = 0
        for i, val in enumerate(gold):
            if val == lab and predicted[i] == lab:  TP += 1
            if val == lab and predicted[i] != lab:  FN += 1
            if val != lab and predicted[i] == lab:  FP += 1
            if val != lab and predicted[i] != lab:  TN += 1
        # use these to compute recall, precision, F1
        recall = TP / (TP + FP)
        precision = TP / (TP + FN)
        recall_list.append(recall)
        precision_list.append(precision)
        F1_list.append( 2 * (recall * precision) / (recall + precision))

    # the evaluation measures in a table with one row per label
    print('\tPrecision\tRecall\t\tF1')
    # print measures for each label
    for i, lab in enumerate(labels):
        print(lab, '\t', "{:10.3f}".format(precision_list[i]), \
          "{:10.3f}".format(recall_list[i]), "{:10.3f}".format(F1_list[i]))

In [53]:
eval_measures(goldlist, predictedlist)

	Precision	Recall		F1
pos 	      0.756      0.740      0.748
neg 	      0.742      0.759      0.750


In [54]:
cross_validation_accuracy(10, bigram_featuresets)

Each fold size: 1066
0 0.7467166979362101
1 0.7420262664165104
2 0.7307692307692307
3 0.7091932457786116
4 0.7485928705440901
5 0.7410881801125704
6 0.7439024390243902
7 0.7307692307692307
8 0.7476547842401501
9 0.7223264540337712
mean accuracy 0.7363039399624766


In [55]:
cross_validation_accuracy(10, POS_featuresets)

Each fold size: 1066
0 0.7476547842401501
1 0.7410881801125704
2 0.7204502814258912
3 0.7082551594746717
4 0.7532833020637899
5 0.7439024390243902
6 0.7514071294559099
7 0.726078799249531
8 0.7570356472795498
9 0.7288930581613509
mean accuracy 0.7378048780487806


In [59]:
random.shuffle(bigram_featuresets)
random.shuffle(POS_featuresets)

In [60]:
cross_validation_accuracy(10, bigram_featuresets)

Each fold size: 1066
0 0.7213883677298312
1 0.7607879924953096
2 0.7298311444652908
3 0.7439024390243902
4 0.7298311444652908
5 0.726078799249531
6 0.7429643527204502
7 0.7485928705440901
8 0.7467166979362101
9 0.7420262664165104
mean accuracy 0.7392120075046904


In [61]:
cross_validation_accuracy(10, POS_featuresets)

Each fold size: 1066
0 0.7354596622889306
1 0.7138836772983115
2 0.7288930581613509
3 0.7148217636022514
4 0.7429643527204502
5 0.7401500938086304
6 0.7279549718574109
7 0.7439024390243902
8 0.7345215759849906
9 0.7617260787992496
mean accuracy 0.7344277673545967
