In this assignment, you'll see an example in which some text does not tag well, most likely because the training data did not have many examples of the target sentence structure.  You'll see the effects of adding a few sentences of training data with the missing sentence structure on the accuracy of the tagger.

In [4]:
import nltk, re
from nltk.corpus import brown
from nltk import word_tokenize

First, create datasets and train an ngram backoff tagger as before, using the brown corpus as the training set.

In [5]:
def create_data_sets(sentences):
    size = int(len(sentences) * 0.9)
    train_sents = sentences[:size]
    test_sents = sentences[size:]
    return train_sents, test_sents

def build_backoff_tagger (train_sents):
    t0 = nltk.DefaultTagger('NN')
    t1 = nltk.UnigramTagger(train_sents, backoff=t0)
    t2 = nltk.BigramTagger(train_sents, backoff=t1)
    return t2

brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies',
'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews', 'romance',
'science_fiction'])

train_sents, test_sents = create_data_sets(brown_tagged_sents)

ngram_tagger = build_backoff_tagger(train_sents)
print ("%0.3f" % ngram_tagger.evaluate(test_sents))



0.911


Next, read in a file of recipes and tokenize it.

In [6]:
with open('cookbooks.txt', 'r') as text_file:
    cookbooks_corpus = text_file.read()

def tokenize_text(corpus):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sents = sent_tokenizer.tokenize(corpus) # Split text into sentences
    
    return [nltk.word_tokenize(word) for word in raw_sents]

cookbook_sents = tokenize_text(cookbooks_corpus)


Now, in order to see the sentences where errors are occuring, the code below finds sentences that begin with imperatives and prints them out, along with their assigned parts of speech.

In [4]:
for sent in cookbook_sents:
    if sent[0] in ["Wash", "Stir", "Moisten", "Drain", "Cook", "Pour", "Chop", "Slice", "Season", "Mix", "Fry", "Bake", "Roast", "Wisk"]:
        for item in ngram_tagger.tag(sent):
            print(item) 
        print()

('Wash', 'NN')
('a', 'AT')
('quarter', 'NN')
('of', 'IN')
('a', 'AT')
('pound', 'NN')
('of', 'IN')
('best', 'JJT')
('pearl', 'NN')
('sago', 'NN')
('thoroughly', 'RB')
(',', ',')
('then', 'RB')
('stew', 'NN')
('it', 'PPS')
('quite', 'QL')
('tender', 'JJ')
('and', 'CC')
('very', 'QL')
('View', 'NN')
('page', 'NN')
('[', '(')
('32', 'CD')
(']', ',')
('thick', 'JJ')
('in', 'IN')
('water', 'NN')
('or', 'CC')
('thick', 'JJ')
('broth', 'NN')
(';', '.')
('(', '(')
('it', 'PPS')
('will', 'MD')
('require', 'VB')
('nearly', 'QL')
('or', 'CC')
('quite', 'QL')
('a', 'AT')
('quart', 'NN')
('of', 'IN')
('liquid', 'JJ')
(',', ',')
('which', 'WDT')
('should', 'MD')
('be', 'BE')
('poured', 'VBN')
('to', 'TO')
('it', 'PPO')
('cold', 'JJ')
('and', 'CC')
('heated', 'VBN')
('slowly', 'RB')
(';', '.')
(')', ')')
('then', 'RB')
('mix', 'VB')
('gradually', 'RB')
('with', 'IN')
('it', 'PPO')
('a', 'AT')
('pint', 'NN')
('of', 'IN')
('good', 'JJ')
('boiling', 'VBG')
('cream', 'NN')
('or', 'CC')
('milk', 'NN')
(',

Notice that most of the initial words are incorrectly tagged as nouns rather than verbs.  How can we fix this?  One way is to label a few rather generic sentences with the structure we are interested in, add them to the start of the training data, and then retrain the tagger.

In [5]:
cooking_action_sents = [[('Strain', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Mix', 'VB'), ('them', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Season', 'VB'), ('them', 'PPS'), ('with', 'IN'), ('pepper', 'NN'), ('.', '.')], 
                        [('Wash', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Chop', 'VB'), ('the', 'AT'), ('greens', 'NNS'), ('.', '.')],
                        [('Slice', 'VB'), ('it', 'PPS'), ('well', 'RB'), ('.', '.')],
                        [('Bake', 'VB'), ('the', 'AT'), ('cake', 'NN'), ('.', '.')],
                        [('Pour', 'VB'), ('into', 'IN'), ('a', 'AT'), ('mold', 'NN'), ('.', '.')],
                        [('Stir', 'VB'), ('the', 'AT'), ('mixture', 'NN'), ('.', '.')],
                        [('Moisten', 'VB'), ('the', 'AT'), ('grains', 'NNS'), ('.', '.')],
                        [('Cook', 'VB'), ('the', 'AT'), ('duck', 'NN'), ('.', '.')],
                        [('Drain', 'VB'), ('for', 'IN'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]]


all_tagged_sents = cooking_action_sents + brown_tagged_sents
train_sents, test_sents = create_data_sets(all_tagged_sents)

ngram_tagger_all_sents = build_backoff_tagger(train_sents)
print ("%0.3f" % ngram_tagger_all_sents.evaluate(test_sents))


0.911


In [33]:
for sent in cookbook_sents:
     if sent[0] in ["Wash", "Stir", "Moisten", "Drain", "Cook", "Pour", "Chop", "Slice", "Season", "Mix", "Fry", "Bake", "Roast", "Wisk"]:
            for item in ngram_tagger_all_sents.tag(sent):
                print(item)
            print()

NameError: name 'ngram_tagger_all_sents' is not defined

How well is this working? 

### This American Life Tagger

In [7]:
import re

In [8]:
with open('../tal_stories/tal_text.txt') as tal_text:
    tal = tal_text.read()

# remove dialog tags
tal = re.sub("(<[^>]+>)", '', tal)
tal_sents = tokenize_text(tal)

nltk.help.brown_tagset()

(: opening parenthesis
    (
): closing parenthesis
    )
*: negator
    not n't
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ? ; ! :
:: colon
    :
ABL: determiner/pronoun, pre-qualifier
    quite such rather
ABN: determiner/pronoun, pre-quantifier
    all half many nary
ABX: determiner/pronoun, double conjunction or pre-quantifier
    both
AP: determiner/pronoun, post-determiner
    many other next more last former little several enough most least only
    very few fewer past same Last latter less single plenty 'nough lesser
    certain various manye next-to-last particular final previous present
    nuf
AP$: determiner/pronoun, post-determiner, genitive
    other's
AP+AP: determiner/pronoun, post-determiner, hyphenated pair
    many-much
AT: article
    the an no a every th' ever' ye
BE: verb 'to be', infinitive or imperative
    be
BED: verb 'to be', past tense, 2nd person singular or all persons plural
    were
BED*: verb 'to be', past tense, 2nd person singular or 

### Lots of Mistakes on Proper Nouns

In [35]:
mistakes = []
for sent in tal_sents:
    for item in sent:
        if item in ["Zoe", "Chace", "Planet", "Money", "Podcast", "Corolla", "Myhrvold"]: mistakes.append(ngram_tagger.tag(sent))

mistakes

[[('So', 'RB'),
  ('I', 'PPSS'),
  ('had', 'HVD'),
  ('a', 'AT'),
  ('special', 'JJ'),
  ('interest', 'NN'),
  ('in', 'IN'),
  ('some', 'DTI'),
  ('interviews', 'NNS'),
  ('that', 'WPS'),
  ('one', 'CD'),
  ('of', 'IN'),
  ('my', 'PP$'),
  ('colleagues', 'NNS'),
  (',', ','),
  ('Zoe', 'NN'),
  ('Chace', 'NN'),
  (',', ','),
  ('did', 'DOD'),
  ('recently', 'RB'),
  ('.', '.')],
 [('So', 'RB'),
  ('I', 'PPSS'),
  ('had', 'HVD'),
  ('a', 'AT'),
  ('special', 'JJ'),
  ('interest', 'NN'),
  ('in', 'IN'),
  ('some', 'DTI'),
  ('interviews', 'NNS'),
  ('that', 'WPS'),
  ('one', 'CD'),
  ('of', 'IN'),
  ('my', 'PP$'),
  ('colleagues', 'NNS'),
  (',', ','),
  ('Zoe', 'NN'),
  ('Chace', 'NN'),
  (',', ','),
  ('did', 'DOD'),
  ('recently', 'RB'),
  ('.', '.')],
 [('She', 'PPS'),
  ("'s", 'NN'),
  ('one', 'CD'),
  ('of', 'IN'),
  ('the', 'AT'),
  ('reporters', 'NNS'),
  ('for', 'IN'),
  ('Planet', 'NN'),
  ('Money', 'NN'),
  (',', ','),
  ('which', 'WDT'),
  ('itself', 'PPL'),
  ('is', 'BEZ'),


In [45]:
training_fix = [[('So', 'RB'),  ('I', 'PPSS'),  ('had', 'HVD'),  ('a', 'AT'),  ('special', 'JJ'),  ('interest', 'NN')
                 ,  ('in', 'IN'),  ('some', 'DTI'),  ('interviews', 'NNS'),  ('that', 'WPS'),  ('one', 'CD')
                 ,  ('of', 'IN'),  ('my', 'PP$'),  ('colleagues', 'NNS'),  (',', ','),  ('Zoe', 'NP'),  ('Chace', 'NP')
                 ,  (',', ','),  ('did', 'DOD'),  ('recently', 'RB'),  ('.', '.')],
                
                [('Alex', 'NP'),  ('is', 'BEZ'),  ('part', 'NN'),  ('of', 'IN'),  ('our', 'PP$'),  ('Planet', 'NP'),
                   ('Money', 'NP'),  ('team', 'NN'),  ('.', '.')],
                [('Zoe', 'NP'), ('was', 'BEDZ'), ('interviewing', 'VBG'), ('these', 'DTS'), ('two', 'CD')
                 , ('guys', 'NNS'), (',', ','), ('Jim', 'NP'), ('Logan', 'NP'), ('and', 'CC'), ('Richard', 'NP')
                 , ('Baker', 'NP'), (',', ','), ('who', 'WPS'), ('say', 'VB'), ('that', 'CS'), ('their', 'PP$')
                 , ('company', 'NN'), (',', ','), ('Personal', 'NP'), ('Audio', 'NP'), (',', ','), ('holds', 'VBZ')
                 , ('the', 'AT'), ('patent', 'NN'), ('on', 'IN'), ('podcasts', 'NN'), ('.', '.')],
                [('This', 'DT'), ('is', 'BEZ'), ('Myhrvold', 'NP'), ('on', 'IN'), ('the', 'AT'), ('television', 'NN')
                 , ('show', 'NN'), ('The', 'AT-TL'), ('Colbert', 'NN'), ('Report', 'NN'), (',', ','), ('talking', 'VBG')
                 , ('about', 'IN'), ('another', 'DT'), ('one', 'CD'), ('of', 'IN'), ('his', 'PP$'), ('ventures', 'NNS')
                 , ('--', '--'), ('an', 'AT'), ('opus', 'NN'), ('on', 'IN'), ('the', 'AT'), ('science', 'NN'), ('of', 'IN')
                 , ('cooking', 'VBG'), (',', ','), ('which', 'WDT'), ('teaches', 'VBZ'), ('you', 'PPO'), ('how', 'WRB')
                 , ('to', 'TO'), ('do', 'DO'), ('things', 'NNS'), ('like', 'CS'), ('make', 'VB'), ('ice', 'NN'), ('cream', 'NN')
                 , ('with', 'IN'), ('liquid', 'NN'), ('nitrogen', 'NN'), ('.', '.')],
                
] * 100

brown_tagged_sents = brown.tagged_sents(categories=['adventure', 'editorial', 'fiction', 'romance','science_fiction'])

all_tagged_sents = training_fix + brown_tagged_sents
train_sents, test_sents = create_data_sets(all_tagged_sents)
ngram_tagger_all_sents = build_backoff_tagger(train_sents)

for sent in tal_sents:
    for item in sent:
        if item in ["Zoe", "Chace", "Planet", "Money", "Podcast", "Corolla", "Myhrvold"]: print(ngram_tagger_all_sents.tag(sent))

[('So', 'RB'), ('I', 'PPSS'), ('had', 'HVD'), ('a', 'AT'), ('special', 'JJ'), ('interest', 'NN'), ('in', 'IN'), ('some', 'DTI'), ('interviews', 'NNS'), ('that', 'WPS'), ('one', 'CD'), ('of', 'IN'), ('my', 'PP$'), ('colleagues', 'NNS'), (',', ','), ('Zoe', 'NP'), ('Chace', 'NP'), (',', ','), ('did', 'DOD'), ('recently', 'RB'), ('.', '.')]
[('So', 'RB'), ('I', 'PPSS'), ('had', 'HVD'), ('a', 'AT'), ('special', 'JJ'), ('interest', 'NN'), ('in', 'IN'), ('some', 'DTI'), ('interviews', 'NNS'), ('that', 'WPS'), ('one', 'CD'), ('of', 'IN'), ('my', 'PP$'), ('colleagues', 'NNS'), (',', ','), ('Zoe', 'NP'), ('Chace', 'NP'), (',', ','), ('did', 'DOD'), ('recently', 'RB'), ('.', '.')]
[('She', 'PPS'), ("'s", 'NN'), ('one', 'CD'), ('of', 'IN'), ('the', 'AT'), ('reporters', 'NNS'), ('for', 'IN'), ('Planet', 'NP'), ('Money', 'NP'), (',', ','), ('which', 'WDT'), ('itself', 'PPL'), ('is', 'BEZ'), ('a', 'AT'), ('podcast', 'NN'), ('.', '.')]
[('She', 'PPS'), ("'s", 'NN'), ('one', 'CD'), ('of', 'IN'), ('the