In this assignment, you'll see an example in which some text does not tag well, most likely because the training data did not have many examples of the target sentence structure.  You'll see the effects of adding a few sentences of training data with the missing sentence structure on the accuracy of the tagger.

In [4]:
import nltk, re
from nltk.corpus import brown
from nltk import word_tokenize

First, create datasets and train an ngram backoff tagger as before, using the brown corpus as the training set.

In [5]:
def create_data_sets(sentences, train_fraction=0.9):
    """Split a sequence of sentences into training and test portions.

    The split is positional, not random: the leading ``train_fraction`` of
    ``sentences`` becomes the training set and the remainder the test set.

    Args:
        sentences: A sliceable sequence that supports ``len()``.
        train_fraction: Share of ``sentences`` placed in the training set.
            Defaults to 0.9, the ratio that used to be hard-coded here.

    Returns:
        A ``(train_sents, test_sents)`` tuple of slices of ``sentences``.

    Raises:
        ValueError: If ``train_fraction`` is outside the range [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")
    # Truncate (not round) so the training slice never overshoots the ratio.
    size = int(len(sentences) * train_fraction)
    return sentences[:size], sentences[size:]

def build_backoff_tagger (train_sents):
    """Train a bigram tagger that backs off to a unigram, then a default, tagger.

    Args:
        train_sents: Tagged sentences passed to both n-gram taggers.

    Returns:
        The ``nltk.BigramTagger`` at the head of the backoff chain.
    """
    # Last resort in the chain: label every token 'NN'.
    tagger = nltk.DefaultTagger('NN')
    # Stack the n-gram taggers so each one defers to the tagger built before it.
    for tagger_class in (nltk.UnigramTagger, nltk.BigramTagger):
        tagger = tagger_class(train_sents, backoff=tagger)
    return tagger

# Brown categories whose tagged sentences make up the corpus.
# NOTE(review): 'news' is not in this list -- confirm the omission is intentional.
brown_categories = [
    'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
    'hobbies', 'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews',
    'romance', 'science_fiction',
]
brown_tagged_sents = brown.tagged_sents(categories=brown_categories)

# Hold out part of the corpus so the tagger is scored on sentences it was not trained on.
train_sents, test_sents = create_data_sets(brown_tagged_sents)

ngram_tagger = build_backoff_tagger(train_sents)
baseline_accuracy = ngram_tagger.evaluate(test_sents)
print ("%0.3f" % baseline_accuracy)



0.912


Next, read in a file of recipes and tokenize it.

In [8]:
# Read the whole recipe file into a single string (text mode is open()'s default).
with open('cookbooks.txt') as text_file:
    cookbooks_corpus = text_file.read()

def tokenize_text(corpus):
    """Split raw text into sentences and each sentence into word tokens.

    Args:
        corpus: The text to tokenize, as a single string.

    Returns:
        A list with one entry per sentence; each entry is the list of word
        tokens that ``nltk.word_tokenize`` produces for that sentence.
    """
    # sent_tokenize applies NLTK's English Punkt sentence model and lets NLTK
    # locate the model itself, instead of hard-coding the resource path
    # 'tokenizers/punkt/english.pickle', which ties the code to one data layout.
    raw_sents = nltk.sent_tokenize(corpus, language='english')

    return [nltk.word_tokenize(raw_sent) for raw_sent in raw_sents]

# Tokenized recipe text: one list of word tokens per sentence.
cookbook_sents = tokenize_text(cookbooks_corpus)


Now, in order to see the sentences where errors are occurring, the code below finds sentences that begin with imperatives and prints them out, along with their assigned parts of speech.

In [9]:
# Sentence-initial words that mark the recipe sentences to inspect.
imperative_starts = ("Wash", "Stir", "Moisten", "Drain", "Cook", "Pour", "Chop", "Slice", "Season", "Mix", "Fry", "Bake", "Roast", "Wisk")

# Print every (token, tag) pair of each matching sentence, one pair per line,
# followed by a blank line between sentences.
for sent in cookbook_sents:
    if sent[0] not in imperative_starts:
        continue
    for tagged_token in ngram_tagger.tag(sent):
        print(tagged_token)
    print()

('Wash', 'NN')
('a', 'AT')
('quarter', 'NN')
('of', 'IN')
('a', 'AT')
('pound', 'NN')
('of', 'IN')
('best', 'JJT')
('pearl', 'NN')
('sago', 'NN')
('thoroughly', 'RB')
(',', ',')
('then', 'RB')
('stew', 'NN')
('it', 'PPS')
('quite', 'QL')
('tender', 'JJ')
('and', 'CC')
('very', 'QL')
('View', 'NN')
('page', 'NN')
('[', ',')
('32', 'CD')
(']', ',')
('thick', 'JJ')
('in', 'IN')
('water', 'NN')
('or', 'CC')
('thick', 'JJ')
('broth', 'NN')
(';', '.')
('(', '(')
('it', 'PPS')
('will', 'MD')
('require', 'VB')
('nearly', 'QL')
('or', 'CC')
('quite', 'QL')
('a', 'AT')
('quart', 'NN')
('of', 'IN')
('liquid', 'NN')
(',', ',')
('which', 'WDT')
('should', 'MD')
('be', 'BE')
('poured', 'VBN')
('to', 'TO')
('it', 'PPO')
('cold', 'JJ')
('and', 'CC')
('heated', 'VBN')
('slowly', 'RB')
(';', '.')
(')', ')')
('then', 'RB')
('mix', 'VB')
('gradually', 'RB')
('with', 'IN')
('it', 'PPO')
('a', 'AT')
('pint', 'NN')
('of', 'IN')
('good', 'JJ')
('boiling', 'VBG')
('cream', 'NN')
('or', 'CC')
('milk', 'NN')
(',

Notice that most of the initial words are incorrectly tagged as nouns rather than verbs.  How can we fix this?  One way is to label a few rather generic sentences with the structure we are interested in, add them to the start of the training data, and then retrain the tagger.

In [10]:
# Hand-tagged imperative sentences (Brown tagset) that give the tagger examples
# of a sentence-initial base-form verb ('VB').
# The pronouns here are direct objects of the verb, so they carry the
# accusative tag 'PPO' rather than the nominative 'PPS'.
cooking_action_sents = [[('Strain', 'VB'), ('it', 'PPO'), ('well', 'RB'), ('.', '.')],
                        [('Mix', 'VB'), ('them', 'PPO'), ('well', 'RB'), ('.', '.')],
                        [('Season', 'VB'), ('them', 'PPO'), ('with', 'IN'), ('pepper', 'NN'), ('.', '.')],
                        [('Wash', 'VB'), ('it', 'PPO'), ('well', 'RB'), ('.', '.')],
                        [('Chop', 'VB'), ('the', 'AT'), ('greens', 'NNS'), ('.', '.')],
                        [('Slice', 'VB'), ('it', 'PPO'), ('well', 'RB'), ('.', '.')],
                        [('Bake', 'VB'), ('the', 'AT'), ('cake', 'NN'), ('.', '.')],
                        [('Pour', 'VB'), ('into', 'IN'), ('a', 'AT'), ('mold', 'NN'), ('.', '.')],
                        [('Stir', 'VB'), ('the', 'AT'), ('mixture', 'NN'), ('.', '.')],
                        [('Moisten', 'VB'), ('the', 'AT'), ('grains', 'NNS'), ('.', '.')],
                        [('Cook', 'VB'), ('the', 'AT'), ('duck', 'NN'), ('.', '.')],
                        [('Drain', 'VB'), ('for', 'IN'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]]


# Put the hand-tagged imperatives first so they land in the leading
# (training) part of the positional split rather than in the test part.
all_tagged_sents = cooking_action_sents + brown_tagged_sents
train_sents, test_sents = create_data_sets(all_tagged_sents)

# Retrain on the augmented data and report accuracy on the new held-out part.
ngram_tagger_all_sents = build_backoff_tagger(train_sents)
augmented_accuracy = ngram_tagger_all_sents.evaluate(test_sents)
print ("%0.3f" % augmented_accuracy)


0.912


In [30]:
# Sentence-initial words that mark the recipe sentences to inspect.
imperative_starts = ("Wash", "Stir", "Moisten", "Drain", "Cook", "Pour", "Chop", "Slice", "Season", "Mix", "Fry", "Bake", "Roast", "Wisk")

# Same listing as for the first tagger, this time using the tagger that was
# retrained with the extra imperative sentences.
for sent in cookbook_sents:
    if sent[0] not in imperative_starts:
        continue
    for tagged_token in ngram_tagger_all_sents.tag(sent):
        print(tagged_token)
    print()

('Wash', 'VB')
('a', 'AT')
('quarter', 'NN')
('of', 'IN')
('a', 'AT')
('pound', 'NN')
('of', 'IN')
('best', 'JJT')
('pearl', 'NN')
('sago', 'NN')
('thoroughly', 'RB')
(',', ',')
('then', 'RB')
('stew', 'NN')
('it', 'PPS')
('quite', 'QL')
('tender', 'JJ')
('and', 'CC')
('very', 'QL')
('View', 'NN')
('page', 'NN')
('[', ',')
('32', 'CD')
(']', ',')
('thick', 'JJ')
('in', 'IN')
('water', 'NN')
('or', 'CC')
('thick', 'JJ')
('broth', 'NN')
(';', '.')
('(', '(')
('it', 'PPS')
('will', 'MD')
('require', 'VB')
('nearly', 'QL')
('or', 'CC')
('quite', 'QL')
('a', 'AT')
('quart', 'NN')
('of', 'IN')
('liquid', 'NN')
(',', ',')
('which', 'WDT')
('should', 'MD')
('be', 'BE')
('poured', 'VBN')
('to', 'TO')
('it', 'PPO')
('cold', 'JJ')
('and', 'CC')
('heated', 'VBN')
('slowly', 'RB')
(';', '.')
(')', ')')
('then', 'RB')
('mix', 'VB')
('gradually', 'RB')
('with', 'IN')
('it', 'PPO')
('a', 'AT')
('pint', 'NN')
('of', 'IN')
('good', 'JJ')
('boiling', 'VBG')
('cream', 'NN')
('or', 'CC')
('milk', 'NN')
(',

How well is this working? 

### This American Life Tagger

In [None]:
import re

In [31]:
# Read the whole text file into one string.
with open('../tal_stories/tal_text.txt') as tal_text:
    tal = tal_text.read()

# remove dialog tags: drop every '<...>' span before tokenizing
dialog_tag_pattern = re.compile("(<[^>]+>)")
tal = dialog_tag_pattern.sub('', tal)
tal_sents = tokenize_text(tal)

# NOTE(review): this prints the Penn Treebank tag descriptions, while the
# taggers in this file are trained on Brown-tagged sentences and emit Brown
# tags -- nltk.help.brown_tagset() may be the intended reference; confirm.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [32]:
# Tag a 40-sentence sample with the first (Brown-only) tagger and print each
# (token, tag) pair on its own line, with a blank line after every sentence.
sample_sents = tal_sents[1000:1040]
for sent in sample_sents:
    for tagged_token in ngram_tagger.tag(sent):
        print(tagged_token)
    print()

('We', 'PPSS')
('waited', 'VBD')
('.', '.')

('The', 'AT')
('concert', 'NN')
('was', 'BEDZ')
('going', 'VBG')
('to', 'TO')
('start', 'VB')
('soon', 'RB')
(',', ',')
('and', 'CC')
('I', 'PPSS')
('did', 'DOD')
("n't", 'NN')
('have', 'HV')
('any', 'DTI')
('proof', 'NN')
('that', 'CS')
('Julian', 'NP')
('was', 'BEDZ')
('going', 'VBG')
('to', 'TO')
('be', 'BE')
('there', 'RB')
('except', 'IN')
('a', 'AT')
('Facebook', 'NN')
('post', 'NN')
('from', 'IN')
('a', 'AT')
('few', 'AP')
('days', 'NNS')
('before', 'IN')
('.', '.')

('And', 'CC')
('he', 'PPS')
('had', 'HVD')
('been', 'BEN')
('out', 'RP')
('since', 'IN')
('3:31', 'NN')
('AM', 'NN')
('last', 'AP')
('night', 'NN')
(',', ',')
('so', 'CS')
('I', 'PPSS')
('thought', 'VBD')
('maybe', 'RB')
('he', 'PPS')
('was', 'BEDZ')
('tired', 'VBN')
('?', '.')

('Then', 'RB')
(',', ',')
('he', 'PPS')
('walked', 'VBD')
('down', 'RP')
('the', 'AT')
('stairs', 'NNS')
('.', '.')

('I', 'PPSS')
('recognized', 'VBD')
('him', 'PPO')
('immediately', 'RB')
('.', 