In [1]:
import nltk
from collections import defaultdict
import re
from nltk.corpus import *
import random

__☼ Write a tag pattern to match noun phrases containing plural head nouns, e.g. "many/JJ researchers/NNS", "two/CD weeks/NNS", "both/DT new/JJ positions/NNS". Try to do this by generalizing the tag pattern that handled singular noun phrases.__

In [60]:
nltk.app.chunkparser()

In [109]:
grammar = r"""
  NP:   {(<PRP\$|POS>)<JJS?|VBG|CD|VBN>?<NN(S|P)?>+}
        {<RBR>?<DT|PP\$|PRP\$>?<RB|JJR>?<JJ|VBG|CD|VBN>*<NN(S|P)?>+}
        {<PRP>}
"""
cp = nltk.RegexpParser(grammar)

In [5]:
sentences = [[("many", "JJ"), ("researchers", "NNS")], [("two", "CD"), ("weeks", "NNS")], [("both", "DT"), ("new", "JJ"), ("positions", "NNS")]]
for sent in sentences:
    print(cp.parse(sent))

(S (NP many/JJ researchers/NNS))
(S (NP two/CD weeks/NNS))
(S (NP both/DT new/JJ positions/NNS))


__☼ Pick one of the three chunk types in the CoNLL corpus. Inspect the CoNLL corpus and try to observe any patterns in the POS tag sequences that make up this kind of chunk. Develop a simple chunker using the regular expression chunker nltk.RegexpParser. Discuss any tag sequences that are difficult to chunk reliably.__

In [24]:
conll_train_NP = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
conll_test_NP = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

In [38]:
# let us take a look at all NP in first 30 tree
for tree in conll_train_NP[:30]:
    for num in range(len(tree) - 1):
        try:
            tree[num].label()
        except AttributeError:
            pass
        else:
            print(tree[num])

(NP Confidence/NN)
(NP the/DT pound/NN)
(NP another/DT sharp/JJ dive/NN)
(NP trade/NN figures/NNS)
(NP September/NNP)
(NP release/NN)
(NP tomorrow/NN)
(NP a/DT substantial/JJ improvement/NN)
(NP July/NNP and/CC August/NNP)
(NP 's/POS near-record/JJ deficits/NNS)
(NP the/DT Exchequer/NNP)
(NP Nigel/NNP Lawson/NNP)
(NP 's/POS restated/VBN commitment/NN)
(NP a/DT firm/NN monetary/JJ policy/NN)
(NP a/DT freefall/NN)
(NP sterling/NN)
(NP the/DT past/JJ week/NN)
(NP analysts/NNS)
(NP underlying/VBG support/NN)
(NP sterling/NN)
(NP the/DT chancellor/NN)
(NP 's/POS failure/NN)
(NP any/DT new/JJ policy/NN measures/NNS)
(NP his/PRP$ Mansion/NNP House/NNP speech/NN)
(NP last/JJ Thursday/NNP)
(NP This/DT)
(NP the/DT risk/NN)
(NP the/DT government/NN)
(NP base/NN rates/NNS)
(NP 16/CD %/NN)
(NP their/PRP$ current/JJ 15/CD %/NN level/NN)
(NP the/DT pound/NN)
(NP economists/NNS)
(NP foreign/JJ exchange/NN market/NN analysts/NNS)
(NP The/DT risks/NNS)
(NP sterling/NN)
(NP a/DT bad/JJ trade/NN figure/NN

In [110]:
print(cp.evaluate(conll_test_NP))

ChunkParse score:
    IOB Accuracy:  88.3%%
    Precision:     82.8%%
    Recall:        76.9%%
    F-Measure:     79.7%%


__☼ An early definition of chunk was the material that occurs between chinks. Develop a chunker that starts by putting the whole sentence in a single chunk, and then does the rest of its work solely by chinking. Determine which tags (or tag sequences) are most likely to make up chinks with the help of your own utility program. Compare the performance and simplicity of this approach relative to a chunker based entirely on chunk rules.__

In [57]:
grammar_chink = r"""
  NP:
    {<.*>+}          
    }<VBG><VBN>{
    }<RB>+<VB(P|Z|D|G|N)?|IN|TO|MD>{
    }<VB(P|Z|D|G|N)?><JJ>{
    }<IN|TO|VB(P|Z|D)?|MD|CC|.|,|WDT|``|''|WRB>+{
    <NN(P|S)>}{<POS>
    <CD>}{<DT>
    
  """
cp_chink = nltk.RegexpParser(grammar_chink)

In [58]:
print(cp_chink.evaluate(conll_test_NP))

ChunkParse score:
    IOB Accuracy:  86.4%%
    Precision:     68.2%%
    Recall:        72.9%%
    F-Measure:     70.5%%


__◑ Write a tag pattern to cover noun phrases that contain gerunds, e.g. "the/DT receiving/VBG end/NN", "assistant/NN managing/VBG editor/NN". Add these patterns to the grammar, one per line. Test your work using some tagged sentences of your own devising.__

In [59]:
# already done, my RegexP chunker above accounts for possible VBGs

__◑ Write one or more tag patterns to handle coordinated noun phrases, e.g. "July/NNP and/CC August/NNP", "all/DT your/PRP$ managers/NNS and/CC supervisors/NNS", "company/NN courts/NNS and/CC adjudicators/NNS".__

In [61]:
grammar_2 = r"""
  NP:   {(<PRP\$|POS>)<JJS?|VBG|CD|VBN>?<NN(S|P)?>+}
        {<RBR>?<DT|PP\$|PRP\$>?<RB|JJR>?<JJ|VBG|CD|VBN>*<NN(S|P)?>+}
        <NN(P|S)?>{}<CC><NN(P|S)?>
"""
cp_2 = nltk.RegexpParser(grammar)

In [64]:
print(cp_2.evaluate(conll_test_NP))

ChunkParse score:
    IOB Accuracy:  86.6%%
    Precision:     81.6%%
    Recall:        70.4%%
    F-Measure:     75.6%%


__◑ Carry out the following evaluation tasks for any of the chunkers you have developed earlier. (Note that most chunking corpora contain some internal inconsistencies, such that any reasonable rule-based approach will produce errors.)__

Evaluate your chunker on 100 sentences from a chunked corpus, and report the precision, recall and F-measure.

Use the chunkscore.missed() and chunkscore.incorrect() methods to identify the errors made by your chunker. Discuss.

Compare the performance of your chunker to the baseline chunker discussed in the evaluation section of this chapter.

In [117]:
# Number 1 was already done above
chunkscore = nltk.ChunkScore()
unchunked = conll2000.chunked_sents('test.txt', chunk_types=[''])

In [118]:
chunkscore.score(conll_test_NP, ([cp.parse(sent) for sent in unchunked]))
print("Recall", chunkscore.recall())
print("Precision", chunkscore.precision())
print("F-score", chunkscore.f_measure())

Recall 0.2823061630218688
Precision 0.2823061630218688
F-score 0.2823061630218688


In [119]:
len(chunkscore.incorrect()), len(chunkscore.missed())

(1444, 1444)

In [120]:
print(cp.parse(chunkscore.incorrect()[0]))

(S
  In/IN
  (NP the/DT 1988/CD third/JJ quarter/NN)
  ,/,
  (NP the/DT forest-products/NNS company/NN)
  reported/VBD
  (NP profit/NN)
  of/IN
  $/$
  144.9/CD
  million/CD
  ,/,
  or/CC
  (NP 69/CD cents/NNS)
  (NP a/DT share/NN)
  ./.)


In [121]:
print(cp.parse(chunkscore.missed()[0]))

(S
  Without/IN
  (NP the/DT gain/NN)
  ,/,
  (NP operating/VBG profit/NN)
  was/VBD
  (NP $/$ 64/CD million/CD)
  ,/,
  or/CC
  (NP 71/CD cents/NNS)
  (NP a/DT share/NN)
  ./.)


__◑ Develop a chunker for one of the chunk types in the CoNLL corpus using a regular-expression based chunk grammar RegexpChunk. Use any combination of rules for chunking, chinking, merging or splitting.__

In [69]:
grammar = r"""
  NP:   {(<PRP\$|POS>)<JJS?|VBG|CD|VBN>?<NN(S|P)?>+}
        {<RBR>?<DT|PP\$|PRP\$>?<RB|JJR>?<JJ|VBG|CD|VBN>*<NN(S|P)?>+}
        {<PRP>}
"""

__◑ Sometimes a word is incorrectly tagged, e.g. the head noun in "12/CD or/CC so/RB cases/VBZ". Instead of requiring manual correction of tagger output, good chunkers are able to work with the erroneous output of taggers. Look for other examples of correctly chunked noun phrases with incorrect tags.__

In [141]:
for tree in conll_train_NP:
    for num in range(len(tree) - 1):
        try:
            tree[num].label()
        except AttributeError:
            pass
        else:
            for (w, tag) in (tree[num]):
                if tag == "VBZ":
                    print(tree[num])

(NP a/DT couple/NN of/IN dozen/NN cases/VBZ)
(NP only/RB about/IN 20/CD cases/VBZ)
(NP 50/CD to/TO 75/CD cases/VBZ)
(NP only/RB 12/CD or/CC so/RB cases/VBZ)
(NP closes/VBZ to/TO $/$ 15/CD billion/CD)
(NP weaker/JJR housing/NN starts/VBZ)
(NP estimates/VBZ)
(NP what/WP changes/VBZ)
(NP operating/VBG results/VBZ)
(NP offers/VBZ)
(NP 's/VBZ)
(NP the/DT production/NN runs/VBZ)
(NP Newsprint/NN and/CC postage/NN prices/VBZ)
(NP other/JJ commodity/NN markets/VBZ)
(NP employees/VBZ)
(NP Sell/VB stops/VBZ)
(NP buy/VB stops/VBZ)
(NP commissions/VBZ and/CC fees/NNS)
(NP back/RB taxes/VBZ)
(NP leads/VBZ)
(NP admits/VBZ)
(NP admits/VBZ)
(NP numbers/VBZ)
(NP nursing-home/NN stays/VBZ)
(NP longer/RB hours/VBZ)
(NP base/NN rates/VBZ)
(NP prices/VBZ)
(NP seven/CD or/CC eight/CD times/VBZ)
(NP those/DT ``/`` plain/JJ vanilla/NN ''/'' funds/VBZ)
(NP stocks/VBZ)
(NP 2,387,226/CD shares/VBZ)
(NP 1,000/CD bicycle/NN and/CC motorbike/NN tires/VBZ)
(NP calls/VBZ)
(NP rivets/VBZ)
(NP back/RB taxes/VBZ)
(NP in

__◑ The bigram chunker scores about 90% accuracy. Study its errors and try to work out why it doesn't get 100% accuracy. Experiment with trigram chunking. Are you able to improve the performance any more?__

In [146]:
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)

    def parse(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [162]:
conll_train = conll2000.chunked_sents('train.txt')
conll_test = conll2000.chunked_sents('test.txt')
conll_test_unchunked = conll2000.chunked_sents('test.txt', chunk_types = [""])

bigram_chunker = BigramChunker(conll_train)
chunkscore_2 = nltk.ChunkScore()
chunkscore_2.score(conll_test, ([bigram_chunker.parse(s) for s in conll_test_unchunked]))

In [163]:
chunkscore_2.precision()

0.21520874751491054

In [155]:
print(chunkscore_2.incorrect()[0])

(S
  (NP Scott/NNP Paper/NNP Co./NNP)
  (VP said/VBD)
  (NP it/PRP)
  (VP is/VBZ abandoning/VBG)
  (NP
    a/DT
    proposed/VBN
    $/$
    650/CD
    million/CD
    tree-farming/JJ
    project/NN)
  (PP in/IN)
  (NP Indonesia/NNP)
  (PP because/IN)
  (NP it/PRP)
  no/RB
  longer/RB
  (VP expects/VBZ to/TO use/VB)
  (PP as/IN)
  (NP much/JJ eucalyptus/NN pulp/NN)
  (PP as/IN)
  (NP previously/RB anticipated/VBN)
  ./.)


__★ We saw in 5. that it is possible to establish an upper limit to tagging performance by looking for ambiguous n-grams, n-grams that are tagged in more than one possible way in the training data. Apply the same method to determine an upper bound on the performance of an n-gram chunker.__

__★ Pick one of the three chunk types in the CoNLL corpus. Write functions to do the following tasks for your chosen type:__

List all the tag sequences that occur with each instance of this chunk type.

Count the frequency of each tag sequence, and produce a ranked list in order of decreasing frequency; each line should consist of an integer (the frequency) and the tag sequence.

Inspect the high-frequency tag sequences. Use these as the basis for developing a better chunker.

In [221]:
# A dictionary where key is the index of a tree on corpus, 
# the value is another dictionary where its key is the index of NP in a tree
# and its value is the list of tags of that particular NP
tags = defaultdict(lambda : defaultdict(lambda : list()))

for idx_tree, tree in enumerate(list(conll_train_NP)):
    for num in range(len(tree) - 1):
        try:
            tree[num].label()
        except AttributeError:
            pass
        else:
            for (w, tag) in tree[num]:
                tags[idx_tree][num].append(tag)

In [222]:
print(tags[0][2])
print(conll_train_NP[0][2])

['DT', 'NN']
(NP the/DT pound/NN)


In [229]:
final_tagset = []
for tree_num, dict_second in tags.items():
    for tuple_num, tagset in dict_second.items():
        final_tagset.append(tuple(tagset))

In [231]:
tags_NP_freq = nltk.FreqDist(final_tagset)

In [232]:
tags_NP_freq.most_common(20)

[(('DT', 'NN'), 7222),
 (('PRP',), 3802),
 (('NNS',), 3278),
 (('NN',), 3242),
 (('NNP',), 3242),
 (('NNP', 'NNP'), 2639),
 (('DT', 'JJ', 'NN'), 2119),
 (('JJ', 'NNS'), 1717),
 (('DT', 'NNS'), 1173),
 (('JJ', 'NN'), 1143),
 (('NN', 'NNS'), 1012),
 (('WDT',), 930),
 (('DT', 'NN', 'NN'), 921),
 (('CD',), 866),
 (('CD', 'NN'), 827),
 (('$', 'CD', 'CD'), 823),
 (('CD', 'NNS'), 688),
 (('NNP', 'NNP', 'NNP'), 671),
 (('PRP$', 'NN'), 624),
 (('POS', 'NN'), 550)]

In [233]:
# Regex based only on 20 most_common tag sequences
grammar_on_freq = r"""
  NP:   {<DT|POS|PRP\$|CD>?<JJ>*<NN(S|P)?>+}
        {<PRP>}
        {<$>?<CD>+}
        {<WDT>}

"""

cp_on_freq = nltk.RegexpParser(grammar_on_freq)

In [235]:
# Regex based only on 20 most freq tag sequences, scores quite good. 
# It has higher recall score, than the first parser I wrote above, which is not surprising
# This could be extended to 30 or 40 most_freq tag sequences and evaluated on conll_test_NP
print(cp_on_freq.evaluate(conll_test_NP))

ChunkParse score:
    IOB Accuracy:  88.6%%
    Precision:     77.9%%
    Recall:        79.2%%
    F-Measure:     78.6%%


__★ The baseline chunker presented in the evaluation section tends to create larger chunks than it should. For example, the phrase: [every/DT time/NN] [she/PRP] sees/VBZ [a/DT newspaper/NN] contains two consecutive chunks, and our baseline chunker will incorrectly combine the first two: [every/DT time/NN she/PRP]. Write a program that finds which of these chunk-internal tags typically occur at the start of a chunk, then devise one or more rules that will split up these chunks. Combine these with the existing baseline chunker and re-evaluate it, to see if you have discovered an improved baseline.__

In [236]:
for tree in conll_train_NP[:100]:
    for num in range(len(tree) - 1):
        try:
            tree[num].label()
            tree[num + 1].label()
        except AttributeError:
            pass
        else:
            print(tree[num], tree[num+1])

(NP release/NN) (NP tomorrow/NN)
(NP July/NNP and/CC August/NNP) (NP 's/POS near-record/JJ deficits/NNS)
(NP the/DT Exchequer/NNP) (NP Nigel/NNP Lawson/NNP)
(NP Nigel/NNP Lawson/NNP) (NP 's/POS restated/VBN commitment/NN)
(NP the/DT chancellor/NN) (NP 's/POS failure/NN)
(NP his/PRP$ Mansion/NNP House/NNP speech/NN) (NP last/JJ Thursday/NNP)
(NP Britain/NNP) (NP 's/POS manufacturing/NN industry/NN)
(NP August/NNP) (NP 's/POS unexpected/JJ decline/NN)
(NP interest/NN rates/NNS) (NP earlier/RBR this/DT month/NN)
(NP Mr./NNP Lawson/NNP) (NP 's/POS promise/NN)
(NP the/DT government/NN) (NP 's/POS popularity/NN)
(NP Mr./NNP Lawson/NNP) (NP 's/POS promise/NN)
(NP 's/POS promise/NN) (NP Friday/NNP)
(NP European/JJ trading/NN) (NP it/PRP)
(NP 2.9429/CD marks/NNS) (NP late/JJ Thursday/NNP)
(NP Friday/NNP) (NP 's/POS Market/NNP Activity/NN)
(NP the/DT coming/VBG week/NN) (NP the/DT foreign/JJ exchange/NN market/NN)
(NP 1.8470/CD marks/NNS) (NP late/JJ Thursday/NNP)
(NP New/NNP York/NNP) (NP late/

In [237]:
# I will continue improving my regex based on freq of tag sequences

grammar_on_freq_split = r"""
  NP:   {<DT|POS|PRP\$|CD>?<JJ>*<NN(S|P)?>+}
        {<PRP>}
        {<$>?<CD>+}
        {<WDT>}
        <NN(P|S)?>}{<POS>
        <NN(P|S)?>}{<PRP|JJ>
        <CD|NN(P|S)?>}{<DT><JJ>*<NN(P|S)?>

"""

cp_on_freq_split = nltk.RegexpParser(grammar_on_freq_split)

In [239]:
print(cp_on_freq_split.evaluate(conll_test_NP))     # no change in metrics

ChunkParse score:
    IOB Accuracy:  88.6%%
    Precision:     77.9%%
    Recall:        79.2%%
    F-Measure:     78.6%%


__★ Develop an NP chunker that converts POS-tagged text into a list of tuples, where each tuple consists of a verb followed by a sequence of noun phrases and prepositions, e.g. the little cat sat on the mat becomes ('sat', 'on', 'NP')...__

In [None]:
# TODO

__★ The Penn Treebank contains a section of tagged Wall Street Journal text that has been chunked into noun phrases. The format uses square brackets, and we have encountered it several times during this chapter. The Treebank corpus can be accessed using: for sent in nltk.corpus.treebank_chunk.chunked_sents(fileid). These are flat trees, just as we got using nltk.corpus.conll2000.chunked_sents().__

The functions nltk.tree.pprint() and nltk.chunk.tree2conllstr() can be used to create Treebank and IOB strings from a tree. Write functions chunk2brackets() and chunk2iob() that take a single chunk tree as their sole argument, and return the required multi-line string representation.

Write command-line conversion utilities bracket2iob.py and iob2bracket.py that take a file in Treebank or CoNLL format (resp) and convert it to the other format. (Obtain some raw Treebank or CoNLL data from the NLTK Corpora, save it to a file, and then use for line in open(filename) to access it from Python.)

In [253]:
treebank = nltk.corpus.treebank_chunk.chunked_sents()

In [320]:
# I saved the code as my_conll_print.py
import my_conll_print

In [321]:
my_conll_print.pprint_tree(treebank[10])

(S
  ''/''
  (NP Neither/DT Lorillard/NNP)
  nor/CC
  (NP the/DT researchers/NNS)
  (NP who/WP)
  studied/VBD
  (NP the/DT workers/NNS)
  were/VBD
  aware/JJ
  of/IN
  (NP any/DT research/NN)
  on/IN
  (NP smokers/NNS)
  of/IN
  (NP the/DT)
  Kent/NNP
  (NP cigarettes/NNS)
  ./.)


In [322]:
my_conll_print.pprint_iob(treebank[0])


Pierre NNP B-NP
Vinken NNP I-NP
, , O
61 CD B-NP
years NNS I-NP
old JJ O
, , O
will MD O
join VB O
the DT B-NP
board NN I-NP
as IN O
a DT B-NP
nonexecutive JJ I-NP
director NN I-NP
Nov. NNP I-NP
29 CD I-NP
. . O


__★ An n-gram chunker can use information other than the current part-of-speech tag and the n-1 previous chunk tags. Investigate other models of the context, such as the n-1 previous part-of-speech tags, or some combination of previous chunk tags along with previous and following part-of-speech tags.__

In [326]:
class ConsecutiveNPChunkTagger(nltk.TaggerI):

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.MaxentClassifier.train(
            train_set, trace=0)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [None]:
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    return {"pos": pos}

chunker = ConsecutiveNPChunker(conll_train)
print(chunker.evaluate(conll_test))