# Practice stuff
This is taken from NLTK chapter 7, with slight modifications :) Note that this overlaps quite a bit with the next lab on information extraction - so it pays off to read through this chapter and experiment a bit!

In [None]:
import nltk

In [None]:
# define a helper function for a given chunker, that tokenizes -> pos tags
def parse_sent(parser, sent):
    """Tokenize and POS-tag ``sent``, then hand the tagged tokens to ``parser``.

    Args:
        parser: Object exposing ``parse(tagged_tokens)``, e.g. an
            ``nltk.RegexpParser``.
        sent: Raw sentence string.

    Returns:
        Whatever ``parser.parse`` returns for the tagged tokens.
    """
    tokens = nltk.word_tokenize(sent)
    tagged_tokens = nltk.pos_tag(tokens)
    return parser.parse(tagged_tokens)

In [None]:
# Example sentence that the chunk grammars in the following cells are run on.
sentence = "the little bear saw the fine fat trout in the brook"

In [None]:
# First attempt: an NP is exactly one determiner, one adjective and one noun.
# NOTE: <JJ> is mandatory in this pattern, despite what the comment inside the
# grammar string says.
grammar = r"""
NP: {<DT><JJ><NN>} # DT (optional adjective) followed by NN
"""
cp = nltk.RegexpParser(grammar)
parse_sent(cp, sentence)

This isn't quite right: "the fine fat" is now an NP, instead of "the fine fat trout". We must allow multiple nouns in an NP (add a `+`)!

Also, let's incorporate optional adjectives, such that "the brook" becomes a NP (? after the definition)

In [None]:
# Second attempt: the adjective is now optional (<JJ>?) and one or more nouns
# may follow the determiner (<NN>+).
grammar = r"""
NP: {<DT><JJ>?<NN>+} # DT (optional adjective) followed by NN
"""
cp = nltk.RegexpParser(grammar)
parse_sent(cp, sentence)

Fixed! Now we need to support VPs as we saw in the lecture: VP = V followed by one or more NPs or PPs.

In [None]:
# Three-rule grammar. The rules are listed in the order NP, PP, VP so that the
# later rules can refer to the <NP> and <PP> chunks of the earlier ones.
# <NN.*> matches any tag that starts with NN, not only NN itself.
grammar = r"""
NP: {<DT><JJ>?<NN.*>+} # DT (optional adjective) followed by NN
PP: {<IN><NP>}         # prepositions followed by NP
VP: {<VB.*><NP|PP>+}   # match one or more of NP or PP
"""
cp = nltk.RegexpParser(grammar)
parse_sent(cp, sentence)

In [None]:
# Run the current `cp` parser on a second example sentence.
sentence = "the girl that worked at the university ntnu"
parse_sent(cp, sentence)

## Explore subtrees!
Use the brown corpus, iterate a few sentences and parse them with the grammar:

any verb &rarr; TO &rarr; any verb

In [None]:
# Chunk "verb TO verb" sequences and print the words of every match found in
# the first 200 tagged Brown sentences.
cp = nltk.RegexpParser('chonker: {<V.*> <TO> <V.*>}')
brown = nltk.corpus.brown
for sent in brown.tagged_sents()[:200]:  # <-- limit to avoid print spam
    tree = cp.parse(sent)
    # Let subtrees() do the label filtering instead of an explicit `if`.
    for subtree in tree.subtrees(lambda node: node.label() == 'chonker'):
        words = (word for word, _tag in subtree.leaves())
        print(" ".join(words))

## Reverse the operation with Chinks!

In [None]:
# Chinking: the first rule chunks the whole tag sequence as one NP, the second
# rule then removes (chinks) every run of V.* or IN tags from that chunk.
grammar = r"""
  NP:
    {<.*>+}          # Chunk everything
    }<V.*|IN>+{      # Chink sequences of V and IN
  """
cp = nltk.RegexpParser(grammar)
sentence = "the little bear saw the fine fat trout in the brook"
parse_sent(cp, sentence)

Observe how the `DT JJ JJ NN` sequence is now an NP because of the VBD|IN chink!

We also chunked "the brook" correctly. Amazing!

# Evaluation with predefined chunked sentences
Using annotated data sets, you can create your own chunker and evaluate it on true sentences :)

In [None]:
# Bind the conll2000 corpus reader to `wsj` and show the chunked training
# sentence at index 10.
wsj = nltk.corpus.conll2000
wsj.chunked_sents("train.txt")[10]

## specify chunk types
Only chunk NPs. Note how the VP above is no longer chunked.

In [None]:
# Same training sentence (index 10), restricted to the NP chunk type.
wsj.chunked_sents("train.txt", chunk_types=["NP"])[10]

In [None]:
# Baseline: score a parser built from an empty grammar on the NP-only test
# sentences. `train_sents` is not used in this cell; it is loaded here for the
# trained chunker further down.
cp = nltk.RegexpParser("")  # empty regex parser
train_sents = wsj.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = wsj.chunked_sents('test.txt', chunk_types=['NP'])
print(cp.accuracy(test_sents))

In [None]:
# Regexp baseline: chunk every run of tags whose name starts with C, D, J, N
# or P as an NP, then score it on the same test sentences.
grammar = r"NP: {<[CDJNP].*>+}"  # populate some rules
cp = nltk.RegexpParser(grammar)
print(cp.accuracy(test_sents))

## Improve with a custom chunker!
use the training corpus to find the chunk tag (I, O, or B) that is most likely for each part-of-speech tag. In other words, we can build a chunker using a unigram tagger. But rather than trying to determine the correct part-of-speech tag for each word, we are trying to determine the correct chunk tag, given each word's part-of-speech tag.

In [None]:
class NgramChunker(nltk.ChunkParserI):
    """Chunker that predicts a chunk tag for each POS tag with an n-gram tagger."""

    def __init__(self, train_sents, tagger=nltk.UnigramTagger):
        """Train ``tagger`` on (POS tag, chunk tag) sequences.

        Args:
            train_sents: Chunk trees; each is flattened with
                ``nltk.chunk.tree2conlltags`` and the word is dropped.
            tagger: Tagger class (or factory) called with the training
                sequences; defaults to ``nltk.UnigramTagger``.
        """
        pos_chunk_sequences = []
        for chunked in train_sents:
            triples = nltk.chunk.tree2conlltags(chunked)
            pos_chunk_sequences.append(
                [(pos, chunk_tag) for _word, pos, chunk_tag in triples]
            )
        self.tagger = tagger(pos_chunk_sequences)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        Args:
            sentence: Sequence of ``(word, pos)`` pairs.

        Returns:
            The tree built by ``nltk.chunk.conlltags2tree`` from the words,
            their POS tags and the chunk tags predicted by the tagger.
        """
        # The tagger sees the POS tags as its "words" and emits chunk tags.
        pos_sequence = [pos for _word, pos in sentence]
        predictions = self.tagger.tag(pos_sequence)
        conll_triples = [
            (word, pos, chunk_tag)
            for (word, pos), (_pos, chunk_tag) in zip(sentence, predictions)
        ]
        return nltk.chunk.conlltags2tree(conll_triples)

In [None]:
# Train the chunker on the NP-only training sentences, using a bigram tagger
# instead of the default unigram tagger.
ngram_chunker = NgramChunker(
    train_sents,
    tagger=nltk.BigramTagger
    # can be unigram, trigram, or whatever you implemented in earlier labs
)

In [None]:
# Score the trained chunker on the NP-only test sentences.
print(ngram_chunker.accuracy(test_sents))

## Recursion

In [None]:
# Grammar with mutually dependent rules: a VP may contain a CLAUSE, while a
# CLAUSE is itself built from an NP followed by a VP.
grammar = r"""
  NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
  PP: {<IN><NP>}               # Chunk prepositions followed by NP
  VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
  CLAUSE: {<NP><VP>}           # Chunk NP, VP
  """
cp = nltk.RegexpParser(grammar)
txt = "Mary saw the cat sit on the mat"
parse_sent(cp, txt)

In [None]:
# The previous sentence embedded under "John thinks", parsed with the same parser.
txt = "John thinks Mary saw the cat sit on the mat"
parse_sent(cp, txt)

In [None]:
# Rebuild the parser from the same grammar with loop=2 (per the NLTK docs, the
# number of times the rule cascade is run) and parse the embedded sentence again.
cp = nltk.RegexpParser(grammar, loop=2)
parse_sent(cp, txt)

# tree traversal
a generic traverse algo

In [None]:
def traverse(t):
    """Print ``t`` in bracketed form on a single line.

    If calling ``t.label()`` raises ``AttributeError``, ``t`` is treated as a
    leaf and printed as-is. Otherwise an opening parenthesis and the label are
    printed, every child is traversed recursively, and a closing parenthesis
    follows. Each printed item ends with a space instead of a newline.
    """
    try:
        node_label = t.label()
    except AttributeError:
        # Leaf: nothing to descend into, emit the value itself.
        print(t, end=" ")
        return
    print('(', node_label, end=" ")
    for kid in t:
        traverse(kid)
    print(')', end=" ")

In [None]:
# Parse `txt` with the current parser and print the result with traverse().
tree = parse_sent(cp, txt)
traverse(tree)

# named entity recognition!

In [None]:
# Take the tagged treebank sentence at index 1 and print its words joined by spaces.
sent = nltk.corpus.treebank.tagged_sents()[1]
print(" ".join([w for w, pos in sent]))

In [None]:
# binary=True: per the NLTK docs, entities get a single NE label instead of
# per-type labels.
nltk.ne_chunk(sent, binary=True)

In [None]:
# Same tagged sentence without the binary flag.
nltk.ne_chunk(sent)  # note how this separates singular entities

In [None]:
import re
# Pattern: any text ending in the standalone word "in", rejected by the
# negative lookahead when "ing" occurs later in the text.
pattern = re.compile(r'.*\bin\b(?!\b.+ing)')
# Print, as rtuples, the ORG/LOC relations that extract_rels finds with this
# pattern in the parsed docs of NYT_19980315.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=pattern):
        print(nltk.sem.rtuple(rel))

In [None]:
from nltk.corpus import conll2002
# Verbose-mode regex: one of is/was/werd/wordt tagged V, then anything, then
# "van" tagged Prep. Whitespace and the #-comments inside the string are
# ignored because it is compiled with re.VERBOSE.
vnv = """
(
is/V|    # 3rd sing present and
was/V|   # past forms of the verb zijn ('be')
werd/V|  # and also present
wordt/V  # past of worden ('become)
)
.*       # followed by anything
van/Prep # followed by van ('of')
"""
VAN = re.compile(vnv, re.VERBOSE)
# Print each PER/ORG relation found in the Dutch training sentences, first in
# clause form and then as an rtuple with left and right context.
for doc in conll2002.chunked_sents('ned.train'):
    for rel in nltk.sem.extract_rels('PER', 'ORG', doc,
                                     corpus='conll2002', pattern=VAN):
        print(nltk.sem.clause(rel, relsym="VAN"))
        # Reach rtuple through nltk.sem, like extract_rels and clause above,
        # rather than through the top-level nltk re-export.
        print(nltk.sem.rtuple(rel, lcon=True, rcon=True))

# Making use of treebanks

In [None]:
from nltk.corpus import treebank
# First parsed sentence of wsj_0001.mrg; the bare `t` makes the notebook display it.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
t

Create a production rule of the form VP &rarr; S

In [None]:
def vp_s(tree):
    """Return True when ``tree`` is labelled VP and has an S-labelled subtree child.

    Only direct children that are ``nltk.Tree`` instances are considered;
    leaf children are ignored.
    """
    if tree.label() != 'VP':
        return False
    return any(
        isinstance(kid, nltk.Tree) and kid.label() == 'S' for kid in tree
    )

def np_pp(tree):
    """Return True when ``tree`` is labelled NP and has a PP-labelled subtree child.

    Only direct children that are ``nltk.Tree`` instances are considered;
    leaf children are ignored.
    """
    if tree.label() != 'NP':
        return False
    return any(
        isinstance(kid, nltk.Tree) and kid.label() == 'PP' for kid in tree
    )

In [None]:
sents = treebank.parsed_sents()[:10]  # sample from first 10

# Print every subtree that vp_s accepts (a VP node with an S child); the
# predicate is passed to subtrees() as its filter.
for tree in sents:
    for subtree in tree.subtrees(vp_s):
        print(subtree)


In [None]:
# Print every subtree of the sampled sentences that np_pp accepts.
for tree in sents:
    for subtree in tree.subtrees(np_pp):
        print(subtree)