In [13]:
from IPython.core.display import HTML

# Read the stylesheet inside a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open("style.css") as css_file:
    css_rules = css_file.read()

# Keep this as the last expression of the cell so the notebook renders it.
HTML("<style>" + css_rules + "</style>")

# Bigram Tagger

<div class="task_description">
    <i class="task">Task 1:</i> <br>
</div>

Implement a bigram tagger in the same way as presented in the lecture (Slide 31). Remember to handle the beginning of a sentence properly, as the first token has no previous tag.

In [9]:
## SOLUTION ##

import nltk

class MyBigramTagger:
    """A simple bigram tagger.

    The tag of a word is predicted from the context
    (previous tag, current word): the tag seen most often for that context
    in the training data wins. Contexts that never occurred in training are
    tagged "UNKNOWN".
    """

    def __init__(self, train_sents):
        """Build the lookup table from tagged training sentences.

        Args:
            train_sents: iterable of sentences, each a sequence of
                (word, tag) pairs.

        The result is stored in ``self.cfd``, a ConditionalFreqDist whose
        conditions are (previous tag, current word) tuples and whose samples
        are the current tags.
        """
        self.cfd = nltk.ConditionalFreqDist(
            ((prev_tag, current_word), current_tag)
            for sent in train_sents
            for (_prev_word, prev_tag), (current_word, current_tag)
            in nltk.bigrams(self.__add_BOS(sent)))

    def __add_BOS(self, s):
        """Return s as a list with the dummy pair ("<BOS>", "BOS") prepended.

        The first real token of a sentence has no previous tag; the dummy
        tag "BOS" fills that slot. Kept as an explicit method to make this
        clear. ``list(s)`` allows any sequence of pairs (e.g. a tuple), not
        only a list.
        """
        return [("<BOS>", "BOS")] + list(s)

    def tag_a_sent(self, s):
        """Return the tagged sentence of s as a list of (word, tag) pairs.

        Args:
            s: the sentence in tokenized form (an iterable of words).

        Returns:
            One (word, tag) pair per input word. A word whose
            (previous tag, word) context was not seen in training is tagged
            "UNKNOWN", and "UNKNOWN" then becomes the previous tag for the
            next word.
        """
        prev_tag = "BOS"  # Sentences always start with the dummy tag.
        sent_tagged = []

        for word in s:
            # Use .get() rather than indexing: indexing a ConditionalFreqDist
            # with an unseen condition inserts an empty FreqDist, so merely
            # tagging text would keep growing (and altering) the trained model.
            tag_dist = self.cfd.get((prev_tag, word))
            tag = tag_dist.max() if tag_dist else "UNKNOWN"
            sent_tagged.append((word, tag))
            prev_tag = tag

        return sent_tagged
                

A given unigram tagger:

In [10]:
class MyUnigramTagger:
    """A simple look-up tagger.

    Every word receives the tag it carried most often in the training data,
    independent of its neighbours. Words not seen in training are tagged
    "UNKNOWN".
    """

    def __init__(self, train_sents):
        """Count how often each tag occurs for each word.

        Args:
            train_sents: iterable of sentences, each an iterable of
                (word, tag) pairs.

        The counts are stored in ``self.cfd``, a ConditionalFreqDist with the
        word as condition and the tag as sample.
        """
        self.cfd = nltk.ConditionalFreqDist(
            (word, tag)
            for sent in train_sents
            for (word, tag) in sent)

    def tag_a_sent(self, s):
        """Return the tagged sentence of s as a list of (word, tag) pairs.

        Args:
            s: the sentence in tokenized form (an iterable of words).

        Returns:
            One (word, tag) pair per input word; words without any recorded
            tag get the tag "UNKNOWN".
        """
        sent_tagged = []

        for word in s:
            # Use .get() rather than indexing: indexing a ConditionalFreqDist
            # with an unseen word inserts an empty FreqDist, so tagging would
            # silently modify the trained model.
            tag_dist = self.cfd.get(word)
            tag = tag_dist.max() if tag_dist else "UNKNOWN"
            sent_tagged.append((word, tag))

        return sent_tagged
        

<div class="task_description">
    <i class="task">Task 2:</i> <br>
</div>

Train both taggers on the tagged sentences of the Brown corpus. Use the universal tagset.

In [11]:
## SOLUTION ##

# Load the training data once so both taggers are visibly trained on exactly
# the same sentences (Brown corpus, mapped to the universal tagset).
# NOTE: requires the NLTK "brown" and "universal_tagset" data packages.
brown_tagged_sents = nltk.corpus.brown.tagged_sents(tagset="universal")

my_unigram = MyUnigramTagger(brown_tagged_sents)
my_bigram = MyBigramTagger(brown_tagged_sents)

<div class="task_description">
    <i class="task">Task 3:</i> <br>
</div>

Find one example where both taggers succeed and one example where the bigram tagger is more accurate than the unigram tagger.

In [23]:
## SOLUTION ##

# sent1: "display" is a noun -> both taggers should get it right.
# sent2: "display" is a verb after "to" -> only the bigram tagger can use
#        the previous tag to pick the right reading.
sent1 = "The display of my phone is broken."
sent2 = "I want to display this text message."

# Tokenize each sentence once, then tag it with the unigram tagger first and
# the bigram tagger second.
tagged_results = [
    tagger.tag_a_sent(tokens)
    for tokens in (nltk.word_tokenize(sent1), nltk.word_tokenize(sent2))
    for tagger in (my_unigram, my_bigram)
]

# One result per paragraph, separated by a blank line.
print(*tagged_results, sep='\n' * 2)


[('The', 'DET'), ('display', 'NOUN'), ('of', 'ADP'), ('my', 'DET'), ('phone', 'NOUN'), ('is', 'VERB'), ('broken', 'VERB'), ('.', '.')]

[('The', 'DET'), ('display', 'NOUN'), ('of', 'ADP'), ('my', 'DET'), ('phone', 'NOUN'), ('is', 'VERB'), ('broken', 'VERB'), ('.', '.')]

[('I', 'PRON'), ('want', 'VERB'), ('to', 'PRT'), ('display', 'NOUN'), ('this', 'DET'), ('text', 'NOUN'), ('message', 'NOUN'), ('.', '.')]

[('I', 'PRON'), ('want', 'VERB'), ('to', 'PRT'), ('display', 'VERB'), ('this', 'DET'), ('text', 'NOUN'), ('message', 'NOUN'), ('.', '.')]
