In [3]:
import nltk

from nltk.metrics import ConfusionMatrix

In [4]:
# Read the Tiger annotated Stuttgart Tübingen tag set (STTS) as Universal tagset
def read_stts_file_as_universal_tagset(path):
    # Tagset: Stuttgart Tübingen tag set aka STTS to Universal
    # Documentation: https://www.nltk.org/api/nltk.tag.html#module-nltk.tag.mapping
    # Input mapping page: https://github.com/slavpetrov/universal-pos-tags/blob/master/de-tiger.map
    # Output mapping page: https://github.com/slavpetrov/universal-pos-tags/blob/master/universal_tags.py
    universal_tagset = nltk.tag.tagset_mapping('de-tiger', 'universal') # NLTK <3

    tagged_sentences = []
    
    with open(path) as file:
        for index, sentence in enumerate(file):
            tagged_sentences.append([])
            for textblock in sentence.split(";"):
                textblock = textblock.strip() # Trim leading or trailing whitespaces
                touple = textblock.split("/")
                if(len(touple) > 1): # Check the length to prevent a crash at the beginning
                    if (not textblock.startswith("//")):
                        tagged_sentences[index].append((touple[0], universal_tagset[touple[1]]))
                    else:
                        textvalue = ""
                        textblock = textblock[2:] # Strip away the leading //
                        
                        if textblock == "$,":
                            textvalue = ","
                        elif textblock == "$.":
                            textvalue = "."
                        elif textblock == "$(":
                            textvalue = "-"
                        else:
                            print("Malformed tag: ", textblock)

                        tagged_sentences[index].append((textvalue, universal_tagset[textblock]))
                #else:
                    #print("Missed:", index, len(touple), textblock)

    return tagged_sentences


In [5]:
tagged_sentences_train = read_stts_file_as_universal_tagset("POS_German_train.txt")
tagged_sentences_minitest = read_stts_file_as_universal_tagset("POS_German_minitest.txt")

In [6]:
length = int(len(tagged_sentences_train) * 0.9)
real_train_sentences = tagged_sentences_train[:length]
real_eval_sentences = tagged_sentences_train[length:]
real_minitest = tagged_sentences_minitest

In [7]:
print(real_train_sentences[0])
print(real_eval_sentences[0])
print(real_minitest[0])

[('``', '.'), ('Ross', 'NOUN'), ('Perot', 'NOUN'), ('waere', 'VERB'), ('vielleicht', 'ADV'), ('ein', 'DET'), ('praechtiger', 'ADJ'), ('Diktator', 'NOUN'), ("''", '.')]
[('Hinzu', 'PRT'), ('kommen', 'VERB'), ('Sprachprobleme', 'NOUN'), ('und', 'CONJ'), ('die', 'DET'), ('Unkenntnis', 'NOUN'), ('des', 'DET'), ('deutschen', 'ADJ'), ('Rechts', 'NOUN'), ('.', '.')]
[('BONN', 'NOUN'), (',', '.'), ('10.', 'ADJ'), ('Maerz', 'NOUN'), ('(', '.'), ('dpa', 'NOUN'), (')', '.'), ('.', '.')]


In [8]:
# Good & simple overview: https://www.kaggle.com/saxinou/nlp-02-categorizing-and-tagging-words
def create_tagger(sentences):
    default_tagger = nltk.DefaultTagger("NOUN")
    regexp_tagger = nltk.RegexpTagger([(r'^-?[0-9]+(.[0-9]+)?$', 'NUM')], backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(sentences, backoff=regexp_tagger)
    bigram_tagger = nltk.BigramTagger(sentences, backoff=unigram_tagger)
    trigram_tagger = nltk.BigramTagger(sentences, backoff=bigram_tagger)
    return (default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger)

In [9]:
def evaluate_tagger(default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger, sentences):
    print("Evaluate Default: ", default_tagger.evaluate(sentences))
    print("Evaluate RegExp: ", regexp_tagger.evaluate(sentences))
    print("Evaluate Unigram: ", unigram_tagger.evaluate(sentences))
    print("Evaluate Bigram: ", bigram_tagger.evaluate(sentences))
    print("Evaluate Trigram: ", trigram_tagger.evaluate(sentences))
    return trigram_tagger.evaluate(sentences)

In [10]:
def tag_sentence(default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger, sentence):
    tokens = nltk.word_tokenize(sentence)
    print("Test Default: ", default_tagger.tag(tokens))
    print("Test RegExp: ", regexp_tagger.tag(tokens))
    print("Test Unigram: ", unigram_tagger.tag(tokens))
    print("Test Bigram: ", bigram_tagger.tag(tokens))
    print("Test Trigram: ", bigram_tagger.tag(tokens))

In [11]:
default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger = create_tagger(real_train_sentences)

In [12]:
# Note: Trigram to bigram is a 1/100 better
evaluate_tagger(default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger, real_eval_sentences)

Evaluate Default:  0.2728484715403799
Evaluate RegExp:  0.2921247862979751
Evaluate Unigram:  0.950985774111078
Evaluate Bigram:  0.956051884498957
Evaluate Trigram:  0.956051884498957


0.956051884498957

In [13]:
# Note: Trigram to bigram is only a 1/1000 better
evaluate_tagger(default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger, real_minitest)

Evaluate Default:  0.2765736684344017
Evaluate RegExp:  0.28890938436707403
Evaluate Unigram:  0.9468526631311966
Evaluate Bigram:  0.9519252939820152
Evaluate Trigram:  0.9519252939820152


0.9519252939820152

In [14]:
# Check: https://wortarten.info/
sentence = "Ich trinke immer gerne ein kühles Weizenbier während ich auf dem Sofa liege und Musik höre. Auch im Jahre 2018!"
tag_sentence(default_tagger, regexp_tagger, unigram_tagger, bigram_tagger, trigram_tagger, sentence)

Test Default:  [('Ich', 'NOUN'), ('trinke', 'NOUN'), ('immer', 'NOUN'), ('gerne', 'NOUN'), ('ein', 'NOUN'), ('kühles', 'NOUN'), ('Weizenbier', 'NOUN'), ('während', 'NOUN'), ('ich', 'NOUN'), ('auf', 'NOUN'), ('dem', 'NOUN'), ('Sofa', 'NOUN'), ('liege', 'NOUN'), ('und', 'NOUN'), ('Musik', 'NOUN'), ('höre', 'NOUN'), ('.', 'NOUN'), ('Auch', 'NOUN'), ('im', 'NOUN'), ('Jahre', 'NOUN'), ('2018', 'NOUN'), ('!', 'NOUN')]
Test RegExp:  [('Ich', 'NOUN'), ('trinke', 'NOUN'), ('immer', 'NOUN'), ('gerne', 'NOUN'), ('ein', 'NOUN'), ('kühles', 'NOUN'), ('Weizenbier', 'NOUN'), ('während', 'NOUN'), ('ich', 'NOUN'), ('auf', 'NOUN'), ('dem', 'NOUN'), ('Sofa', 'NOUN'), ('liege', 'NOUN'), ('und', 'NOUN'), ('Musik', 'NOUN'), ('höre', 'NOUN'), ('.', 'NOUN'), ('Auch', 'NOUN'), ('im', 'NOUN'), ('Jahre', 'NOUN'), ('2018', 'NUM'), ('!', 'NOUN')]
Test Unigram:  [('Ich', 'PRON'), ('trinke', 'VERB'), ('immer', 'ADV'), ('gerne', 'ADV'), ('ein', 'DET'), ('kühles', 'NOUN'), ('Weizenbier', 'NOUN'), ('während', 'NOUN'), 

In [15]:
def plot_confusion_matrix(path):
    tagged_sentences = read_stts_file_as_universal_tagset(path)

    flatted_tags = [item for sublist in tagged_sentences for item in sublist]
    tags = [touple[1] for touple in flatted_tags]

    flatted_words = [item for sublist in tagged_sentences for item in sublist]
    words = [touple[0] for touple in flatted_words]
    
    tagged_words = bigram_tagger.tag(words)
    real_tagged_tags = [touple[1] for touple in tagged_words]

    confusion_matrix = ConfusionMatrix(tags, real_tagged_tags)

    print("Accuracy through evaluation: ", trigram_tagger.evaluate(tagged_sentences))
    print(confusion_matrix.pretty_format(show_percents=True))
    print(confusion_matrix)

In [16]:
plot_confusion_matrix("POS_German_minitest.txt")

Accuracy through evaluation:  0.9519252939820152
     |                                  C             N             P             V        |
     |             A      A      A      O      D      O      N      R      P      E        |
     |             D      D      D      N      E      U      U      O      R      R        |
     |      .      J      P      V      J      T      N      M      N      T      B      X |
-----+-------------------------------------------------------------------------------------+
   . | <13.6%>     .      .      .      .      .      .      .      .      .      .      . |
 ADJ |      .  <6.3%>     .   0.1%      .      .   1.5%      .      .   0.0%   0.1%   0.0% |
 ADP |      .      . <11.6%>  0.0%   0.1%      .   0.0%      .      .   0.3%      .   0.0% |
 ADV |      .   0.0%   0.2%  <3.9%>  0.1%      .   0.0%      .   0.1%   0.0%      .   0.0% |
CONJ |      .      .   0.1%   0.0%  <3.4%>     .      .      .   0.0%      .      .      . |
 DET |      .      . 