# Tests for HanTa Dutch

Some simple tests to check whether everything is doing what it is suposed to do.

In [1]:
import sys
sys.path.insert(1, '..')
import HanoverTagger as ht #Do not import form the package but from the parent folder where the latest source file is found

tagger = ht.HanoverTagger(r'../morphmodel_dutch.pgz')

## A number of arbitrary examples

In [2]:
tagger.analyze('waren',taglevel=1)

('zijn', 'WW(pv,verl,mv)')

In [3]:
tagger.analyze('ondergelopene',taglevel=2)

('onder+ge+lop+en+e', 'WW(vd,nom,met-e,mv-n)')

In [4]:
tagger.tag_word('voorspellingsmogelijkheden')

[('N(soort,mv,basis)', -30.190317050804087)]

In [5]:
tagger.analyze('voorspellingsmogelijkheden',taglevel=3 )

('voorspellingsmogelijkheid',
 [('voor', 'PTK'),
  ('spell', 'WW'),
  ('ing', 'SUF_WW_SUBST'),
  ('s', 'LI'),
  ('mogelijk', 'ADJ'),
  ('hed', 'SUF_A_SUBST'),
  ('en', 'SUF_N_N')],
 'N(soort,mv,basis)')

In [6]:
tagger.analyze('voorspellingsmogelijkheden',taglevel=2 )

('voor+spell+ing+s+mogelijk+heid+en', 'N(soort,mv,basis)')

In [7]:
tagger.analyze('syrisch-orthodoxe',taglevel=3)

('syrisch-orthodox',
 [('syrisch', 'ADJ'),
  ('-', 'LI(symb)'),
  ('orthodox', 'ADJ'),
  ('e', 'SUF_ADJ_E')],
 'ADJ(prenom,basis,met-e,stan)')

In [8]:
print(tagger.analyze('mannen',taglevel=0))
print(tagger.analyze('mannen',taglevel=1))
print(tagger.analyze('mannen',taglevel=2))
print(tagger.analyze('mannen',taglevel=3))

N(soort,mv,basis)
('man', 'N(soort,mv,basis)')
('man+en', 'N(soort,mv,basis)')
('man', [('mann', 'N_VAR(soort,zijd)'), ('en', 'SUF_N_N')], 'N(soort,mv,basis)')


In [9]:
print(tagger.analyze('verliepen',taglevel=0))
print(tagger.analyze('verliepen',taglevel=1))
print(tagger.analyze('verliepen',taglevel=2))
print(tagger.analyze('verliepen',taglevel=3))

WW(pv,verl,mv)
('verlopen', 'WW(pv,verl,mv)')
('ver+lop+en', 'WW(pv,verl,mv)')
('verlop', [('ver', 'PTK'), ('liep', 'WW_VAR(verl)'), ('en', 'SUF_WW(mv)')], 'WW(pv,verl,mv)')


In [10]:
print(tagger.analyze('dook',taglevel=0))
print(tagger.analyze('dook',taglevel=1))
print(tagger.analyze('dook',taglevel=2))
print(tagger.analyze('dook',taglevel=3))

WW(pv,verl,ev)
('duiken', 'WW(pv,verl,ev)')
('duik', 'WW(pv,verl,ev)')
('duik', [('dook', 'WW_VAR(verl)')], 'WW(pv,verl,ev)')


In [11]:
print(tagger.analyze('opgebouwd',taglevel=0))
print(tagger.analyze('opgebouwd',taglevel=1))
print(tagger.analyze('opgebouwd',taglevel=2))
print(tagger.analyze('opgebouwd',taglevel=3))

WW(vd,vrij,zonder)
('opbouwen', 'WW(vd,vrij,zonder)')
('op+ge+bouw+d', 'WW(vd,vrij,zonder)')
('opbouw', [('op', 'PTK'), ('ge', 'PREF_VD'), ('bouw', 'WW'), ('d', 'SUF_VD')], 'WW(vd,vrij,zonder)')


In [12]:
import nltk

zin1 = 'De kinderen waren met een voetbal aan het voetballen.'
zin2 = 'De BMM beschikt over een beeldverwerkingstation.'

woorden = nltk.word_tokenize(zin1)
tagger.tag_sent(woorden,taglevel=1)

[('De', 'de', 'LID(bep,stan,rest)'),
 ('kinderen', 'kind', 'N(soort,mv,basis)'),
 ('waren', 'zijn', 'WW(pv,verl,mv)'),
 ('met', 'met', 'VZ(init)'),
 ('een', 'een', 'LID(onbep,stan,agr)'),
 ('voetbal', 'voetbal', 'N(soort,ev,basis,onz,stan)'),
 ('aan', 'aan', 'VZ(init)'),
 ('het', 'het', 'LID(bep,stan,evon)'),
 ('voetballen', 'voetballen', 'WW(inf,nom,zonder,zonder-n)'),
 ('.', '.', 'LET()')]

In [13]:
woorden = nltk.word_tokenize(zin2)
tagger.tag_sent(woorden,taglevel=1)

[('De', 'de', 'LID(bep,stan,rest)'),
 ('BMM', 'Bmm', 'N(eigen,ev,basis,zijd,stan)'),
 ('beschikt', 'beschikken', 'WW(pv,tgw,met-t)'),
 ('over', 'over', 'VZ(init)'),
 ('een', 'een', 'LID(onbep,stan,agr)'),
 ('beeldverwerkingstation',
  'beeldverwerkingstation',
  'N(soort,ev,basis,onz,stan)'),
 ('.', '.', 'LET()')]

In [14]:
tagger.analyze('ijsbaantje',taglevel=3,pos = 'N(soort,ev,dim,onz,stan)')

('ijsbaan',
 [('ijs', 'N(soort,onz)'), ('baan', 'N(soort,zijd)'), ('tje', 'SUF_DIM')],
 'N(soort,ev,dim,onz,stan)')

In [15]:
tagger.analyze('voorbereiding',taglevel=3)

('voorbereiding',
 [('voor', 'PTK'),
  ('be', 'PREF_WW'),
  ('reid', 'WW_KERN'),
  ('ing', 'SUF_WW_SUBST')],
 'N(soort,ev,basis,zijd,stan)')

In [16]:
tagger.analyze('bodemvochtigheid',taglevel=3)

('bodemvochtigheid',
 [('bodem', 'N(soort,zijd)'), ('vochtig', 'ADJ'), ('heid', 'SUF_SUBST')],
 'N(soort,ev,basis,zijd,stan)')

In [17]:
tagger.tag_word('bodemvochtigheid')

[('N(soort,ev,basis,zijd,stan)', -25.125872389479067)]

In [18]:
tagger.tag_word('luchtvochtigheid')

[('N(soort,ev,basis,zijd,stan)', -13.715142898934982)]

In [19]:
tagger.tag_word('ware')

[('ADJ(prenom,basis,met-e,stan)', -10.94748909209477),
 ('WW(pv,conj,ev)', -11.30711137698963),
 ('ADJ(nom,basis,met-e,zonder-n,stan)', -13.113990940896546)]

In [20]:
tagger.analyze('onschatbare', taglevel = 3)

('onschatbaar',
 [('on', 'PREF_NEG'),
  ('schat', 'WW_VAR'),
  ('bar', 'SUF_WW_ADJ'),
  ('e', 'SUF_ADJ_E')],
 'ADJ(prenom,basis,met-e,stan)')

In [21]:
tagger.analyze('ingebed', taglevel = 3)

('inbidd',
 [('in', 'PTK'), ('ge', 'PREF_VD'), ('bed', 'WW_VAR(vd)')],
 'WW(vd,vrij,zonder)')

In [22]:
tagger.analyze("beïnvloedt", taglevel = 3)

('beinvloed',
 [('be', 'PREF_WW'), ('ïnvloed', 'WW_KERN'), ('t', 'SUF_WW(met-t)')],
 'WW(pv,tgw,met-t)')

In [23]:
tagger.analyze("zandwinningsschepen", taglevel = 3)

('zandwinningsschip',
 [('zand', 'N(soort,onz)'),
  ('winn', 'WW'),
  ('ing', 'SUF_WW_SUBST'),
  ('s', 'LI'),
  ('schep', 'N_VAR(soort,onz)'),
  ('en', 'SUF_N_N')],
 'N(soort,mv,basis)')

## Evaluate tagging and lemmatising on train data

This is ofcourse not a proper evaluation as we use the training data to evaluate the program. However, if we don't get good results here, something went wrong for sure. The numbers we get here might give a kind of upperbound for what we can ecpect from a real evaluation.

Evaluation of tagging and lemmatisation is not a trivial task. First, the evaluation data might have been tagged according to some othe tagging scheme. A lot of different decisions can be made about the tags and lemmata for many unclear cases. Moreover most data sets contain a large number of errors. Often these originate form the tagger/stemmer/lemmatiser that was used for the initial annotation and were overlooked in the manual correction phases.  

In [24]:
def load(lines):
    data = []
    sent = []
    lastsentnr = 1
    for line in lines:
        (sentnr,word,lemma,stem,tag,morphemes,stemsub) = line.split('\t')
        if sentnr != lastsentnr:
            if len(sent) > 0:
                data.append((sentnr,sent))
            sent = []
            lastsentnr = sentnr
        sent.append((word,lemma,tag))
    return data

In [25]:
import codecs

datafile = codecs.open(r"labeledmorph_dutch.csv", "r","utf-8")

morphdata = []
for line in datafile:
    if not line.startswith('-1'):
        morphdata.append(line)
    

In [26]:
testdata = load(morphdata)

### Evaluate the POS Tagging

In [27]:
def tag_evaluate(sents):
    correct = 0
    nr = 0

    for snr,sent in sents:
        ws = [w for (w,l,c) in sent]
        cs = [c for (w,l,c) in sent]
        pred_cs = tagger.tag_sent(ws,taglevel = 0)
        for i in range(len(ws)): 
            nr += 1
            if cs[i] == pred_cs[i]:
                correct += 1
            #else:
            #    c = cs[i].split('(')[0]
            #    pred_c = pred_cs[i].split('(')[0]
            #    if c != pred_c:
            #        print(snr,'\t',' '.join(ws))
            #        print(ws[i],cs[i],pred_cs[i])
            #        print()
        if nr%50 == 0:
            print(correct/nr,end='\r')
    return correct/nr

In [28]:
tag_evaluate(testdata)

0.9826681614349776

0.9826657254929992

### Evaluate lemmatisation

In [30]:
def lemma_evaluate(sents):
    correct = 0
    nr = 0

    for snr,sent in sents:
        ws = [w for (w,l,c) in sent]
        ls = [l for (w,l,c) in sent]
        pred_ls = [l for _,l,_ in tagger.tag_sent(ws,taglevel = 1)]
        for i in range(len(ws)): 
            nr += 1
            if ls[i] == pred_ls[i]:
                correct += 1
            #else:
            #    print(snr,'\t',' '.join(ws))
            #    print(ws[i],ls[i],pred_ls[i])
            #    print()
        if nr%50 == 0:
            print(correct/nr,end='\r')
    return correct/nr

In [31]:
lemma_evaluate(testdata)


0.9689237668161435

0.9689323337830714