# (1) TOWN Gazetteer + GESAMTPREIS Regex

In [16]:
import json
import pandas as pd
import re
from collections import Counter 
# Town names used to build the 'ORT' gazetteer patterns in the next cell.
# Because names= is passed without header=, pandas reads the file as header-less,
# so the first line of the CSV is treated as a town name -- TODO confirm the file has no header row.
df = pd.read_csv('townsVoralberg.csv',names=['towns'])  

In [5]:
# Build the pattern file: one case-insensitive 'ORT' token pattern per town,
# followed by a single regex pattern for 'GESAMTPREIS'.
gemli = [
    {'label': 'ORT', 'pattern': [{'lower': df['towns'][i].lower()}]}
    for i in df.index
]
gemli.append({"label": "GESAMTPREIS", "pattern":[{"TEXT": {"REGEX":"^(\\d\\.?)?(\\d{3}\\.?\\d{3})?$"}}]})

# JSONL output: one JSON object per line
with open('pattern_ort2.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in gemli)

# (2) Clean-up Annotation Inconsistencies

This part of the code rewrites the annotations read from annotations.jsonl so that a trailing non-alphanumeric character is removed, but only from tokens that are the last in a chain of (possibly many) consecutive tokens carrying the same label. In other words, a non-alphanumeric character is left unchanged if it is not the last character of its token, or if it is the last character of a token that is followed by another token with the same label (named entity). For example, if the tokens "ABC GmbH." are labelled, we want the label to cover only "ABC GmbH" (i.e. without the trailing ".").

In [44]:
# Entity labels to inspect. The diagnostic cells below loop over this list in order,
# so the i-th printed Counter belongs to the i-th label here.
labels = ['ORT','STRASSE','FLAECHE','IMMO_TYP','GESAMTPREIS','TERRASSENGROESSE','KAEUFER','VERKAEUFER','DATUM_VERTRAG','DATUM_VERBUECHERUNG','QMPREIS']

In [193]:
# Load the annotation file: every line of the JSONL file is one annotated example.
with open('annotations.jsonl') as jsonl_file:
    lines = list(jsonl_file)
annot = list(map(json.loads, lines))

Now use the labels stored in each example's `spans` list (`annot[k]['spans']`) to add a label to the token-level data in `annot[k]['tokens']`.

In [195]:
def getSpansLabel(listDict,idxEnd):
    """Return the label of the span that contains a token, or 'O' if no span does.

    Parameters
    ----------
    listDict : list of dict
        Span annotations of one example (``annot[k]['spans']``). Each dict needs the
        keys 'start', 'end' and 'label'. The list must be ordered ascending by
        'start': the scan stops at the first span that begins at or after ``idxEnd``.
    idxEnd : int
        End character offset of the token (``annot[k]['tokens'][i]['end']``).
        Token offsets are end-exclusive (e.g. 'DORNBIRN' is start 0, end 8);
        span offsets are assumed to follow the same convention -- TODO confirm.

    Returns
    -------
    str
        The 'label' of the matching span, or 'O' when the token lies outside every
        span (this includes an empty ``listDict``).
    """
    for span in listDict:
        if idxEnd <= span['start']:
            # The token ends before -- or exactly where -- this span begins, so it is
            # not part of it; all later spans start even further to the right.
            return 'O'
        if idxEnd <= span['end']:
            return span['label']
        # otherwise the token ends after this span: try the next one
    # ran past the last span (or there were no spans at all)
    return 'O'

Apply labels from 'spans' dictionary to tokens for subsequent processing (e.g. in CRF model)

In [196]:
# Walk over every example and copy the span labels down to token level
# (uses getSpansLabel defined above).
for example in annot:
    spans = example['spans']
    if spans == []:
        # nothing was annotated in this example, so its tokens are left without a 'label' key
        continue

    # number of tokens covered by each span (token_start/token_end are both inclusive)
    for span in spans:
        span['noWords'] = span['token_end'] - span['token_start'] + 1

    # look up each token by its end offset and attach the label found
    tokens = example['tokens']
    for token in tokens:
        token['label'] = getSpansLabel(spans, token['end'])
    example['tokens'] = tokens

Next cell shows Counter of annotations that shall be changed

In [197]:
# For every label, count the (lower-cased) tokens that close a run of that label
# and still end in a non-alphanumeric character. One Counter is printed per label.
for wanted in labels:
    out = []
    for example in annot:
        if example['spans'] == []:  # example without annotations: nothing to report
            continue
        toks = example['tokens']
        # every token but the last: it closes a run when its right-hand neighbour has another label
        for cur, nxt in zip(toks, toks[1:]):
            if cur['label'] != wanted:
                continue
            lowered = cur['text'].lower()
            if re.match(r'\W', lowered[-1:]) and cur['label'] != nxt['label']:
                out.append(lowered)
        # the last token of the example always closes its run
        tail = toks[-1]
        if tail['label'] == wanted:
            lowered = tail['text'].lower()
            if re.match(r'\W', lowered[-1:]):
                out.append(lowered)
    out_count = Counter(out)
    print(out_count)

Counter()
Counter()
Counter()
Counter()
Counter({'1.': 1})
Counter()
Counter({'gmbh.': 22, 'mbh.': 1})
Counter({'gmbh.': 20, '..': 1})
Counter({'2020.': 3, '2019.': 2, '25.04.2019.': 1, '2012.': 1})
Counter({'2021.': 55, '2020.': 42, '2022.': 2, '2019.': 1})
Counter()


And now change the annotations programmatically

In [199]:
def getSpansIndex(listDict,idxEnd):
    """Return the position in ``listDict`` of the span containing a character offset.

    listDict -- span dicts of one example (``annot[k]['spans']``), each with 'start'
                and 'end'; expected in ascending order, because the scan gives up at
                the first span that starts after ``idxEnd``.
    idxEnd   -- character offset to look up; both span bounds count as inside.

    Returns the list index of the first matching span, or None when the offset is
    not inside any span (also for an empty list).
    """
    for pos, span in enumerate(listDict):
        lo, hi = span['start'], span['end']
        if idxEnd < lo:
            # offset lies before this span, hence before all following ones as well
            return None
        if idxEnd <= hi:
            return pos
    return None

In [200]:
# Strip the trailing non-alphanumeric character from the last token of every labelled run
# (i.e. a labelled token whose next token carries a different label).
#  - one-character token: it is relabelled 'O' and the owning span is shrunk (or deleted);
#  - longer token: the last character is split off into a new 'O' token and the span end is reduced by 1.
# Split-off tokens are APPENDED to the end of the token list; their position in the text is
# recorded through 'id' only, so the list is no longer in text order afterwards.
# NOTE(review): consumers that need text order have to sort the tokens by 'id' -- confirm downstream.
# NOTE(review): 'token_start'/'token_end' of spans located after a split are not shifted, although the
#               token ids after the split are -- confirm whether anything relies on them.
# NOTE(review): only ONE trailing character is removed per run (a token '..' still leaves a labelled '.').
for k in range(0,len(annot)):
    j=len(annot[k]['tokens']) # number of ORIGINAL tokens; they keep list positions 0..j-1
    nAdded=0 # number of split-off tokens appended so far (they sit at positions j, j+1, ...)
    i=0
    if annot[k]['spans']!=[]: # are there annot for this example?
        # the bound grows with nAdded so that the last original token (position j-1) is also handled
        # here as soon as something was appended behind it; appended tokens are 'O' and are skipped
        while i < j-1+nAdded:
            if bool(re.match(r'\W',annot[k]['tokens'][i]['text'].lower()[-1:])) and annot[k]['tokens'][i]['label'] != annot[k]['tokens'][i+1]['label'] and annot[k]['tokens'][i]['label'] !='O':
                # NOTE: 'O' labels were only added programmatically above (and not added to 'spans') so we don't want to track them here
                auxSpansRow = getSpansIndex(annot[k]['spans'],annot[k]['tokens'][i]['end'])
                if len(annot[k]['tokens'][i]['text'])==1: # if token only has 1 char then replace label entirely by "O"
                    annot[k]['tokens'][i]['label']="O"
                    # remove one word from annot[]['spans']:
                    if annot[k]['spans'][auxSpansRow]['end']==annot[k]['spans'][auxSpansRow]['start']+1: #if class label only has this char
                        del annot[k]['spans'][auxSpansRow] # can remove this entry entirely from ['spans'] list
                    else:
                        # reduce 'end' by 1 char and also 'token_end' and noWords by 1
                        # NOTE(review): only the token's own character is removed; whitespace in front of it stays inside the span
                        annot[k]['spans'][auxSpansRow]['end'] = annot[k]['spans'][auxSpansRow]['end']-1
                        annot[k]['spans'][auxSpansRow]['token_end'] = annot[k]['spans'][auxSpansRow]['token_end']-1
                        annot[k]['spans'][auxSpansRow]['noWords'] = annot[k]['spans'][auxSpansRow]['noWords']-1
                else: # token has more than 1 character
                    # now add a separate token for this item
                    annot[k]['tokens'].append({'text': annot[k]['tokens'][i]['text'][-1:], 
                                               'start': annot[k]['tokens'][i]['end']-1, 
                                               'end': annot[k]['tokens'][i]['end'], 
                                               'id': annot[k]['tokens'][i]['id']+1, 
                                               'ws': True, 'label': 'O'})
                    nAdded += 1 
                    annot[k]['tokens'][i]['end']=annot[k]['tokens'][i]['end']-1 #update end index for this token
                    annot[k]['tokens'][i]['text']=annot[k]['tokens'][i]['text'][:-1] #update text for this token
                    # and adjust id's up by 1 on the ORIGINAL tokens that follow in the text (positions i+1..j-1).
                    # Tokens appended by earlier splits (positions >= j) precede this one in the text, so their
                    # ids must stay as they are; bumping them too would create duplicate / missing ids.
                    for m in range(i+1,j): 
                        annot[k]['tokens'][m]['id']=annot[k]['tokens'][m]['id']+1                         
                    # in this case only need to reduce end index of ['spans'] by 1; the number of labelled tokens is unchanged
                    annot[k]['spans'][auxSpansRow]['end'] = annot[k]['spans'][auxSpansRow]['end']-1 
            i += 1
    # now add for i=j-1 (i.e. last token in example; by definition this is last token in label)
        lastRow=j-1+nAdded # this is the position of the last row of 'tokens' list
        # if anything was appended, lastRow is a split-off 'O' token and this branch is skipped
        # (the last original token was then already handled inside the loop above)
        if bool(re.match(r'\W',annot[k]['tokens'][lastRow]['text'].lower()[-1:])) and annot[k]['tokens'][lastRow]['label'] !='O':
            # NOTE: 'O' labels were only added programmatically above (and not added to 'spans') so we don't want to track them here
            # look the span up by the token's END offset, i.e. the same way its label was assigned
            auxSpansRow = getSpansIndex(annot[k]['spans'],annot[k]['tokens'][lastRow]['end'])
            if len(annot[k]['tokens'][lastRow]['text'])==1: # if token only has 1 char then replace label entirely by "O"
                annot[k]['tokens'][lastRow]['label']="O"
                # remove one word from annot[]['spans']:
                if annot[k]['spans'][auxSpansRow]['end']==annot[k]['spans'][auxSpansRow]['start']+1: #if class label only has this char
                    del annot[k]['spans'][auxSpansRow] # can remove this entry entirely from ['spans'] list
                else:
                    # reduce 'end' by 1 char and also 'token_end' and noWords by 1 
                    annot[k]['spans'][auxSpansRow]['end'] = annot[k]['spans'][auxSpansRow]['end']-1
                    annot[k]['spans'][auxSpansRow]['token_end'] = annot[k]['spans'][auxSpansRow]['token_end']-1
                    annot[k]['spans'][auxSpansRow]['noWords'] = annot[k]['spans'][auxSpansRow]['noWords']-1
            else: # token has more than 1 character
                # now add a separate token for this item (it is the last token in the text, so no ids need shifting)
                annot[k]['tokens'].append({'text': annot[k]['tokens'][lastRow]['text'][-1:], 
                                           'start': annot[k]['tokens'][lastRow]['end']-1, 
                                           'end': annot[k]['tokens'][lastRow]['end'], 
                                           'id': annot[k]['tokens'][lastRow]['id']+1, 
                                           'ws': True, 'label': 'O'}) 
                annot[k]['tokens'][lastRow]['end']=annot[k]['tokens'][lastRow]['end']-1 #update end index for this token
                annot[k]['tokens'][lastRow]['text']=annot[k]['tokens'][lastRow]['text'][:-1] #update text for this token
                # in this case only need to reduce end index of ['spans'] by 1; the number of labelled tokens is unchanged
                annot[k]['spans'][auxSpansRow]['end'] = annot[k]['spans'][auxSpansRow]['end']-1 

And next cell shows effect of this change on the Counter we ran above

In [201]:
# Same diagnostic as before the clean-up: per label, count the lower-cased tokens that end
# a run of that label with a non-alphanumeric last character. Prints one Counter per label.
for label in labels:
    out = []
    for example in annot:
        if example['spans'] != []:  # only examples that carry annotations
            toks = example['tokens']
            final = len(toks) - 1
            # tokens 0 .. final-1: end of a run when the following token is labelled differently
            for pos in range(final):
                tok = toks[pos]
                if tok['label'] == label:
                    text = tok['text'].lower()
                    if re.match(r'\W', text[-1:]) and tok['label'] != toks[pos + 1]['label']:
                        out.append(text)
            # token `final`: last token of the example, hence always the end of its run
            closing = toks[final]
            if closing['label'] == label:
                text = closing['text'].lower()
                if re.match(r'\W', text[-1:]):
                    out.append(text)
    out_count = Counter(out)
    print(out_count)

Counter()
Counter()
Counter()
Counter()
Counter()
Counter()
Counter()
Counter({'.': 1})
Counter()
Counter()
Counter()


In [27]:
# print(annot[0]['tokens']) # DON'T RUN - BEFORE

[{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'ORT'}, {'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}, {'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}, {'text': 'Schulgasse', 'start': 16, 'end': 26, 'id': 3, 'ws': True, 'label': 'STRASSE'}, {'text': 'in', 'start': 27, 'end': 29, 'id': 4, 'ws': True, 'label': 'O'}, {'text': 'Dornbirn', 'start': 30, 'end': 38, 'id': 5, 'ws': True, 'label': 'ORT'}, {'text': 'hat', 'start': 39, 'end': 42, 'id': 6, 'ws': True, 'label': 'O'}, {'text': 'eine', 'start': 43, 'end': 47, 'id': 7, 'ws': True, 'label': 'O'}, {'text': '71,93', 'start': 48, 'end': 53, 'id': 8, 'ws': True, 'label': 'FLAECHE'}, {'text': 'Quadratmeter', 'start': 54, 'end': 66, 'id': 9, 'ws': True, 'label': 'O'}, {'text': 'große', 'start': 67, 'end': 72, 'id': 10, 'ws': True, 'label': 'O'}, {'text': 'Wohnung', 'start': 73, 'end': 80, 'id': 11, 'ws': True, 'label': 'IMMO_TYP'}, {'text': 'für', 'start': 81, 'en

In [202]:
# token list of the first example, printed after the clean-up cell has modified annot in place
print(annot[0]['tokens']) # AFTER

[{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'ORT'}, {'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}, {'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}, {'text': 'Schulgasse', 'start': 16, 'end': 26, 'id': 3, 'ws': True, 'label': 'STRASSE'}, {'text': 'in', 'start': 27, 'end': 29, 'id': 4, 'ws': True, 'label': 'O'}, {'text': 'Dornbirn', 'start': 30, 'end': 38, 'id': 5, 'ws': True, 'label': 'ORT'}, {'text': 'hat', 'start': 39, 'end': 42, 'id': 6, 'ws': True, 'label': 'O'}, {'text': 'eine', 'start': 43, 'end': 47, 'id': 7, 'ws': True, 'label': 'O'}, {'text': '71,93', 'start': 48, 'end': 53, 'id': 8, 'ws': True, 'label': 'FLAECHE'}, {'text': 'Quadratmeter', 'start': 54, 'end': 66, 'id': 9, 'ws': True, 'label': 'O'}, {'text': 'große', 'start': 67, 'end': 72, 'id': 10, 'ws': True, 'label': 'O'}, {'text': 'Wohnung', 'start': 73, 'end': 80, 'id': 11, 'ws': True, 'label': 'IMMO_TYP'}, {'text': 'für', 'start': 81, 'en

In [203]:
# and another example: token list (text, offsets, id, label) of the second entry of annot
print(annot[1]['tokens'])

[{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'ORT'}, {'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}, {'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'STRASSE'}, {'text': 'in', 'start': 27, 'end': 29, 'id': 3, 'ws': True, 'label': 'O'}, {'text': 'Feldkirch', 'start': 30, 'end': 39, 'id': 4, 'ws': True, 'label': 'ORT'}, {'text': 'hat', 'start': 40, 'end': 43, 'id': 5, 'ws': True, 'label': 'O'}, {'text': 'eine', 'start': 44, 'end': 48, 'id': 6, 'ws': True, 'label': 'O'}, {'text': '100,67', 'start': 49, 'end': 55, 'id': 7, 'ws': True, 'label': 'FLAECHE'}, {'text': 'Quadratmeter', 'start': 56, 'end': 68, 'id': 8, 'ws': True, 'label': 'O'}, {'text': 'große', 'start': 69, 'end': 74, 'id': 9, 'ws': True, 'label': 'O'}, {'text': 'Wohnung', 'start': 75, 'end': 82, 'id': 10, 'ws': True, 'label': 'IMMO_TYP'}, {'text': 'für', 'start': 83, 'end': 86, 'id': 11, 'ws': True, 'label': 'O'}, {'text': 'einen', 'start'

In [204]:
# check if id's have been adjusted in line with items that were added to annot[]['tokens']:
# the ids of every example must be exactly 0..n-1, each used once. Comparing only the largest
# id with the list length would let duplicated or missing ids slip through.
# Prints the index of every example that violates this; no output means all examples are fine.
for i in range(0,len(annot)):
    ids = sorted(tok['id'] for tok in annot[i]['tokens'])
    if ids != list(range(len(ids))):
        print(i)

In [205]:
# Persist the cleaned annotations as JSONL (one example per line).
with open('annotations2.jsonl', 'w') as outfile:
    outfile.writelines(json.dumps(entry) + '\n' for entry in annot)

In [133]:
# CHECK: the file just written can be parsed again, line by line ... OK
with open('annotations2.jsonl') as jsonl_file:
    lines = list(jsonl_file)
annot2 = []
for line in lines:
    annot2.append(json.loads(line))