## Inpsecting the annotations.jsonl file

In [1]:
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
import eli5
import json 
#
with open("../annotations.jsonl") as jsonl_file:
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
print("instances:\n{}".format(len(annot)))
keys = [key for key in annot[0].keys()]
print("\nall keys:\n{}".format(keys))
key_keys = ["text", "spans", "tokens"]
print("\nimportant keys:\n{}".format(key_keys))
print("\nexample text:\n{}".format(annot[0]["text"]))
n_examples = 5
print("\n{} example spans:".format(n_examples))
for span in annot[0]["spans"][:n_examples]:
    print("{}".format(span))
print("\n{} example tokens:".format(n_examples))
for token in annot[0]["tokens"][:n_examples]:
    print("{}".format(token))

instances:
140

all keys:
['text', 'meta', '_input_hash', '_task_hash', 'spans', 'tokens', '_view_id', 'answer', '_timestamp']

important keys:
['text', 'spans', 'tokens']

example text:
DORNBIRN In der Schulgasse in Dornbirn hat eine 71,93 Quadratmeter große Wohnung für einen Quadratmeterpreis von 5533,71 Euro den Besitzer gewechselt. Dieser beinhaltet auch einen Pkw-Abstellplatz. Käufer der Wohnung mit 9,86 Quadratmetern Terrasse ist die ValLiLean Beteiligungs- und Immobilienverwaltungs GmbH. Beim Verkäufer handelt es sich um die Karrenblick Projekt GmbH.  Der Kaufpreis liegt bei 398.040 Euro. Unterzeichnet wurde der Kaufvertrag am 18. September. Die Verbücherung datiert mit Oktober 2020.

5 example spans:
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'pattern': 2069086582, 'token_start': 0, 'token_end': 0, 'label': 'ORT'}
{'start': 16, 'end': 26, 'token_start': 3, 'token_end': 3, 'label': 'STRASSE'}
{'text': 'Dornbirn', 'start': 30, 'end': 38, 'pattern': 2069086582, 'token_start': 5, '

In [2]:
# def entityAndLabel_j_in_text_i(i, j, annots, verbose=False):
#     instance = annots[i]
#     text = instance["text"]
#     span = instance["spans"][j]
#     entity = text[span["start"]:span["end"]]
#     label = span["label"]
#     if verbose:
#         print("text:\n{}\n\nspan:\n{}\n\nentity:\n{}\n\nlabel:\n{}\n".format(text, span, entity, label))
#     return entity, label
#
# try:
#     print(entityAndLabel_j_in_text_i(50, 10, annot))
# except:
#     IndexError

('Juni 2020.', 'DATUM_VERBUECHERUNG')


In [3]:
def getLabel(dictList, idx):
    result = "O"
    for dict_i in dictList:
        idx_0, idx_1 = dict_i['start'], dict_i['end']
        if (idx_0<=idx) and (idx<=idx_1):
            result = dict_i["label"]
    return result 

myDictList = [
    {'start':0, 'end':3, 'label': 'ORT'},
    {'start':5, 'end':8, 'label': 'ORT2'},
    {'start':10, 'end':13, 'label': 'ORT3'}
]

for i in range(16):
    pos = i-1
    print("label for position {}:\t{}".format(pos, getLabel(myDictList, i-1)))

label for position -1:	O
label for position 0:	ORT
label for position 1:	ORT
label for position 2:	ORT
label for position 3:	ORT
label for position 4:	O
label for position 5:	ORT2
label for position 6:	ORT2
label for position 7:	ORT2
label for position 8:	ORT2
label for position 9:	O
label for position 10:	ORT3
label for position 11:	ORT3
label for position 12:	ORT3
label for position 13:	ORT3
label for position 14:	O


In [4]:
check = True
for j in range(len(annot)): # loop over instances
    a = annot[j]            # instance j
    spans = a['spans']      # list of annotation dicts
    toks = a['tokens']      # list of token dicts
    for i in range(len(toks)):                                 # loop over token dicts
        toks[i]['label'] = getLabel(spans, toks[i]['start'])   # assign label from span (if exists, otherwise "O")
        if toks[i]['label'] != "O":                            # if the token represents an entity ...
            if i==0:
                toks[i]['label'] = "B-"+toks[i]['label']       # ... and is the first in the text => "B-" + label
            else:                                              # not first token in text:
                if (toks[i]['label']==toks[i-1]['label'][2:]):
                    toks[i]['label'] = "I-"+toks[i]['label']   # > but same label as previous token => "I-" + label
                else:
                    toks[i]['label'] = "B-"+toks[i]['label']   # > but first token of an entity => "B-" + label
    annot[j]['tokens'] = toks

words_n = 3
for i in range(2):
    print("Token dictionaries for the last {} words of instance {}".format(words_n, i))
    ann = annot[i]
    for tok in ann["tokens"][:words_n]:
        print(tok)
# O => trivial class (no entity)
# B => Entity or leading token of an entity
# I => subsequent token of an entity

Token dictionaries for the last 3 words of instance 0
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'B-ORT'}
{'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}
Token dictionaries for the last 3 words of instance 1
{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'B-ORT'}
{'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'B-STRASSE'}


In [5]:
# ToDo
# > undertand the main stuff below
# > write a function to list tokens and labels next to each other
# > when done with everything, do the following
# >> document all this nicely
# >> acknowledge Vasco
# >> point out reading list and links

In [6]:
# now convert annotation tokens into list (sentences) of lists (tokens) format for sklearn_crfsuite.CRF
train_sents=[] 
for j in range(0,len(annot)):
    a = annot[j]['tokens']
    train_sentence = []
    for i in range(0,len(a)):
        if 'label' in a[i]: # only add element if this sample sentence has been labelled 
            token_element = (a[i]['text'],a[i]['label'])
            train_sentence.append(token_element)
    train_sents.append(train_sentence)

train_sents

[[('DORNBIRN', 'B-ORT'),
  ('In', 'O'),
  ('der', 'O'),
  ('Schulgasse', 'B-STRASSE'),
  ('in', 'O'),
  ('Dornbirn', 'B-ORT'),
  ('hat', 'O'),
  ('eine', 'O'),
  ('71,93', 'B-FLAECHE'),
  ('Quadratmeter', 'O'),
  ('große', 'O'),
  ('Wohnung', 'B-IMMO_TYP'),
  ('für', 'O'),
  ('einen', 'O'),
  ('Quadratmeterpreis', 'O'),
  ('von', 'O'),
  ('5533,71', 'B-GESAMTPREIS'),
  ('Euro', 'O'),
  ('den', 'O'),
  ('Besitzer', 'O'),
  ('gewechselt', 'O'),
  ('.', 'O'),
  ('Dieser', 'O'),
  ('beinhaltet', 'O'),
  ('auch', 'O'),
  ('einen', 'O'),
  ('Pkw-Abstellplatz', 'O'),
  ('.', 'O'),
  ('Käufer', 'O'),
  ('der', 'O'),
  ('Wohnung', 'O'),
  ('mit', 'O'),
  ('9,86', 'B-TERRASSENGROESSE'),
  ('Quadratmetern', 'O'),
  ('Terrasse', 'O'),
  ('ist', 'O'),
  ('die', 'O'),
  ('ValLiLean', 'B-KAEUFER'),
  ('Beteiligungs-', 'I-KAEUFER'),
  ('und', 'I-KAEUFER'),
  ('Immobilienverwaltungs', 'I-KAEUFER'),
  ('GmbH.', 'I-KAEUFER'),
  ('Beim', 'O'),
  ('Verkäufer', 'O'),
  ('handelt', 'O'),
  ('es', 'O'),
  ('s

In [7]:
def word2features(sent, i):
    word = sent[i][0]
    #postag = sent[i][1]

    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        #'postag': postag, # don't have PoS data
        #'postag[:2]': postag[:2],
    }
    if i > 0:
        word1 = sent[i-1][0]
        #postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            #'-1:postag': postag1,
            #'-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        #postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            #'+1:postag': postag1,
            #'+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
   # return [label for token, postag, label in sent]
    return [label for token, label in sent]

def sent2tokens(sent):
   # return [token for token, postag, label in sent]
    return [token for token, label in sent]

In [8]:
sent2features(train_sents[0])[0]

{'bias': 1.0,
 'word.lower()': 'dornbirn',
 'word[-3:]': 'IRN',
 'word[-2:]': 'RN',
 'word.isupper()': True,
 'word.istitle()': False,
 'word.isdigit()': False,
 'BOS': True,
 '+1:word.lower()': 'in',
 '+1:word.istitle()': True,
 '+1:word.isupper()': False}

In [9]:
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
print(y_train[0])
print(len(X_train))
print(len(y_train))

['B-ORT', 'O', 'O', 'B-STRASSE', 'O', 'B-ORT', 'O', 'O', 'B-FLAECHE', 'O', 'O', 'B-IMMO_TYP', 'O', 'O', 'O', 'O', 'B-GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'B-KAEUFER', 'I-KAEUFER', 'I-KAEUFER', 'I-KAEUFER', 'I-KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-VERKAEUFER', 'I-VERKAEUFER', 'I-VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'B-GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DATUM_VERTRAG', 'I-DATUM_VERTRAG', 'I-DATUM_VERTRAG', 'O', 'O', 'O', 'O', 'B-DATUM_VERBUECHERUNG', 'I-DATUM_VERBUECHERUNG']
140
140


In [10]:
%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train[:120], y_train[:120])

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 3.81 µs




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [11]:
eli5.show_weights(crf, top=30)

From \ To,O,B-DATUM_VERBUECHERUNG,I-DATUM_VERBUECHERUNG,B-DATUM_VERTRAG,I-DATUM_VERTRAG,B-FLAECHE,B-GESAMTPREIS,I-GESAMTPREIS,B-IMMO_TYP,I-IMMO_TYP,B-KAEUFER,I-KAEUFER,B-ORT,I-ORT,B-QMPREIS,B-STRASSE,I-STRASSE,B-TERRASSENGROESSE,B-VERKAEUFER,I-VERKAEUFER
O,2.552,1.723,-1.633,1.854,-1.971,2.067,1.294,-1.043,2.03,-3.055,1.855,-2.134,0.941,-1.208,1.395,2.04,-2.133,0.79,1.887,-2.342
B-DATUM_VERBUECHERUNG,-1.413,-0.561,4.746,0.0,0.0,0.0,0.0,0.0,0.0,-0.052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-DATUM_VERBUECHERUNG,-0.576,0.0,1.146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-DATUM_VERTRAG,-0.859,0.0,0.0,-0.525,4.573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-DATUM_VERTRAG,0.208,0.0,0.0,0.0,4.058,0.0,0.0,0.0,0.0,0.0,0.0,-0.046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.172
B-FLAECHE,1.616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,-0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-GESAMTPREIS,1.376,0.0,-0.0,0.0,0.0,0.0,0.0,2.235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-GESAMTPREIS,0.484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-IMMO_TYP,0.381,0.0,-0.247,0.0,-0.094,0.0,0.0,0.0,-0.163,3.224,0.0,-0.332,0.0,0.0,0.0,0.296,0.0,0.0,0.0,-0.341
I-IMMO_TYP,0.133,0.0,0.0,0.0,0.0,-0.091,0.0,0.0,0.0,4.096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.056,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+3.613,bias,,,,,,,,,,,,,,,,,,
+2.919,word.lower():baumgartenstraße,,,,,,,,,,,,,,,,,,
+2.622,-1:word.lower():gmbh.,,,,,,,,,,,,,,,,,,
+2.439,-1:word.lower():.,,,,,,,,,,,,,,,,,,
+2.121,word.lower():in,,,,,,,,,,,,,,,,,,
+2.081,word[-3:]:fer,,,,,,,,,,,,,,,,,,
+2.025,word.lower():quadratmeter,,,,,,,,,,,,,,,,,,
+2.015,"word.lower():153,29",,,,,,,,,,,,,,,,,,
+1.956,word.lower():fläche,,,,,,,,,,,,,,,,,,
+1.884,word[-2:]:um,,,,,,,,,,,,,,,,,,

Weight?,Feature
+3.613,bias
+2.919,word.lower():baumgartenstraße
+2.622,-1:word.lower():gmbh.
+2.439,-1:word.lower():.
+2.121,word.lower():in
+2.081,word[-3:]:fer
+2.025,word.lower():quadratmeter
+2.015,"word.lower():153,29"
+1.956,word.lower():fläche
+1.884,word[-2:]:um

Weight?,Feature
+2.207,-1:word.lower():mit
+1.684,-1:word.lower():im
+0.947,+1:word.lower():oktober
+0.941,+1:word.lower():september
+0.862,word.lower():juli
+0.862,word[-2:]:li
+0.862,word[-3:]:uli
+0.807,-1:word.lower():datiert
+0.721,word.lower():mit
+0.721,word[-3:]:mit

Weight?,Feature
+2.706,EOS
+1.712,-1:word.istitle()
+1.575,word[-3:]:ber
+1.182,-1:word.lower():juli
+1.046,word.lower():oktober
+1.007,+1:word.lower():2021.
+1.005,-1:word.lower():mit
+0.973,word[-2:]:er
+0.973,word.lower():september
+0.828,word.lower():2021.

Weight?,Feature
+4.651,-1:word.lower():am
+1.104,-1:word.lower():kaufvertrag
+1.026,word.lower():25.04.2019.
+0.900,+1:word.lower():juni
+0.886,+1:word.lower():19.
+0.860,+1:word.lower():november
+0.841,+1:word.lower():2019
+0.827,+1:word.lower():die
+0.757,+1:word.lower():23.
+0.715,+1:word.lower():25.

Weight?,Feature
+2.804,+1:word.lower():die
+1.375,+1:word.lower():februar
+1.251,+1:word.lower():dezember
+1.184,-1:word.lower():april
+1.094,word[-3:]:ber
+1.031,+1:word.lower():august
+0.976,+1:word.lower():unterzeichnet
+0.879,word.lower():19.
+0.852,word[-2:]:.
+0.852,word.lower():.

Weight?,Feature
+4.149,+1:word.lower():quadratmeter
+2.721,-1:word.lower():von
+2.573,word.isdigit()
+2.266,-1:word.lower():eine
+1.992,+1:word.lower():quadratmetern
+1.494,+1:word.lower():einen
+1.360,-1:word.lower():großen
+1.147,word.lower():553
+1.147,word[-3:]:553
+1.103,word[-2:]:53

Weight?,Feature
+4.523,+1:word.lower():euro
+2.749,-1:word.lower():um
+2.724,word[-2:]:00
+1.997,+1:word.lower():millionen
+1.978,word[-2:]:40
+1.917,-1:word.lower():bzw.
+1.876,-1:word.lower():für
+1.802,word[-2:]:9
+1.802,word[-3:]:9
+1.802,word.lower():9

Weight?,Feature
2.331,word.lower():millionen
1.624,word[-3:]:nen
1.421,word[-2:]:en
1.258,+1:word.lower():euro
0.515,"-1:word.lower():9,5"
0.515,+1:word.lower():erwarb
0.449,"-1:word.lower():9,7"
0.325,word.istitle()
0.227,+1:word.istitle()
0.194,"-1:word.lower():1,77"

Weight?,Feature
+3.512,word.lower():grundstücksfläche
+3.323,-1:word.lower():große
+2.884,word.lower():wohnung
+2.648,-1:word.lower():dazugehörige
+1.770,+1:word.lower():um
+1.722,-1:word.lower():ein
+1.667,+1:word.lower():als
+1.636,+1:word.lower():grundstück
+1.584,word.istitle()
+1.467,word[-3:]:aus

Weight?,Feature
+1.841,-1:word.lower():bodens
+1.790,+1:word.lower():verkäufer
+1.764,+1:word.lower():freihaltefläche-widmung
+1.681,-1:word.lower():einer
+1.661,word.lower():freihaltefläche-widmung
+1.505,+1:word.lower():mit
+1.449,word[-3:]:ung
+1.382,word[-2:]:ng
+1.374,-1:word.lower():gebäudes
+1.306,-1:word.lower():benachbarte

Weight?,Feature
+1.905,-1:word.lower():die
+1.461,-1:word.lower():sind
+1.238,word.lower():privatperson
+1.238,word[-3:]:son
+1.127,-1:word.lower():grundbuch
+1.055,+1:word.lower():schmelzenbach
+1.033,word[-3:]:ler
+1.004,+1:word.lower():breitenberg
+0.966,-1:word.lower():ist
+0.921,-1:word.lower():ein

Weight?,Feature
+3.356,+1:word.lower():beim
+1.904,-1:word.lower():privatpersonen
+1.283,+1:word.lower():das
+1.178,word[-2:]:H.
+1.178,word[-3:]:bH.
+1.157,-1:word.istitle()
+1.112,-1:word.lower():privatperson
+1.071,word.lower():gmbh.
+1.051,-1:word.lower():mehrere
+1.032,-1:word.isupper()

Weight?,Feature
+4.410,-1:word.lower():in
+3.638,BOS
+3.616,word.isupper()
+1.791,+1:word.lower():hat
+1.619,word.lower():dornbirn
+1.599,word[-2:]:nz
+1.569,+1:word.lower():(
+1.515,word.lower():bregenz
+1.510,word[-3:]:enz
+1.507,+1:word.lower():wurde

Weight?,Feature
1.793,word[-2:]::
1.793,word[-3:]::
1.793,word.lower()::
0.717,-1:word.istitle()
0.114,-1:word.lower():nenzing
-0.126,bias
-0.405,word.istitle()
-0.728,+1:word.istitle()

Weight?,Feature
+6.284,+1:word.lower():euro
+3.181,-1:word.lower():beträgt
+2.051,-1:word.lower():von
+1.806,-1:word.lower():bei
+1.174,word.isdigit()
+0.873,+1:word.istitle()
+0.600,word.lower():900
+0.570,"word.lower():2927,64"
+0.555,"word[-3:]:,64"
+0.555,word[-2:]:64

Weight?,Feature
+2.832,+1:word.lower():in
+2.437,-1:word.lower():der
+2.266,-1:word.lower():im
+1.900,word[-2:]:eg
+1.836,-1:word.lower():bereich
+1.741,word[-3:]:sse
+1.640,word[-3:]:weg
+1.630,word.lower():in
+1.472,+1:word.lower():hat
+1.461,word[-3:]:aße

Weight?,Feature
+2.360,+1:word.lower():in
+1.584,word.isdigit()
+1.522,word.lower():der
+1.507,-1:word.lower():der
+1.432,word[-3:]:der
+1.260,+1:word.lower():bleiche
+1.227,word.lower():bleiche
+1.090,word.istitle()
+0.903,word.lower():nollen
+0.891,word[-3:]:len

Weight?,Feature
+3.093,+1:word.lower():quadratmetern
+2.699,-1:word.lower():mit
+2.019,+1:word.lower():quadratmeter
+1.576,"word.lower():14,29"
+1.393,-1:word.lower():und
+1.281,"word.lower():137,49"
+1.205,word[-2:]:6
+1.205,word.lower():6
+1.205,word[-3:]:6
+1.124,-1:word.lower():einer

Weight?,Feature
+2.275,-1:word.lower():die
+1.671,-1:word.lower():durch
+1.632,word[-2:]:on
+1.302,word[-3:]:son
+1.302,word.lower():privatperson
+1.285,+1:word.lower():wohn-form.at
+1.105,+1:word.lower():um
+1.082,word.lower():privatpersonen
+0.971,+1:word.lower():greif
+0.954,-1:word.lower():um

Weight?,Feature
+2.614,+1:word.lower():käufer
+2.587,+1:word.lower():
+1.782,+1:word.lower():um
+1.454,-1:word.istitle()
+1.042,word.lower():gmbh.
+1.033,word[-2:]:H.
+1.033,word[-3:]:bH.
+0.984,+1:word.lower():gastronomie
+0.984,word[-3:]:.at
+0.984,word.lower():wohn-form.at


In [12]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORT',
 'B-STRASSE',
 'B-FLAECHE',
 'B-IMMO_TYP',
 'B-GESAMTPREIS',
 'B-TERRASSENGROESSE',
 'B-KAEUFER',
 'I-KAEUFER',
 'B-VERKAEUFER',
 'I-VERKAEUFER',
 'B-DATUM_VERTRAG',
 'I-DATUM_VERTRAG',
 'B-DATUM_VERBUECHERUNG',
 'I-DATUM_VERBUECHERUNG',
 'B-QMPREIS',
 'I-IMMO_TYP',
 'I-STRASSE',
 'I-GESAMTPREIS',
 'I-ORT']

In [13]:
X_test = X_train[120:]
y_test = y_train[120:]

In [14]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

  _warn_prf(


0.9002400202334798

In [15]:
# group B and I results
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

                       precision    recall  f1-score   support

B-DATUM_VERBUECHERUNG      1.000     1.000     1.000        15
I-DATUM_VERBUECHERUNG      1.000     1.000     1.000        15
      B-DATUM_VERTRAG      0.933     0.933     0.933        15
      I-DATUM_VERTRAG      0.931     0.964     0.947        28
            B-FLAECHE      0.944     1.000     0.971        17
        B-GESAMTPREIS      1.000     1.000     1.000        13
        I-GESAMTPREIS      0.000     0.000     0.000         0
           B-IMMO_TYP      0.895     0.773     0.829        22
           I-IMMO_TYP      0.000     0.000     0.000         1
            B-KAEUFER      0.824     0.933     0.875        15
            I-KAEUFER      0.778     1.000     0.875        21
                B-ORT      0.861     0.912     0.886        34
                I-ORT      0.000     0.000     0.000         1
            B-QMPREIS      1.000     1.000     1.000        11
            B-STRASSE      0.917     0.786     0.846  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
