## Inpsecting the annotations.jsonl file

add hyperparameter search!

In [1]:
import sklearn
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
import scipy.stats
import eli5
import json

with open("../annotations.jsonl") as jsonl_file:
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
print("instances:\n{}".format(len(annot)))
keys = [key for key in annot[0].keys()]
print("\nall keys:\n{}".format(keys))
key_keys = ["text", "spans", "tokens"]
print("\nimportant keys:\n{}".format(key_keys))
print("\nexample text:\n{}".format(annot[0]["text"]))
n_examples = 5
print("\n{} example spans:".format(n_examples))
for span in annot[0]["spans"][:n_examples]:
    print("{}".format(span))
print("\n{} example tokens:".format(n_examples))
for token in annot[0]["tokens"][:n_examples]:
    print("{}".format(token))

instances:
140

all keys:
['text', 'meta', '_input_hash', '_task_hash', 'spans', 'tokens', '_view_id', 'answer', '_timestamp']

important keys:
['text', 'spans', 'tokens']

example text:
DORNBIRN In der Schulgasse in Dornbirn hat eine 71,93 Quadratmeter große Wohnung für einen Quadratmeterpreis von 5533,71 Euro den Besitzer gewechselt. Dieser beinhaltet auch einen Pkw-Abstellplatz. Käufer der Wohnung mit 9,86 Quadratmetern Terrasse ist die ValLiLean Beteiligungs- und Immobilienverwaltungs GmbH. Beim Verkäufer handelt es sich um die Karrenblick Projekt GmbH.  Der Kaufpreis liegt bei 398.040 Euro. Unterzeichnet wurde der Kaufvertrag am 18. September. Die Verbücherung datiert mit Oktober 2020.

5 example spans:
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'pattern': 2069086582, 'token_start': 0, 'token_end': 0, 'label': 'ORT'}
{'start': 16, 'end': 26, 'token_start': 3, 'token_end': 3, 'label': 'STRASSE'}
{'text': 'Dornbirn', 'start': 30, 'end': 38, 'pattern': 2069086582, 'token_start': 5, '

In [2]:
def getLabel(dictList, idx):
    result = "O"
    for dict_i in dictList:
        idx_0, idx_1 = dict_i['start'], dict_i['end']
        if (idx_0<=idx) and (idx<=idx_1):
            result = dict_i["label"]
    return result 

myDictList = [
    {'start':0, 'end':3, 'label': 'ORT'},
    {'start':5, 'end':8, 'label': 'ORT2'},
    {'start':10, 'end':13, 'label': 'ORT3'}
]

for i in range(16):
    pos = i-1
    print("label for position {}:\t{}".format(pos, getLabel(myDictList, i-1)))

label for position -1:	O
label for position 0:	ORT
label for position 1:	ORT
label for position 2:	ORT
label for position 3:	ORT
label for position 4:	O
label for position 5:	ORT2
label for position 6:	ORT2
label for position 7:	ORT2
label for position 8:	ORT2
label for position 9:	O
label for position 10:	ORT3
label for position 11:	ORT3
label for position 12:	ORT3
label for position 13:	ORT3
label for position 14:	O


In [3]:
check = True
for j in range(len(annot)): # loop over instances
    a = annot[j]            # instance j
    spans = a['spans']      # list of annotation dicts
    toks = a['tokens']      # list of token dicts
    for i in range(len(toks)):                                 # loop over token dicts
        toks[i]['label'] = getLabel(spans, toks[i]['start'])   # assign label from span (if exists, otherwise "O")
        if toks[i]['label'] != "O":                            # if the token represents an entity ...
            if i==0:
                toks[i]['label'] = "B-"+toks[i]['label']       # ... and is the first in the text => "B-" + label
            else:                                              # not first token in text:
                if (toks[i]['label']==toks[i-1]['label'][2:]):
                    toks[i]['label'] = "I-"+toks[i]['label']   # > but same label as previous token => "I-" + label
                else:
                    toks[i]['label'] = "B-"+toks[i]['label']   # > but first token of an entity => "B-" + label
    annot[j]['tokens'] = toks

words_n = 3
for i in range(2):
    print("Token dictionaries for the last {} words of instance {}".format(words_n, i))
    ann = annot[i]
    for tok in ann["tokens"][:words_n]:
        print(tok)
# O => trivial class (no entity)
# B => Entity or leading token of an entity
# I => subsequent token of an entity

Token dictionaries for the last 3 words of instance 0
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'B-ORT'}
{'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}
Token dictionaries for the last 3 words of instance 1
{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'B-ORT'}
{'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'B-STRASSE'}


In [4]:
# use updated list of instances ["token"] and added ["label"] fields to create a list where each instance is ...
# ... represented by a list of (token, label) pairs
sents=[] 
for annot_i in annot:                  # loop over instances
    toks = annot_i['tokens']           # get tokens list for instance i
    train_sentence = []
    for tok in toks:                   # loop over token dicts
        if 'label' in tok:             # only if the current token has been labelled, ...
            token_element = (tok['text'], tok['label']) # ... create a "text", "label" pair for this token ...
            train_sentence.append(token_element)        # ... and append it to the list
    sents.append(train_sentence) # append the list for that instances to the list for all instances / sentences

# list of lists of pairs (sets): outer list contains instances and inner list contains (token, label) pairs
sents

[[('DORNBIRN', 'B-ORT'),
  ('In', 'O'),
  ('der', 'O'),
  ('Schulgasse', 'B-STRASSE'),
  ('in', 'O'),
  ('Dornbirn', 'B-ORT'),
  ('hat', 'O'),
  ('eine', 'O'),
  ('71,93', 'B-FLAECHE'),
  ('Quadratmeter', 'O'),
  ('große', 'O'),
  ('Wohnung', 'B-IMMO_TYP'),
  ('für', 'O'),
  ('einen', 'O'),
  ('Quadratmeterpreis', 'O'),
  ('von', 'O'),
  ('5533,71', 'B-GESAMTPREIS'),
  ('Euro', 'O'),
  ('den', 'O'),
  ('Besitzer', 'O'),
  ('gewechselt', 'O'),
  ('.', 'O'),
  ('Dieser', 'O'),
  ('beinhaltet', 'O'),
  ('auch', 'O'),
  ('einen', 'O'),
  ('Pkw-Abstellplatz', 'O'),
  ('.', 'O'),
  ('Käufer', 'O'),
  ('der', 'O'),
  ('Wohnung', 'O'),
  ('mit', 'O'),
  ('9,86', 'B-TERRASSENGROESSE'),
  ('Quadratmetern', 'O'),
  ('Terrasse', 'O'),
  ('ist', 'O'),
  ('die', 'O'),
  ('ValLiLean', 'B-KAEUFER'),
  ('Beteiligungs-', 'I-KAEUFER'),
  ('und', 'I-KAEUFER'),
  ('Immobilienverwaltungs', 'I-KAEUFER'),
  ('GmbH.', 'I-KAEUFER'),
  ('Beim', 'O'),
  ('Verkäufer', 'O'),
  ('handelt', 'O'),
  ('es', 'O'),
  ('s

In [5]:
def word2features(sent, i): # receive instance and the index i for the i-th token of the instance
    word = sent[i][0]       # i-th token ("sent" is a list of (token, label) pairs; the label is not used, here)
    
    # dictionary of features for the i-th token
    features = {
        'bias': 1.0,                      # a different bias could be computed here (static 1 is useless)
        'word.lower()': word.lower(),     # token in lowercase
        'word[-3:]': word[-3:],           # last 3 letters
        'word[-2:]': word[-2:],           # last 2 letters
        'word.isupper()': word.isupper(), # True if uppercase else False
        'word.istitle()': word.istitle(), # True if title else False, see...
        # ... https://www.w3schools.com/python/trypython.asp?filename=demo_ref_string_istitle2
        'word.isdigit()': word.isdigit()  # True if digit else False
    }
    if i > 0:
        # add features for the previous token
        word_minus_1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word_minus_1.lower(),
            '-1:word.istitle()': word_minus_1.istitle(),
            '-1:word.isupper()': word_minus_1.isupper(),
        })
    else: # the beginning of the sequence
        features['BOS'] = True

    if i < len(sent)-1:
        # add features for the next token
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else: # the end of the sequence
        features['EOS'] = True

    return features


def sent2features(sent):                                      # pass list of (token, label) pairs
    return [word2features(sent, i) for i in range(len(sent))] # pass list of (token, label) pairs and the index ...
    # ... for a position in that list => obtain dict with token features => return list of such feature dicts

# obtain list of dicts with features for the corresponding tokens
sent2features(sents[0])[:2] 

[{'bias': 1.0,
  'word.lower()': 'dornbirn',
  'word[-3:]': 'IRN',
  'word[-2:]': 'RN',
  'word.isupper()': True,
  'word.istitle()': False,
  'word.isdigit()': False,
  'BOS': True,
  '+1:word.lower()': 'in',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False},
 {'bias': 1.0,
  'word.lower()': 'in',
  'word[-3:]': 'In',
  'word[-2:]': 'In',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  '-1:word.lower()': 'dornbirn',
  '-1:word.istitle()': False,
  '-1:word.isupper()': True,
  '+1:word.lower()': 'der',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False}]

In [6]:
def sent2labels(sent):   # pass one instance (=list of (token, label) pairs) => return list of labels
    return [label for token, label in sent]

X = [sent2features(s) for s in sents] # list of instances, each a list of tokens represented by a feature dict ...
                                      # ... => list of dictionaries
y = [sent2labels(s) for s in sents]   # list of instances, each a list of tokens represented by a label
print(len(X))
[print(X[0][i]) for i in range(4)]
print("")
print(len(y))
[print(y[0][i]) for i in range(4)]
print("")

140
{'bias': 1.0, 'word.lower()': 'dornbirn', 'word[-3:]': 'IRN', 'word[-2:]': 'RN', 'word.isupper()': True, 'word.istitle()': False, 'word.isdigit()': False, 'BOS': True, '+1:word.lower()': 'in', '+1:word.istitle()': True, '+1:word.isupper()': False}
{'bias': 1.0, 'word.lower()': 'in', 'word[-3:]': 'In', 'word[-2:]': 'In', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, '-1:word.lower()': 'dornbirn', '-1:word.istitle()': False, '-1:word.isupper()': True, '+1:word.lower()': 'der', '+1:word.istitle()': False, '+1:word.isupper()': False}
{'bias': 1.0, 'word.lower()': 'der', 'word[-3:]': 'der', 'word[-2:]': 'er', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, '-1:word.lower()': 'in', '-1:word.istitle()': True, '-1:word.isupper()': False, '+1:word.lower()': 'schulgasse', '+1:word.istitle()': True, '+1:word.isupper()': False}
{'bias': 1.0, 'word.lower()': 'schulgasse', 'word[-3:]': 'sse', 'word[-2:]': 'se', 'word.isupper()': False, 'word

In [7]:
train_ratio = 0.75
train_test_split = round(0.75*len(X) - 0.5) # -0.5 => floor
train_test_split
X_train = X[:train_test_split]
y_train = y[:train_test_split]
X_test = X[train_test_split:]
y_test = y[train_test_split:]
len(X_train), len(y_train), len(X_test), len(y_test)

(104, 104, 36, 36)

In [8]:
%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 4.05 µs




CRF(algorithm='lbfgs', all_possible_transitions=True, c1=0.1, c2=0.1,
    keep_tempfiles=None, max_iterations=100)

In [9]:
eli5.show_weights(crf, top=30)

From \ To,O,B-DATUM_VERBUECHERUNG,I-DATUM_VERBUECHERUNG,B-DATUM_VERTRAG,I-DATUM_VERTRAG,B-FLAECHE,B-GESAMTPREIS,I-GESAMTPREIS,B-IMMO_TYP,I-IMMO_TYP,B-KAEUFER,I-KAEUFER,B-ORT,I-ORT,B-QMPREIS,B-STRASSE,I-STRASSE,B-TERRASSENGROESSE,B-VERKAEUFER,I-VERKAEUFER
O,2.494,1.673,-1.694,1.814,-2.009,2.083,1.403,-1.036,2.027,-3.114,1.731,-2.081,0.645,-1.113,1.403,1.961,-2.218,0.753,1.89,-2.399
B-DATUM_VERBUECHERUNG,-1.524,-0.465,4.77,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.001
I-DATUM_VERBUECHERUNG,-0.591,0.0,1.132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-DATUM_VERTRAG,-0.824,0.0,0.0,-0.532,4.661,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
I-DATUM_VERTRAG,0.147,0.0,0.0,0.0,3.853,0.0,0.0,0.0,0.0,0.0,0.0,-0.031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.17
B-FLAECHE,1.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-GESAMTPREIS,0.972,0.0,0.0,0.0,-0.024,0.0,0.0,2.234,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.076
I-GESAMTPREIS,0.417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-IMMO_TYP,0.288,0.0,-0.237,0.0,-0.145,0.0,0.0,0.0,-0.237,3.264,0.0,-0.347,0.0,0.0,0.0,0.313,-0.032,0.0,0.0,-0.425
I-IMMO_TYP,-0.053,0.0,0.0,0.0,0.0,-0.133,0.0,0.0,-0.013,3.83,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.025,0.0,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0,Unnamed: 17_level_0,Unnamed: 18_level_0,Unnamed: 19_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4,Unnamed: 17_level_4,Unnamed: 18_level_4,Unnamed: 19_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6,Unnamed: 17_level_6,Unnamed: 18_level_6,Unnamed: 19_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7,Unnamed: 17_level_7,Unnamed: 18_level_7,Unnamed: 19_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8,Unnamed: 17_level_8,Unnamed: 18_level_8,Unnamed: 19_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9,Unnamed: 17_level_9,Unnamed: 18_level_9,Unnamed: 19_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10,Unnamed: 17_level_10,Unnamed: 18_level_10,Unnamed: 19_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11,Unnamed: 17_level_11,Unnamed: 18_level_11,Unnamed: 19_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12,Unnamed: 17_level_12,Unnamed: 18_level_12,Unnamed: 19_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13,Unnamed: 17_level_13,Unnamed: 18_level_13,Unnamed: 19_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14,Unnamed: 17_level_14,Unnamed: 18_level_14,Unnamed: 19_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15,Unnamed: 17_level_15,Unnamed: 18_level_15,Unnamed: 19_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16,Unnamed: 17_level_16,Unnamed: 18_level_16,Unnamed: 19_level_16
Weight?,Feature,Unnamed: 2_level_17,Unnamed: 3_level_17,Unnamed: 4_level_17,Unnamed: 5_level_17,Unnamed: 6_level_17,Unnamed: 7_level_17,Unnamed: 8_level_17,Unnamed: 9_level_17,Unnamed: 10_level_17,Unnamed: 11_level_17,Unnamed: 12_level_17,Unnamed: 13_level_17,Unnamed: 14_level_17,Unnamed: 15_level_17,Unnamed: 16_level_17,Unnamed: 17_level_17,Unnamed: 18_level_17,Unnamed: 19_level_17
Weight?,Feature,Unnamed: 2_level_18,Unnamed: 3_level_18,Unnamed: 4_level_18,Unnamed: 5_level_18,Unnamed: 6_level_18,Unnamed: 7_level_18,Unnamed: 8_level_18,Unnamed: 9_level_18,Unnamed: 10_level_18,Unnamed: 11_level_18,Unnamed: 12_level_18,Unnamed: 13_level_18,Unnamed: 14_level_18,Unnamed: 15_level_18,Unnamed: 16_level_18,Unnamed: 17_level_18,Unnamed: 18_level_18,Unnamed: 19_level_18
Weight?,Feature,Unnamed: 2_level_19,Unnamed: 3_level_19,Unnamed: 4_level_19,Unnamed: 5_level_19,Unnamed: 6_level_19,Unnamed: 7_level_19,Unnamed: 8_level_19,Unnamed: 9_level_19,Unnamed: 10_level_19,Unnamed: 11_level_19,Unnamed: 12_level_19,Unnamed: 13_level_19,Unnamed: 14_level_19,Unnamed: 15_level_19,Unnamed: 16_level_19,Unnamed: 17_level_19,Unnamed: 18_level_19,Unnamed: 19_level_19
+3.309,bias,,,,,,,,,,,,,,,,,,
+2.956,word.lower():baumgartenstraße,,,,,,,,,,,,,,,,,,
+2.734,-1:word.lower():gmbh.,,,,,,,,,,,,,,,,,,
+2.721,-1:word.lower():.,,,,,,,,,,,,,,,,,,
+2.405,word.lower():in,,,,,,,,,,,,,,,,,,
+2.128,"word.lower():153,29",,,,,,,,,,,,,,,,,,
+2.105,word.lower():fläche,,,,,,,,,,,,,,,,,,
+1.949,word[-3:]:fer,,,,,,,,,,,,,,,,,,
+1.914,word.lower():quadratmeter,,,,,,,,,,,,,,,,,,
+1.904,word[-2:]:um,,,,,,,,,,,,,,,,,,

Weight?,Feature
+3.309,bias
+2.956,word.lower():baumgartenstraße
+2.734,-1:word.lower():gmbh.
+2.721,-1:word.lower():.
+2.405,word.lower():in
+2.128,"word.lower():153,29"
+2.105,word.lower():fläche
+1.949,word[-3:]:fer
+1.914,word.lower():quadratmeter
+1.904,word[-2:]:um

Weight?,Feature
+1.900,-1:word.lower():mit
+1.654,-1:word.lower():im
+0.967,+1:word.lower():september
+0.946,+1:word.lower():oktober
+0.845,word[-3:]:uli
+0.845,word[-2:]:li
+0.845,word.lower():juli
+0.776,word.istitle()
+0.710,-1:word.lower():datiert
+0.678,+1:word.lower():.

Weight?,Feature
+2.863,EOS
+1.488,word[-3:]:ber
+1.474,-1:word.istitle()
+1.258,-1:word.lower():juli
+1.046,-1:word.lower():mit
+1.009,word.lower():september
+0.996,word.lower():oktober
+0.993,word[-2:]:er
+0.941,+1:word.lower():2021.
+0.749,word.lower():2021.

Weight?,Feature
+4.584,-1:word.lower():am
+1.167,-1:word.lower():kaufvertrag
+1.052,word.lower():25.04.2019.
+0.909,+1:word.lower():juni
+0.881,+1:word.lower():november
+0.877,+1:word.lower():2019
+0.845,+1:word.lower():die
+0.838,+1:word.lower():19.
+0.799,+1:word.lower():23.
+0.723,word[-2:]:6.

Weight?,Feature
+2.910,+1:word.lower():die
+1.336,+1:word.lower():februar
+1.281,+1:word.lower():dezember
+1.171,-1:word.lower():april
+1.111,word[-3:]:ber
+1.007,+1:word.lower():unterzeichnet
+0.966,+1:word.lower():august
+0.940,word.istitle()
+0.846,word[-2:]:er
+0.827,word.lower():19.

Weight?,Feature
+3.924,+1:word.lower():quadratmeter
+2.723,word.isdigit()
+2.682,-1:word.lower():von
+2.280,-1:word.lower():eine
+1.924,+1:word.lower():quadratmetern
+1.434,+1:word.lower():einen
+1.432,-1:word.lower():großen
+1.122,word.lower():553
+1.122,word[-3:]:553
+1.088,word[-2:]:53

Weight?,Feature
+4.779,+1:word.lower():euro
+2.841,word[-2:]:00
+2.724,-1:word.lower():um
+2.068,+1:word.lower():millionen
+1.987,word[-2:]:40
+1.904,word[-2:]:9
+1.904,word[-3:]:9
+1.904,word.lower():9
+1.829,-1:word.lower():für
+1.475,word[-2:]:50

Weight?,Feature
2.282,word.lower():millionen
1.561,word[-3:]:nen
1.324,word[-2:]:en
1.275,+1:word.lower():euro
0.47,"-1:word.lower():9,7"
0.457,+1:word.lower():erwarb
0.457,"-1:word.lower():9,5"
0.428,word.istitle()
0.258,"-1:word.lower():1,77"
0.239,+1:word.istitle()

Weight?,Feature
+3.434,word.lower():grundstücksfläche
+3.385,-1:word.lower():große
+2.679,word.lower():wohnung
+2.618,-1:word.lower():dazugehörige
+1.798,word.istitle()
+1.753,+1:word.lower():als
+1.747,+1:word.lower():um
+1.621,-1:word.lower():ein
+1.603,+1:word.lower():grundstück
+1.524,word[-3:]:aus

Weight?,Feature
+1.999,-1:word.lower():bodens
+1.795,+1:word.lower():freihaltefläche-widmung
+1.700,+1:word.lower():verkäufer
+1.658,-1:word.lower():einer
+1.641,word.lower():freihaltefläche-widmung
+1.529,+1:word.lower():mit
+1.505,word[-3:]:ung
+1.446,word[-2:]:ng
+1.320,-1:word.lower():gebäudes
+1.296,-1:word.lower():benachbarte

Weight?,Feature
+2.166,-1:word.lower():die
+1.514,-1:word.lower():sind
+1.241,word.lower():privatperson
+1.241,word[-3:]:son
+1.108,-1:word.lower():grundbuch
+1.084,+1:word.lower():schmelzenbach
+1.005,+1:word.lower():breitenberg
+0.975,-1:word.lower():ist
+0.969,word[-3:]:ler
+0.918,-1:word.lower():eine

Weight?,Feature
+3.411,+1:word.lower():beim
+1.938,-1:word.lower():privatpersonen
+1.316,-1:word.istitle()
+1.232,+1:word.lower():das
+1.184,-1:word.lower():privatperson
+1.157,word[-3:]:bH.
+1.157,word[-2:]:H.
+1.081,-1:word.lower():mehrere
+1.055,word.lower():gmbh.
+1.021,+1:word.lower():standort

Weight?,Feature
+4.662,-1:word.lower():in
+3.546,word.isupper()
+3.468,BOS
+2.107,+1:word.lower():hat
+1.763,+1:word.lower():(
+1.747,+1:word.lower():wurde
+1.206,word.lower():brand
+1.057,+1:word.lower()::
+1.032,word.istitle()
+1.020,word[-3:]:ken

Weight?,Feature
1.659,word[-2:]::
1.659,word[-3:]::
1.659,word.lower()::
0.577,-1:word.istitle()
0.149,+1:word.lower():60
0.123,-1:word.lower():nenzing
0.123,+1:word.lower():50
0.029,+1:word.lower():54
0.029,-1:word.lower():bürserberg
0.004,+1:word.lower():110

Weight?,Feature
+6.226,+1:word.lower():euro
+3.144,-1:word.lower():beträgt
+2.042,-1:word.lower():von
+1.937,-1:word.lower():bei
+1.122,word.isdigit()
+0.925,+1:word.istitle()
+0.517,"word.lower():2927,64"
+0.498,word[-2:]:64
+0.498,"word[-3:]:,64"
+0.474,word.lower():900

Weight?,Feature
+2.615,+1:word.lower():in
+2.317,-1:word.lower():der
+2.282,-1:word.lower():im
+1.887,word[-2:]:eg
+1.797,-1:word.lower():bereich
+1.754,word[-3:]:sse
+1.708,word[-3:]:weg
+1.601,word[-3:]:aße
+1.560,word.lower():in
+1.497,word[-3:]:hel

Weight?,Feature
+2.154,+1:word.lower():in
+1.586,word.isdigit()
+1.363,-1:word.lower():der
+1.340,word.lower():der
+1.311,word[-3:]:der
+1.209,+1:word.lower():bleiche
+1.202,word.lower():bleiche
+1.124,word.istitle()
+0.887,word.lower():nollen
+0.878,word[-3:]:len

Weight?,Feature
+2.956,+1:word.lower():quadratmetern
+2.633,-1:word.lower():mit
+1.802,+1:word.lower():quadratmeter
+1.496,"word.lower():14,29"
+1.439,-1:word.lower():und
+1.250,"word.lower():137,49"
+1.226,word[-3:]:6
+1.226,word[-2:]:6
+1.226,word.lower():6
+1.122,-1:word.lower():einer

Weight?,Feature
+2.134,-1:word.lower():die
+1.624,-1:word.lower():durch
+1.386,word[-2:]:on
+1.326,+1:word.lower():wohn-form.at
+1.196,word.lower():privatperson
+1.196,word[-3:]:son
+1.191,+1:word.lower():um
+1.025,+1:word.lower():greif
+1.004,-1:word.lower():eine
+0.985,+1:word.lower():hagen

Weight?,Feature
+2.576,+1:word.lower():käufer
+2.369,+1:word.lower():
+1.794,+1:word.lower():um
+1.360,-1:word.istitle()
+1.059,word.lower():gmbh.
+1.035,word[-2:]:H.
+1.035,word[-3:]:bH.
+0.965,word.lower():wohn-form.at
+0.965,word[-3:]:.at
+0.965,+1:word.lower():gastronomie


In [10]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-ORT',
 'B-STRASSE',
 'B-FLAECHE',
 'B-IMMO_TYP',
 'B-GESAMTPREIS',
 'B-TERRASSENGROESSE',
 'B-KAEUFER',
 'I-KAEUFER',
 'B-VERKAEUFER',
 'I-VERKAEUFER',
 'B-DATUM_VERTRAG',
 'I-DATUM_VERTRAG',
 'B-DATUM_VERBUECHERUNG',
 'I-DATUM_VERBUECHERUNG',
 'B-QMPREIS',
 'I-IMMO_TYP',
 'I-STRASSE',
 'I-GESAMTPREIS',
 'I-ORT']

In [11]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

  _warn_prf(


0.9108921781594944

In [12]:
# group B and I results
sorted_labels = sorted(labels, key=lambda name: (name[1:], name[0]))
print(metrics.flat_classification_report(y_test, y_pred, labels=sorted_labels, digits=3))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

B-DATUM_VERBUECHERUNG      1.000     1.000     1.000        15
I-DATUM_VERBUECHERUNG      1.000     1.000     1.000        15
      B-DATUM_VERTRAG      0.933     0.933     0.933        15
      I-DATUM_VERTRAG      0.931     0.964     0.947        28
            B-FLAECHE      0.944     1.000     0.971        17
        B-GESAMTPREIS      1.000     0.591     0.743        22
        I-GESAMTPREIS      0.000     0.000     0.000         0
           B-IMMO_TYP      0.889     0.727     0.800        22
           I-IMMO_TYP      0.000     0.000     0.000         1
            B-KAEUFER      0.875     0.933     0.903        15
            I-KAEUFER      0.913     1.000     0.955        21
                B-ORT      0.957     0.898     0.926        49
                I-ORT      1.000     0.929     0.963        14
            B-QMPREIS      1.000     1.000     1.000        11
            B-STRASSE      0.917     0.786     0.846  