In [1]:
#%matplotlib inline
#import sklearn
#import sklearn_crfsuite
#from sklearn_crfsuite import scorers
#from sklearn.utils import shuffle
#from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
#from sklearn_crfsuite import metrics
#from sklearn.metrics import make_scorer
#import metrics as mtx
#import scipy.stats
#import random
import json
from datasets import Dataset

with open("./annotations2.jsonl") as jsonl_file: # . instead of ..
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
print("instances:\n{}".format(len(annot)))
keys = [key for key in annot[0].keys()]
print("\nall keys:\n{}".format(keys))
key_keys = ["text", "spans", "tokens"]
print("\nimportant keys:\n{}".format(key_keys))
print("\nexample text:\n{}".format(annot[0]["text"]))
n_examples = 5
print("\n{} example spans:".format(n_examples))
for span in annot[0]["spans"][:n_examples]:
    print("{}".format(span))
print("\n{} example tokens:".format(n_examples))
for token in annot[0]["tokens"][:n_examples]:
    print("{}".format(token))

instances:
140

all keys:
['text', 'meta', '_input_hash', '_task_hash', 'spans', 'tokens', '_view_id', 'answer', '_timestamp']

important keys:
['text', 'spans', 'tokens']

example text:
DORNBIRN In der Schulgasse in Dornbirn hat eine 71,93 Quadratmeter große Wohnung für einen Quadratmeterpreis von 5533,71 Euro den Besitzer gewechselt. Dieser beinhaltet auch einen Pkw-Abstellplatz. Käufer der Wohnung mit 9,86 Quadratmetern Terrasse ist die ValLiLean Beteiligungs- und Immobilienverwaltungs GmbH. Beim Verkäufer handelt es sich um die Karrenblick Projekt GmbH.  Der Kaufpreis liegt bei 398.040 Euro. Unterzeichnet wurde der Kaufvertrag am 18. September. Die Verbücherung datiert mit Oktober 2020.

5 example spans:
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'pattern': 2069086582, 'token_start': 0, 'token_end': 0, 'label': 'ORT', 'noWords': 1}
{'start': 16, 'end': 26, 'token_start': 3, 'token_end': 3, 'label': 'STRASSE', 'noWords': 1}
{'text': 'Dornbirn', 'start': 30, 'end': 38, 'pattern': 206

In [2]:
def getLabel(tokenDictList, idx):
    result = "O"
    for dict_i in tokenDictList:
        idx_0, idx_1 = dict_i["start"], dict_i["end"]
        if (idx_0<=idx) and (idx<=idx_1):
            result = dict_i["label"]
    return result 

for j in range(len(annot)): # loop over instances
    a = annot[j]            # instance j
    spans = a["spans"]      # list of annotation dicts
    toks = a["tokens"]      # list of token dicts
    for i in range(len(toks)):                                 # loop over token dicts
        toks[i]["label"] = getLabel(spans, toks[i]["start"])   # assign label from span (if exists, otherwise "O")
    annot[j]["tokens"] = toks

words_n = 3
for i in range(2):
    print("Token dictionaries for the last {} words of instance {}".format(words_n, i))
    ann = annot[i]
    for tok in ann["tokens"][:words_n]:
        print(tok)

Token dictionaries for the last 3 words of instance 0
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}
Token dictionaries for the last 3 words of instance 1
{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'STRASSE'}


In [3]:
sents=[] 
for annot_i in annot:                  # loop over instances
    toks = annot_i['tokens']           # get tokens list for instance i
    train_sentence = []
    for tok in toks:                   # loop over token dicts
        if 'label' in tok:             # only if the current token has been labelled, ...
            token_element = (tok['text'], tok['label']) # ... create a "text", "label" pair for this token ...
            train_sentence.append(token_element)        # ... and append it to the list
    sents.append(train_sentence) # append the list for that instances to the list for all instances / sentences

# list of lists of pairs (sets): outer list contains instances and inner list contains (token, label) pairs
sents[:2]

[[('DORNBIRN', 'ORT'),
  ('In', 'O'),
  ('der', 'O'),
  ('Schulgasse', 'STRASSE'),
  ('in', 'O'),
  ('Dornbirn', 'ORT'),
  ('hat', 'O'),
  ('eine', 'O'),
  ('71,93', 'FLAECHE'),
  ('Quadratmeter', 'O'),
  ('große', 'O'),
  ('Wohnung', 'IMMO_TYP'),
  ('für', 'O'),
  ('einen', 'O'),
  ('Quadratmeterpreis', 'O'),
  ('von', 'O'),
  ('5533,71', 'QMPREIS'),
  ('Euro', 'O'),
  ('den', 'O'),
  ('Besitzer', 'O'),
  ('gewechselt', 'O'),
  ('.', 'O'),
  ('Dieser', 'O'),
  ('beinhaltet', 'O'),
  ('auch', 'O'),
  ('einen', 'O'),
  ('Pkw-Abstellplatz', 'O'),
  ('.', 'O'),
  ('Käufer', 'O'),
  ('der', 'O'),
  ('Wohnung', 'O'),
  ('mit', 'O'),
  ('9,86', 'TERRASSENGROESSE'),
  ('Quadratmetern', 'O'),
  ('Terrasse', 'O'),
  ('ist', 'O'),
  ('die', 'O'),
  ('ValLiLean', 'KAEUFER'),
  ('Beteiligungs-', 'KAEUFER'),
  ('und', 'KAEUFER'),
  ('Immobilienverwaltungs', 'KAEUFER'),
  ('GmbH', 'KAEUFER'),
  ('.', 'KAEUFER'),
  ('Beim', 'O'),
  ('Verkäufer', 'O'),
  ('handelt', 'O'),
  ('es', 'O'),
  ('sich', 'O'

In [4]:
# 1. build tokens = list of lists of tokens
# 2. build labels = list of lists of labels
tokens = []
labels = []
for sent_i in sents:
    tokens_i = []
    labels_i = []
    for word_label in sent_i:
        tokens_i.append(word_label[0])
        labels_i.append(word_label[1])
    tokens.append(tokens_i)
    labels.append(labels_i)

dataset = Dataset.from_dict({"tokens": tokens, "labels": labels})
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 140
})

In [5]:
# set of labels
label_names = sorted(list(set([label_ij for labels_i in labels for label_ij in labels_i])))
label_names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [6]:
dataset.features

{'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'labels': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [7]:
# ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature = dataset.features["tokens"]
ner_feature

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [8]:
#dataset.features["tokens"].feature.names = label_names # ClassLabel(names=label_names)
from datasets import ClassLabel
dataset.features["tokens"].feature.names = ClassLabel(names=label_names)
dataset.features["tokens"].feature.names

ClassLabel(num_classes=12, names=['DATUM_VERBUECHERUNG', 'DATUM_VERTRAG', 'FLAECHE', 'GESAMTPREIS', 'IMMO_TYP', 'KAEUFER', 'O', 'ORT', 'QMPREIS', 'STRASSE', 'TERRASSENGROESSE', 'VERKAEUFER'], id=None)

In [9]:
dataset.features["tokens"].feature.names.names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [10]:
# tokenize and adjust labels
checkpoint = "flair/ner-german"  # https://huggingface.co/flair/ner-german
checkpoint = "fhswf/bert_de_ner" # https://huggingface.co/fhswf/bert_de_ner

from transformers import AutoTokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.is_fast

True

In [11]:
dataset

Dataset({
    features: ['tokens', 'labels'],
    num_rows: 140
})

In [12]:
#inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
#inputs.word_ids()

In [13]:
inputs = tokenizer(dataset[0]["tokens"], is_split_into_words=True)
inputs.word_ids()

[None,
 0,
 0,
 0,
 1,
 2,
 3,
 3,
 4,
 5,
 5,
 5,
 6,
 7,
 8,
 8,
 8,
 9,
 9,
 10,
 11,
 11,
 12,
 13,
 14,
 14,
 14,
 15,
 16,
 16,
 16,
 16,
 17,
 17,
 18,
 19,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 26,
 26,
 26,
 26,
 26,
 27,
 28,
 28,
 29,
 30,
 30,
 31,
 32,
 32,
 32,
 33,
 33,
 34,
 34,
 35,
 36,
 37,
 37,
 37,
 37,
 38,
 38,
 38,
 39,
 40,
 40,
 40,
 41,
 41,
 41,
 42,
 43,
 44,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 50,
 50,
 51,
 51,
 52,
 52,
 52,
 53,
 55,
 56,
 56,
 57,
 58,
 59,
 59,
 59,
 59,
 59,
 60,
 60,
 61,
 62,
 63,
 64,
 65,
 65,
 66,
 67,
 67,
 68,
 68,
 68,
 69,
 70,
 71,
 71,
 71,
 72,
 73,
 74,
 74,
 74,
 75,
 76,
 None]

In [14]:
inputs.tokens()

['[CLS]',
 'dor',
 '##nb',
 '##irn',
 'in',
 'der',
 'schul',
 '##gasse',
 'in',
 'dor',
 '##nb',
 '##irn',
 'hat',
 'eine',
 '71',
 ',',
 '93',
 'quadrat',
 '##meter',
 'große',
 'wohn',
 '##ung',
 'fur',
 'einen',
 'quadrat',
 '##meter',
 '##preis',
 'von',
 '55',
 '##33',
 ',',
 '71',
 'eur',
 '##o',
 'den',
 'bes',
 '##itzer',
 'gewechselt',
 '.',
 'dieser',
 'beinhaltet',
 'auch',
 'einen',
 'p',
 '##kw',
 '-',
 'abs',
 '##tell',
 '##platz',
 '.',
 'kauf',
 '##er',
 'der',
 'wohn',
 '##ung',
 'mit',
 '9',
 ',',
 '86',
 'quadrat',
 '##metern',
 'ter',
 '##rasse',
 'ist',
 'die',
 'val',
 '##li',
 '##le',
 '##an',
 'beteil',
 '##igungs',
 '-',
 'und',
 'imm',
 '##obilien',
 '##verwaltungs',
 'g',
 '##mb',
 '##h',
 '.',
 'beim',
 'verkauf',
 '##er',
 'handelt',
 'es',
 'sich',
 'um',
 'die',
 'kar',
 '##ren',
 '##blick',
 'pro',
 '##jekt',
 'g',
 '##mb',
 '##h',
 '.',
 'der',
 'kauf',
 '##preis',
 'liegt',
 'bei',
 '39',
 '##8',
 '.',
 '04',
 '##0',
 'eur',
 '##o',
 '.',
 'unterzeich

In [15]:
len(inputs.word_ids()), len(inputs.tokens())

(134, 134)

In [14]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
        elif word_id is None:
            # Special token
            label = -100
        else:
            # Same word as previous token
            label = labels[word_id]
        new_labels.append(label)
    return new_labels

In [15]:
labels = dataset[0]["labels"]
word_ids = inputs.word_ids()
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(aligned_labels)
print(len(aligned_labels))
print(inputs.tokens())
print(len(inputs.tokens()))

[-100, 'ORT', 'ORT', 'ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'ORT', 'ORT', 'O', 'O', 'FLAECHE', 'FLAECHE', 'FLAECHE', 'O', 'O', 'O', 'IMMO_TYP', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'O', 'O', 'QMPREIS', 'QMPREIS', 'QMPREIS', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DAT

In [16]:
import transformers
transformers.__version__

'4.17.0'

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [19]:
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True
)
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ArrowInvalid: Could not convert 'ORT' with type str: tried to convert to int64