In [1]:
import json
from datasets import Dataset

with open("./annotations2.jsonl") as jsonl_file: # . instead of ..
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
print("instances:\n{}".format(len(annot)))
keys = [key for key in annot[0].keys()]
print("\nall keys:\n{}".format(keys))
key_keys = ["text", "spans", "tokens"]
print("\nimportant keys:\n{}".format(key_keys))
print("\nexample text:\n{}".format(annot[0]["text"]))
n_examples = 5
print("\n{} example spans:".format(n_examples))
for span in annot[0]["spans"][:n_examples]:
    print("{}".format(span))
print("\n{} example tokens:".format(n_examples))
for token in annot[0]["tokens"][:n_examples]:
    print("{}".format(token))

instances:
140

all keys:
['text', 'meta', '_input_hash', '_task_hash', 'spans', 'tokens', '_view_id', 'answer', '_timestamp']

important keys:
['text', 'spans', 'tokens']

example text:
DORNBIRN In der Schulgasse in Dornbirn hat eine 71,93 Quadratmeter große Wohnung für einen Quadratmeterpreis von 5533,71 Euro den Besitzer gewechselt. Dieser beinhaltet auch einen Pkw-Abstellplatz. Käufer der Wohnung mit 9,86 Quadratmetern Terrasse ist die ValLiLean Beteiligungs- und Immobilienverwaltungs GmbH. Beim Verkäufer handelt es sich um die Karrenblick Projekt GmbH.  Der Kaufpreis liegt bei 398.040 Euro. Unterzeichnet wurde der Kaufvertrag am 18. September. Die Verbücherung datiert mit Oktober 2020.

5 example spans:
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'pattern': 2069086582, 'token_start': 0, 'token_end': 0, 'label': 'ORT', 'noWords': 1}
{'start': 16, 'end': 26, 'token_start': 3, 'token_end': 3, 'label': 'STRASSE', 'noWords': 1}
{'text': 'Dornbirn', 'start': 30, 'end': 38, 'pattern': 206

In [2]:
def getLabel(tokenDictList, idx):
    result = "O"
    for dict_i in tokenDictList:
        idx_0, idx_1 = dict_i["start"], dict_i["end"]
        if (idx_0<=idx) and (idx<=idx_1):
            result = dict_i["label"]
    return result 

for j in range(len(annot)): # loop over instances
    a = annot[j]            # instance j
    spans = a["spans"]      # list of annotation dicts
    toks = a["tokens"]      # list of token dicts
    for i in range(len(toks)):                                 # loop over token dicts
        toks[i]["label"] = getLabel(spans, toks[i]["start"])   # assign label from span (if exists, otherwise "O")
    annot[j]["tokens"] = toks

words_n = 3
for i in range(2):
    print("Token dictionaries for the last {} words of instance {}".format(words_n, i))
    ann = annot[i]
    for tok in ann["tokens"][:words_n]:
        print(tok)

Token dictionaries for the last 3 words of instance 0
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}
Token dictionaries for the last 3 words of instance 1
{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'STRASSE'}


In [3]:
sents=[] 
for annot_i in annot:                  # loop over instances
    toks = annot_i['tokens']           # get tokens list for instance i
    train_sentence = []
    for tok in toks:                   # loop over token dicts
        if 'label' in tok:             # only if the current token has been labelled, ...
            token_element = (tok['text'], tok['label']) # ... create a "text", "label" pair for this token ...
            train_sentence.append(token_element)        # ... and append it to the list
    sents.append(train_sentence) # append the list for that instances to the list for all instances / sentences

# list of lists of pairs (sets): outer list contains instances and inner list contains (token, label) pairs
sents[:2]

[[('DORNBIRN', 'ORT'),
  ('In', 'O'),
  ('der', 'O'),
  ('Schulgasse', 'STRASSE'),
  ('in', 'O'),
  ('Dornbirn', 'ORT'),
  ('hat', 'O'),
  ('eine', 'O'),
  ('71,93', 'FLAECHE'),
  ('Quadratmeter', 'O'),
  ('große', 'O'),
  ('Wohnung', 'IMMO_TYP'),
  ('für', 'O'),
  ('einen', 'O'),
  ('Quadratmeterpreis', 'O'),
  ('von', 'O'),
  ('5533,71', 'QMPREIS'),
  ('Euro', 'O'),
  ('den', 'O'),
  ('Besitzer', 'O'),
  ('gewechselt', 'O'),
  ('.', 'O'),
  ('Dieser', 'O'),
  ('beinhaltet', 'O'),
  ('auch', 'O'),
  ('einen', 'O'),
  ('Pkw-Abstellplatz', 'O'),
  ('.', 'O'),
  ('Käufer', 'O'),
  ('der', 'O'),
  ('Wohnung', 'O'),
  ('mit', 'O'),
  ('9,86', 'TERRASSENGROESSE'),
  ('Quadratmetern', 'O'),
  ('Terrasse', 'O'),
  ('ist', 'O'),
  ('die', 'O'),
  ('ValLiLean', 'KAEUFER'),
  ('Beteiligungs-', 'KAEUFER'),
  ('und', 'KAEUFER'),
  ('Immobilienverwaltungs', 'KAEUFER'),
  ('GmbH', 'KAEUFER'),
  ('.', 'KAEUFER'),
  ('Beim', 'O'),
  ('Verkäufer', 'O'),
  ('handelt', 'O'),
  ('es', 'O'),
  ('sich', 'O'

In [4]:
# 1. build tokens = list of lists of tokens
# 2. build labels = list of lists of labels
words = []
ner_tags = []
for sent_i in sents:
    words_i = []
    ner_tags_i = []
    for item in sent_i:
        words_i.append(item[0])
        ner_tags_i.append(item[1])
    words.append(words_i)
    ner_tags.append(ner_tags_i)

dataset = Dataset.from_dict({"words": words, "ner_tags": ner_tags})
dataset

Dataset({
    features: ['words', 'ner_tags'],
    num_rows: 140
})

In [5]:
# set of labels
ner_tag_names = sorted(list(set([ner_tag_ij for ner_tags_i in ner_tags for ner_tag_ij in ner_tags_i])))
ner_tag_names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [6]:
dataset.features

{'words': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [7]:
# ner_feature = raw_datasets["train"].features["ner_tags"]
ner_feature = dataset.features["words"]
ner_feature

Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)

In [8]:
#dataset.features["tokens"].feature.names = label_names # ClassLabel(names=label_names)
from datasets import ClassLabel
dataset.features["words"].feature.names = ClassLabel(names=ner_tag_names)
dataset.features["words"].feature.names

ClassLabel(num_classes=12, names=['DATUM_VERBUECHERUNG', 'DATUM_VERTRAG', 'FLAECHE', 'GESAMTPREIS', 'IMMO_TYP', 'KAEUFER', 'O', 'ORT', 'QMPREIS', 'STRASSE', 'TERRASSENGROESSE', 'VERKAEUFER'], id=None)

In [9]:
dataset.features["words"].feature.names.names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [10]:
# tokenize and adjust labels
from transformers import AutoTokenizer
checkpoint = "flair/ner-german"  # https://huggingface.co/flair/ner-german (1.41GB)
checkpoint = "fhswf/bert_de_ner" # https://huggingface.co/fhswf/bert_de_ner (419MB)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.is_fast

True

In [11]:
dataset

Dataset({
    features: ['words', 'ner_tags'],
    num_rows: 140
})

In [12]:
min_len = 1000
min_len_list = []
min_len_inst = []
for inst in range(140):
    inputs = tokenizer(dataset[inst]["words"], is_split_into_words=True)
    inst_len = len(inputs.word_ids())
    if 18<inst_len and inst_len< 80:
        min_len_list.append(inst_len)
        min_len_inst.append(inst)
min_len_list, min_len_inst

([77, 78, 76, 73], [46, 48, 50, 73])

In [13]:
#instance = 73 # 95 => (11, 11, 9) | 118 => (13, 13, 12)
#n_0 = 0
#n_1 = -1
#inputs = tokenizer(dataset[instance]["words"], is_split_into_words=True)
#print(f"length of input.word_ids():\t{len(inputs.word_ids())}")
#print(f"input.word_ids() {n_0}-{n_1}:\t\t{inputs.word_ids()[n_0:n_1]}")
#print()
#print(f"length of input.tokens():\t{len(inputs.tokens())}")
#print(f"input.tokens() {n_0}-{n_1}:\t\t{inputs.tokens()[n_0:n_1]}")
#print()
#ner_tags = dataset[instance]["ner_tags"] # do not tokenize the "ner_tags" ...
## ... just repeat them as often as necessary to match the length
#print(f"length of ner_tags:\t\t{len(ner_tags)}")
#print(f"ner_tags {n_0}-{n_1}:\t\t\t{ner_tags[n_0:n_1]}")
## create new "labels" as follows:
## 1. for every word that is split into 1+n tokens (see input.word_ids()), the according entry in ner_tags needs ...
## ... to be repeated n times (=> the "labels" will correspond to "ner_tags" but match the length of input.word_ids)
## 2. ignore special tokens with a word ID of "None" by attributing them a label of -100 (=> special tokens ignored)
## 3. when done, turn those steps into a function "align_labels_with_tokens(labels, word_ids)" returning "new_labels"

In [14]:
def align_labels_with_tokens(ner_tags, word_ids):
    #print(word_ids)
    new_labels = []
    previous_label = None
    previous_word_id = None
    for word_id in word_ids:
        #print(word_id)
        # handle None
        if word_id==None:
            label = -100
        # handle word_id==previous_word_id
        elif word_id==previous_word_id:
            label = previous_label
        # handle word_id!=previous_word_id and word_id!=None
        else:
            text_label = ner_tags[word_id]
            label = ner_tag_names.index(text_label)
        new_labels.append(label)
        previous_label = label
        previous_word_id = word_id
    #new_labels = []
    #previous_word_id = None
    #for word_id in word_ids:
    #    if (word_id==None) or (word_id==previous_word_id):
    #        label = -100
    #    else:
    #        label = labels[word_id]
    #    new_labels.append(label)
    #    previous_word_id = word_id
    return new_labels

# loop over tokens, print "inputs.tokens() token", word_id, aligned_label


instance = 5 # 118 => goal = [-100]
inputs = tokenizer(dataset[instance]["words"], is_split_into_words=True)
ner_tags = dataset[instance]["ner_tags"]
#print(inputs.tokens())
word_ids = tokenizer(dataset[instance]["words"], is_split_into_words=True).word_ids()
#print(len(ner_tags), ner_tags)
#print(len(word_ids), word_ids)
#print()
aligned_labels = align_labels_with_tokens(ner_tags, word_ids)
#print(len(aligned_labels), aligned_labels)
for i, token in enumerate(inputs.tokens()):
    alabel = aligned_labels[i]
    tlabel = ner_tag_names[alabel] if alabel>=0 else "SPECIAL TOKEN"
    #isApple = True if fruit == 'Apple' else False
    print(f"index: {i}\ttoken: {token}\tword_id: {word_ids[i]}\taligned label: {alabel}\ttext label: {tlabel}")
len(inputs.tokens()), len(aligned_labels), len(word_ids)

index: 0	token: [CLS]	word_id: None	aligned label: -100	text label: SPECIAL TOKEN
index: 1	token: ko	word_id: 0	aligned label: 7	text label: ORT
index: 2	token: ##bl	word_id: 0	aligned label: 7	text label: ORT
index: 3	token: ##ach	word_id: 0	aligned label: 7	text label: ORT
index: 4	token: im	word_id: 1	aligned label: 6	text label: O
index: 5	token: ber	word_id: 2	aligned label: 9	text label: STRASSE
index: 6	token: ##eich	word_id: 2	aligned label: 9	text label: STRASSE
index: 7	token: no	word_id: 3	aligned label: 9	text label: STRASSE
index: 8	token: ##llen	word_id: 3	aligned label: 9	text label: STRASSE
index: 9	token: in	word_id: 4	aligned label: 6	text label: O
index: 10	token: ko	word_id: 5	aligned label: 7	text label: ORT
index: 11	token: ##bl	word_id: 5	aligned label: 7	text label: ORT
index: 12	token: ##ach	word_id: 5	aligned label: 7	text label: ORT
index: 13	token: hat	word_id: 6	aligned label: 6	text label: O
index: 14	token: ein	word_id: 7	aligned label: 6	text label: O
in

(116, 116, 116)

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
#
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names
)
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 140
})

In [19]:
tokenized_dataset["labels"]

[[-100,
  7,
  7,
  7,
  6,
  6,
  9,
  9,
  6,
  7,
  7,
  7,
  6,
  6,
  2,
  2,
  2,
  6,
  6,
  6,
  4,
  4,
  6,
  6,
  6,
  6,
  6,
  6,
  8,
  8,
  8,
  8,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  10,
  10,
  10,
  6,
  6,
  6,
  6,
  6,
  6,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  5,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  11,
  6,
  6,
  6,
  6,
  6,
  3,
  3,
  3,
  3,
  3,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  1,
  1,
  1,
  1,
  1,
  1,
  6,
  6,
  6,
  6,
  6,
  6,
  0,
  0,
  0,
  0,
  0,
  -100],
 [-100,
  7,
  7,
  7,
  6,
  9,
  9,
  9,
  9,
  6,
  7,
  7,
  7,
  6,
  6,
  2,
  2,
  2,
  6,
  6,
  6,
  4,
  4,
  6,
  6,
  6,
  6,
  6,
  6,
  8,
  8,
  8,
  8,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  10,
  10,
  10,
  6,
  6,
  6,
  6,
  6,
  6,
  6,
  5,
  5,
  5,
  6,


In [None]:
Dataset({
    features: ['words', 'ner_tags', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 140
})

In [20]:
len(ner_tag_names), ner_tag_names

(12,
 ['DATUM_VERBUECHERUNG',
  'DATUM_VERTRAG',
  'FLAECHE',
  'GESAMTPREIS',
  'IMMO_TYP',
  'KAEUFER',
  'O',
  'ORT',
  'QMPREIS',
  'STRASSE',
  'TERRASSENGROESSE',
  'VERKAEUFER'])

In [14]:
instance = 118 # 95 => (11, 11, 9) | 118 => (13, 13, 12)
n_0 = 0
n_1 = 16
inputs = tokenizer(dataset[instance]["words"], is_split_into_words=True)
print(f"length of input.word_ids():\t{len(inputs.word_ids())}")
print(f"input.word_ids() {n_0}-{n_1}:\t\t{inputs.word_ids()[n_0:n_1]}")
print()
print(f"length of input.tokens():\t{len(inputs.tokens())}")
print(f"input.tokens() {n_0}-{n_1}:\t\t{inputs.tokens()[n_0:n_1]}")
print()
ner_tags = tokenizer(dataset[instance]["ner_tags"], is_split_into_words=True) # do not tokenize the "ner_tags" ...
# ... just repeat them as often as necessary to match the length
print(f"length of ner_tags.input_ids:\t{len(ner_tags.input_ids)}")
print(f"ner_tags.input_ids {n_0}-{n_1}:\t{ner_tags.input_ids[n_0:n_1]}")
# for every word that is split into 1+n tokens, the input_id (see ner_tags.input_ids) of that word needs to be ...
# ... repeated n times

length of input.word_ids():	13
input.word_ids() 0-16:		[None, 0, 0, 0, 1, 2, 3, 4, 4, 5, 6, 6, None]

length of input.tokens():	13
input.tokens() 0-16:		['[CLS]', 'fe', '##ld', '##kirch', ':', '13', 'ha', 'bzw', '.', '1', 'pro', '##zent', '[SEP]']

length of ner_tags.input_ids:	12
ner_tags.input_ids 0-16:	[102, 1828, 30885, 1828, 30885, 256, 256, 256, 18553, 3957, 256, 103]


In [15]:
# convert input_ids to tokens via tokenizer:
# > https://discuss.huggingface.co/t/tokenizer-decoding-using-bert-roberta-xlnet-gpt2/1128
print(ner_tags.input_ids)
print(tokenizer.decode(ner_tags.input_ids))
print(tokenizer.convert_ids_to_tokens(ner_tags.input_ids))

[102, 1828, 30885, 1828, 30885, 256, 256, 256, 18553, 3957, 256, 103]
[CLS] ort ort o o o gesamtpreis o [SEP]
['[CLS]', 'or', '##t', 'or', '##t', 'o', 'o', 'o', 'gesamt', '##preis', 'o', '[SEP]']


In [16]:
sents[instance]

[('Feldkirch', 'ORT'),
 (':', 'ORT'),
 ('13', 'O'),
 ('ha', 'O'),
 ('bzw.', 'O'),
 ('1', 'GESAMTPREIS'),
 ('Prozent', 'O')]

In [14]:
inputs.tokens()

['[CLS]',
 'dor',
 '##nb',
 '##irn',
 'in',
 'der',
 'schul',
 '##gasse',
 'in',
 'dor',
 '##nb',
 '##irn',
 'hat',
 'eine',
 '71',
 ',',
 '93',
 'quadrat',
 '##meter',
 'große',
 'wohn',
 '##ung',
 'fur',
 'einen',
 'quadrat',
 '##meter',
 '##preis',
 'von',
 '55',
 '##33',
 ',',
 '71',
 'eur',
 '##o',
 'den',
 'bes',
 '##itzer',
 'gewechselt',
 '.',
 'dieser',
 'beinhaltet',
 'auch',
 'einen',
 'p',
 '##kw',
 '-',
 'abs',
 '##tell',
 '##platz',
 '.',
 'kauf',
 '##er',
 'der',
 'wohn',
 '##ung',
 'mit',
 '9',
 ',',
 '86',
 'quadrat',
 '##metern',
 'ter',
 '##rasse',
 'ist',
 'die',
 'val',
 '##li',
 '##le',
 '##an',
 'beteil',
 '##igungs',
 '-',
 'und',
 'imm',
 '##obilien',
 '##verwaltungs',
 'g',
 '##mb',
 '##h',
 '.',
 'beim',
 'verkauf',
 '##er',
 'handelt',
 'es',
 'sich',
 'um',
 'die',
 'kar',
 '##ren',
 '##blick',
 'pro',
 '##jekt',
 'g',
 '##mb',
 '##h',
 '.',
 'der',
 'kauf',
 '##preis',
 'liegt',
 'bei',
 '39',
 '##8',
 '.',
 '04',
 '##0',
 'eur',
 '##o',
 '.',
 'unterzeich

In [24]:
ner_tags = tokenizer(dataset[0]["ner_tags"], is_split_into_words=True)
len(ner_tags["input_ids"]), ner_tags["input_ids"]

(141,
 [102,
  1828,
  30885,
  256,
  256,
  8012,
  13410,
  256,
  1828,
  30885,
  256,
  256,
  3025,
  2956,
  441,
  256,
  256,
  30363,
  30892,
  2032,
  5506,
  256,
  256,
  256,
  256,
  6520,
  17157,
  667,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  7058,
  16610,
  4798,
  26576,
  13410,
  256,
  256,
  256,
  256,
  13419,
  274,
  598,
  13419,
  274,
  598,
  13419,
  274,
  598,
  13419,
  274,
  598,
  13419,
  274,
  598,
  13419,
  274,
  598,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  4607,
  2956,
  12254,
  4607,
  2956,
  12254,
  4607,
  2956,
  12254,
  4607,
  2956,
  12254,
  256,
  256,
  256,
  256,
  256,
  18553,
  3957,
  256,
  256,
  256,
  256,
  256,
  256,
  256,
  11692,
  182,
  2032,
  30762,
  11692,
  182,
  2032,
  30762,
  11692,
  182,
  2032,
  30762,
  256,
  256,
  256,
  256,
  11692,
  182,
  2032,
  1155,
  535,
  1571,
  132,
  11692,
  182,
  2032,
  1155,
 

In [26]:
len(inputs.word_ids())

134

In [None]:
def adjust_word_ids(word_ids):
    for word_id in word_ids:
        

In [15]:
len(inputs.word_ids()), len(inputs.tokens())

(134, 134)

In [14]:
def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
        elif word_id is None:
            # Special token
            label = -100
        else:
            # Same word as previous token
            label = labels[word_id]
        new_labels.append(label)
    return new_labels

In [15]:
labels = dataset[0]["labels"]
word_ids = inputs.word_ids()
aligned_labels = align_labels_with_tokens(labels, word_ids)
print(aligned_labels)
print(len(aligned_labels))
print(inputs.tokens())
print(len(inputs.tokens()))

[-100, 'ORT', 'ORT', 'ORT', 'O', 'O', 'STRASSE', 'STRASSE', 'O', 'ORT', 'ORT', 'ORT', 'O', 'O', 'FLAECHE', 'FLAECHE', 'FLAECHE', 'O', 'O', 'O', 'IMMO_TYP', 'IMMO_TYP', 'O', 'O', 'O', 'O', 'O', 'O', 'QMPREIS', 'QMPREIS', 'QMPREIS', 'QMPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'TERRASSENGROESSE', 'O', 'O', 'O', 'O', 'O', 'O', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'KAEUFER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'VERKAEUFER', 'O', 'O', 'O', 'O', 'O', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'GESAMTPREIS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DATUM_VERTRAG', 'DAT

In [16]:
import transformers
transformers.__version__

'4.17.0'

In [18]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    all_labels = examples["labels"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [19]:
tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True
)
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


ArrowInvalid: Could not convert 'ORT' with type str: tried to convert to int64