# Building a Huggingface `DatasetDict` for the *gNER* dataset
**Tidy up and polish everything!**<br>
The resulting dataset is stored in the folder *gNERdataset* and can be loaded via `load_from_disk("gNERdataset")`. However, the `ClassLabel` list with the names of the NER tags is not available anymore when loading from disk!

In [1]:
import json
from sklearn.utils import shuffle
from datasets import Dataset
from datasets import DatasetDict

with open("./annotations2.jsonl") as jsonl_file: # . instead of ..
    lines = jsonl_file.readlines()
annot = [json.loads(line) for line in lines]
print("instances:\n{}".format(len(annot)))
keys = [key for key in annot[0].keys()]
print("\nall keys:\n{}".format(keys))
key_keys = ["text", "spans", "tokens"]
print("\nimportant keys:\n{}".format(key_keys))
print("\nexample text:\n{}".format(annot[0]["text"]))
n_examples = 5
print("\n{} example spans:".format(n_examples))
for span in annot[0]["spans"][:n_examples]:
    print("{}".format(span))
print("\n{} example tokens:".format(n_examples))
for token in annot[0]["tokens"][:n_examples]:
    print("{}".format(token))

instances:
140

all keys:
['text', 'meta', '_input_hash', '_task_hash', 'spans', 'tokens', '_view_id', 'answer', '_timestamp']

important keys:
['text', 'spans', 'tokens']

example text:
DORNBIRN In der Schulgasse in Dornbirn hat eine 71,93 Quadratmeter große Wohnung für einen Quadratmeterpreis von 5533,71 Euro den Besitzer gewechselt. Dieser beinhaltet auch einen Pkw-Abstellplatz. Käufer der Wohnung mit 9,86 Quadratmetern Terrasse ist die ValLiLean Beteiligungs- und Immobilienverwaltungs GmbH. Beim Verkäufer handelt es sich um die Karrenblick Projekt GmbH.  Der Kaufpreis liegt bei 398.040 Euro. Unterzeichnet wurde der Kaufvertrag am 18. September. Die Verbücherung datiert mit Oktober 2020.

5 example spans:
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'pattern': 2069086582, 'token_start': 0, 'token_end': 0, 'label': 'ORT', 'noWords': 1}
{'start': 16, 'end': 26, 'token_start': 3, 'token_end': 3, 'label': 'STRASSE', 'noWords': 1}
{'text': 'Dornbirn', 'start': 30, 'end': 38, 'pattern': 206

In [2]:
def getLabel(tokenDictList, idx):
    result = "O"
    for dict_i in tokenDictList:
        idx_0, idx_1 = dict_i["start"], dict_i["end"]
        if (idx_0<=idx) and (idx<=idx_1):
            result = dict_i["label"]
    return result 

for j in range(len(annot)): # loop over instances
    a = annot[j]            # instance j
    spans = a["spans"]      # list of annotation dicts
    toks = a["tokens"]      # list of token dicts
    for i in range(len(toks)):                                 # loop over token dicts
        toks[i]["label"] = getLabel(spans, toks[i]["start"])   # assign label from span (if exists, otherwise "O")
    annot[j]["tokens"] = toks

words_n = 3
for i in range(2):
    print("Token dictionaries for the last {} words of instance {}".format(words_n, i))
    ann = annot[i]
    for tok in ann["tokens"][:words_n]:
        print(tok)

Token dictionaries for the last 3 words of instance 0
{'text': 'DORNBIRN', 'start': 0, 'end': 8, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'In', 'start': 9, 'end': 11, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'der', 'start': 12, 'end': 15, 'id': 2, 'ws': True, 'label': 'O'}
Token dictionaries for the last 3 words of instance 1
{'text': 'FELDKIRCH', 'start': 0, 'end': 9, 'id': 0, 'ws': True, 'label': 'ORT'}
{'text': 'Im', 'start': 10, 'end': 12, 'id': 1, 'ws': True, 'label': 'O'}
{'text': 'Altenreuteweg', 'start': 13, 'end': 26, 'id': 2, 'ws': True, 'label': 'STRASSE'}


In [3]:
sents=[] 
for annot_i in annot:                  # loop over instances
    toks = annot_i['tokens']           # get tokens list for instance i
    train_sentence = []
    for tok in toks:                   # loop over token dicts
        if 'label' in tok:             # only if the current token has been labelled, ...
            token_element = (tok['text'], tok['label']) # ... create a "text", "label" pair for this token ...
            train_sentence.append(token_element)        # ... and append it to the list
    sents.append(train_sentence) # append the list for that instances to the list for all instances / sentences

# list of lists of pairs (sets): outer list contains instances and inner list contains (token, label) pairs
sents[:2]

[[('DORNBIRN', 'ORT'),
  ('In', 'O'),
  ('der', 'O'),
  ('Schulgasse', 'STRASSE'),
  ('in', 'O'),
  ('Dornbirn', 'ORT'),
  ('hat', 'O'),
  ('eine', 'O'),
  ('71,93', 'FLAECHE'),
  ('Quadratmeter', 'O'),
  ('große', 'O'),
  ('Wohnung', 'IMMO_TYP'),
  ('für', 'O'),
  ('einen', 'O'),
  ('Quadratmeterpreis', 'O'),
  ('von', 'O'),
  ('5533,71', 'QMPREIS'),
  ('Euro', 'O'),
  ('den', 'O'),
  ('Besitzer', 'O'),
  ('gewechselt', 'O'),
  ('.', 'O'),
  ('Dieser', 'O'),
  ('beinhaltet', 'O'),
  ('auch', 'O'),
  ('einen', 'O'),
  ('Pkw-Abstellplatz', 'O'),
  ('.', 'O'),
  ('Käufer', 'O'),
  ('der', 'O'),
  ('Wohnung', 'O'),
  ('mit', 'O'),
  ('9,86', 'TERRASSENGROESSE'),
  ('Quadratmetern', 'O'),
  ('Terrasse', 'O'),
  ('ist', 'O'),
  ('die', 'O'),
  ('ValLiLean', 'KAEUFER'),
  ('Beteiligungs-', 'KAEUFER'),
  ('und', 'KAEUFER'),
  ('Immobilienverwaltungs', 'KAEUFER'),
  ('GmbH', 'KAEUFER'),
  ('.', 'KAEUFER'),
  ('Beim', 'O'),
  ('Verkäufer', 'O'),
  ('handelt', 'O'),
  ('es', 'O'),
  ('sich', 'O'

In [4]:
# 1. build tokens = list of lists of tokens
# 2. build labels = list of lists of labels
words = []
ner_tags = []
for sent_i in sents:
    words_i = []
    ner_tags_i = []
    for item in sent_i:
        words_i.append(item[0])
        ner_tags_i.append(item[1])
    words.append(words_i)
    ner_tags.append(ner_tags_i)

len(words), len(ner_tags)

(140, 140)

In [5]:
train_ratio = 0.75
train_test_split = round(0.75*len(words) - 0.5) # -0.5 => floor
idx = [i for i in range(len(words))]
print(idx[:5])
idx_shuffle = shuffle(idx, random_state=0)
print(idx_shuffle[:5])
words_shuffle, ner_tags_shuffle = [words[idx_i] for idx_i in idx_shuffle], [ner_tags[idx_i] for idx_i in idx_shuffle]
words_train, words_test = words_shuffle[:train_test_split], words_shuffle[train_test_split:]
ner_tags_train, ner_tags_test = ner_tags_shuffle[:train_test_split], ner_tags_shuffle[train_test_split:]
len(words_train), len(ner_tags_train), len(words_test), len(ner_tags_test)

[0, 1, 2, 3, 4]
[45, 59, 7, 50, 92]


(104, 104, 36, 36)

In [6]:
train_dataset = Dataset.from_dict({"words": words_train, "ner_tags": ner_tags_train})
test_dataset = Dataset.from_dict({"words": words_test, "ner_tags": ner_tags_test})
train_valid_split = train_dataset.train_test_split(shuffle=True, seed=42, test_size=0.25)
untokenizedDatasetDict = DatasetDict({
    "train": train_valid_split["train"],
    "valid": train_valid_split["test"],
    "test": test_dataset
})
untokenizedDatasetDict

DatasetDict({
    train: Dataset({
        features: ['words', 'ner_tags'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['words', 'ner_tags'],
        num_rows: 26
    })
    test: Dataset({
        features: ['words', 'ner_tags'],
        num_rows: 36
    })
})

In [7]:
# set of labels
ner_tag_names = sorted(list(set([ner_tag_ij for ner_tags_i in ner_tags for ner_tag_ij in ner_tags_i])))
ner_tag_names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [8]:
# tokenize and adjust labels
from transformers import AutoTokenizer
checkpoint = "flair/ner-german"  # https://huggingface.co/flair/ner-german (1.41GB)
checkpoint = "fhswf/bert_de_ner" # https://huggingface.co/fhswf/bert_de_ner (419MB)
checkpoint = "bert-base-cased"   # https://huggingface.co/bert-base-cased (416MB)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.is_fast

True

In [15]:
def align_labels_with_tokens(ner_tags, word_ids):
    #print(word_ids)
    new_labels = []
    previous_label = None
    previous_word_id = None
    for word_id in word_ids:
        # handle word_id==None
        if word_id==None:
            label = -100
        # handle word_id==previous_word_id
        elif word_id==previous_word_id:
            label = previous_label
        # handle word_id!=previous_word_id and word_id!=None
        else:
            text_label = ner_tags[word_id]
            label = ner_tag_names.index(text_label)
        new_labels.append(label)
        previous_label = label
        previous_word_id = word_id
    return new_labels

instance = 0
ner_tags = untokenizedDatasetDict["train"][instance]["ner_tags"]
word_ids = tokenizer(untokenizedDatasetDict["train"][instance]["words"], is_split_into_words=True).word_ids()
aligned_labels = align_labels_with_tokens(ner_tags, word_ids)
inputs = tokenizer(untokenizedDatasetDict["train"][instance]["words"], is_split_into_words=True)
for i, token in enumerate(inputs.tokens()):
    alabel = aligned_labels[i]
    tlabel = ner_tag_names[alabel] if alabel>=0 else "SPECIAL TOKEN"
    print(f"index: {i}\ttoken: {token}\tword_id: {word_ids[i]}\taligned label: {alabel}\ttext label: {tlabel}")
len(inputs.tokens()), len(aligned_labels), len(word_ids)

index: 0	token: [CLS]	word_id: None	aligned label: -100	text label: SPECIAL TOKEN
index: 1	token: L	word_id: 0	aligned label: 7	text label: ORT
index: 2	token: ##US	word_id: 0	aligned label: 7	text label: ORT
index: 3	token: ##TE	word_id: 0	aligned label: 7	text label: ORT
index: 4	token: ##NA	word_id: 0	aligned label: 7	text label: ORT
index: 5	token: ##U	word_id: 0	aligned label: 7	text label: ORT
index: 6	token: In	word_id: 1	aligned label: 6	text label: O
index: 7	token: der	word_id: 2	aligned label: 6	text label: O
index: 8	token: Stein	word_id: 3	aligned label: 9	text label: STRASSE
index: 9	token: ##ack	word_id: 3	aligned label: 9	text label: STRASSE
index: 10	token: ##ers	word_id: 3	aligned label: 9	text label: STRASSE
index: 11	token: ##tra	word_id: 3	aligned label: 9	text label: STRASSE
index: 12	token: ##ße	word_id: 3	aligned label: 9	text label: STRASSE
index: 13	token: 26	word_id: 4	aligned label: 9	text label: STRASSE
index: 14	token: in	word_id: 5	aligned label: 6	text l

(176, 176, 176)

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs
#
gNerDatasetDict = untokenizedDatasetDict.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=untokenizedDatasetDict["train"].column_names
)
gNerDatasetDict

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 36
    })
})

In [11]:
from datasets import ClassLabel
gNerDatasetDict["train"].features["labels"].feature.names = ClassLabel(names=ner_tag_names)
print(gNerDatasetDict["train"].features["labels"].feature.names)
gNerDatasetDict["train"].features["labels"].feature.names.names

ClassLabel(num_classes=12, names=['DATUM_VERBUECHERUNG', 'DATUM_VERTRAG', 'FLAECHE', 'GESAMTPREIS', 'IMMO_TYP', 'KAEUFER', 'O', 'ORT', 'QMPREIS', 'STRASSE', 'TERRASSENGROESSE', 'VERKAEUFER'], id=None)


['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [12]:
gNerDatasetDict.save_to_disk("gNERdataset")

In [13]:
from datasets import load_from_disk
gNerDatasetCheck = load_from_disk("gNERdataset")
# Note, however, that the "ClassLabel" list with the names of the NER tags is not available, anymore!
gNerDatasetCheck

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 36
    })
})

$\checkmark$