In [1]:
from datasets import load_from_disk
gNerDataset = load_from_disk("gNERdataset")
gNerDataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 105
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 35
    })
})

In [2]:
ner_tag_names = ['DATUM_VERBUECHERUNG',
    'DATUM_VERTRAG',
    'FLAECHE',
    'GESAMTPREIS',
    'IMMO_TYP',
    'KAEUFER',
    'O',
    'ORT',
    'QMPREIS',
    'STRASSE',
    'TERRASSENGROESSE',
    'VERKAEUFER'
]
ner_tag_names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

In [3]:
from transformers import AutoTokenizer
checkpoint = "flair/ner-german"       # https://huggingface.co/flair/ner-german (1.41GB)
checkpoint = "fhswf/bert_de_ner"      # https://huggingface.co/fhswf/bert_de_ner (419MB)
checkpoint = "bert-base-cased"        # https://huggingface.co/bert-base-cased (416MB)
checkpoint = "bert-base-german-cased" # https://huggingface.co/bert-base-german-cased (419MB)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.is_fast

True

In [4]:
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([gNerDataset["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    7,    7,    7,    7,    7,    6,    6,    9,    9,    9,    9,
            9,    6,    7,    7,    7,    6,    6,    6,    2,    2,    2,    6,
            6,    6,    6,    6,    6,    6,    4,    4,    4,    4,    6,   10,
           10,   10,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    5,    5,    5,    5,    5,    5,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    8,    8,    8,    8,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    1,    1,    1,    1,    6,    6,
            6,    6,    6,  

In [5]:
#from datasets import load_metric
#metric = load_metric("f1")
#train_0_labels = gNerDataset["train"][0]["labels"]
#num_labels = []
#for label in train_0_labels:
#    if label>=0:
#        num_labels.append(label)
#fudged_num_labels = num_labels.copy()
#for i in range(45):
#    fudged_num_labels[i] = 2
#print(num_labels)
#print(fudged_num_labels)
#metric.compute(predictions=fudged_num_labels, references=num_labels, average="macro")

In [6]:
id2label = {str(i): label for i, label in enumerate(ner_tag_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

({'0': 'DATUM_VERBUECHERUNG',
  '1': 'DATUM_VERTRAG',
  '2': 'FLAECHE',
  '3': 'GESAMTPREIS',
  '4': 'IMMO_TYP',
  '5': 'KAEUFER',
  '6': 'O',
  '7': 'ORT',
  '8': 'QMPREIS',
  '9': 'STRASSE',
  '10': 'TERRASSENGROESSE',
  '11': 'VERKAEUFER'},
 {'DATUM_VERBUECHERUNG': '0',
  'DATUM_VERTRAG': '1',
  'FLAECHE': '2',
  'GESAMTPREIS': '3',
  'IMMO_TYP': '4',
  'KAEUFER': '5',
  'O': '6',
  'ORT': '7',
  'QMPREIS': '8',
  'STRASSE': '9',
  'TERRASSENGROESSE': '10',
  'VERKAEUFER': '11'})

In [8]:
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)
model.config.num_labels

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

12

In [9]:
# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.compute_metrics
import numpy as np
from sklearn.metrics import f1_score
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    labels = np.array(labels)
    predictions = np.argmax(logits, axis=-1)
    label_names = ner_tag_names
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels] # list of lists
    true_predictions = [                                                             # list of lists
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    flat_true_labels = [label for true_labels_i in true_labels for label in true_labels_i]                   # list
    flat_true_predictions = [pred for true_predictions_i in true_predictions for pred in true_predictions_i] # list
    macroF1 = f1_score(flat_true_labels, flat_true_predictions, average="macro")
    return {"F1 macro": macroF1}

In [12]:
# training arguments
from transformers import TrainingArguments
args = TrainingArguments(
    f"gNER_{checkpoint}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.compute_metrics
    learning_rate=1e-4,
    # https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_utils.py#L356
    lr_scheduler_type="cosine_with_restarts",
    num_train_epochs=1,#40,
    weight_decay=0.005,
    push_to_hub=False
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


Save and reload the best model: https://discuss.huggingface.co/t/save-only-best-model-in-trainer/8442/4

Use trained model for inference: https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378

In [13]:
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=gNerDataset["train"],
    eval_dataset=gNerDataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)
trainer.train()

***** Running training *****
  Num examples = 105
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28


Epoch,Training Loss,Validation Loss,F1 macro
1,No log,0.585513,0.437183
2,No log,0.438411,0.585334


***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
Saving model checkpoint to gNER_bert-base-german-cased/checkpoint-14
Configuration saved in gNER_bert-base-german-cased/checkpoint-14/config.json
Model weights saved in gNER_bert-base-german-cased/checkpoint-14/pytorch_model.bin
tokenizer config file saved in gNER_bert-base-german-cased/checkpoint-14/tokenizer_config.json
Special tokens file saved in gNER_bert-base-german-cased/checkpoint-14/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 35
  Batch size = 8
Saving model checkpoint to gNER_bert-base-german-cased/checkpoint-28
Configuration saved in gNER_bert-base-german-cased/checkpoint-28/config.json
Model weights saved in gNER_bert-base-german-cased/checkpoint-28/pytorch_model.bin
tokenizer config file saved in gNER_bert-base-german-cased/checkpoint-28/tokenizer_config.json
Special tokens file saved in gNER_bert-base-german-cased/checkpoint-28/special_tokens_map.json


Training completed. Do n

TrainOutput(global_step=28, training_loss=0.8204197202410016, metrics={'train_runtime': 616.942, 'train_samples_per_second': 0.34, 'train_steps_per_second': 0.045, 'total_flos': 20763231241608.0, 'train_loss': 0.8204197202410016, 'epoch': 2.0})

$\checkmark$