In [1]:
from datasets import load_from_disk
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification
import numpy as np
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer


In [2]:
gNerDataset = load_from_disk("gNERdataset")
gNerDataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 78
    })
    valid: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 26
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 36
    })
})

In [3]:
ner_tag_names = ['DATUM_VERBUECHERUNG',
    'DATUM_VERTRAG',
    'FLAECHE',
    'GESAMTPREIS',
    'IMMO_TYP',
    'KAEUFER',
    'O',
    'ORT',
    'QMPREIS',
    'STRASSE',
    'TERRASSENGROESSE',
    'VERKAEUFER'
]
ner_tag_names

['DATUM_VERBUECHERUNG',
 'DATUM_VERTRAG',
 'FLAECHE',
 'GESAMTPREIS',
 'IMMO_TYP',
 'KAEUFER',
 'O',
 'ORT',
 'QMPREIS',
 'STRASSE',
 'TERRASSENGROESSE',
 'VERKAEUFER']

https://discuss.huggingface.co/t/batch-k-torch-tensor-f-k-for-f-in-features-valueerror-expected-sequence-of-length-3-at-dim-1-got-4/1354/4

In [4]:
checkpoint = "flair/ner-german"       # https://huggingface.co/flair/ner-german (1.41GB)
checkpoint = "fhswf/bert_de_ner"      # https://huggingface.co/fhswf/bert_de_ner (419MB)
checkpoint = "bert-base-cased"        # https://huggingface.co/bert-base-cased (416MB)
checkpoint = "bert-base-german-cased" # https://huggingface.co/bert-base-german-cased (419MB)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.is_fast

True

In [5]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
batch = data_collator([gNerDataset["train"][i] for i in range(2)])
batch["labels"]

tensor([[-100,    7,    7,    7,    7,    7,    6,    6,    9,    9,    9,    9,
            9,    9,    6,    7,    7,    7,    6,    6,    6,    2,    2,    2,
            6,    6,    6,    6,    6,    6,    6,    4,    4,    4,    4,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    8,    8,    8,
            8,    8,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,    6,    6,    6,   10,   10,   10,    6,    6,    6,
            6,    6,    6,    6,    6,    6,    6,    6,    6,    6,    6,   11,
           11,   11,   11,   11,   11,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    3,    3,    3,    6,    6,    6,    6,    6,    6,    6,
            6,    6,    6,  

In [6]:
#from datasets import load_metric
#metric = load_metric("f1")
#train_0_labels = gNerDataset["train"][0]["labels"]
#num_labels = []
#for label in train_0_labels:
#    if label>=0:
#        num_labels.append(label)
#fudged_num_labels = num_labels.copy()
#for i in range(45):
#    fudged_num_labels[i] = 2
#print(num_labels)
#print(fudged_num_labels)
#metric.compute(predictions=fudged_num_labels, references=num_labels, average="macro")

In [7]:
id2label = {str(i): label for i, label in enumerate(ner_tag_names)}
label2id = {v: k for k, v in id2label.items()}
id2label, label2id

({'0': 'DATUM_VERBUECHERUNG',
  '1': 'DATUM_VERTRAG',
  '2': 'FLAECHE',
  '3': 'GESAMTPREIS',
  '4': 'IMMO_TYP',
  '5': 'KAEUFER',
  '6': 'O',
  '7': 'ORT',
  '8': 'QMPREIS',
  '9': 'STRASSE',
  '10': 'TERRASSENGROESSE',
  '11': 'VERKAEUFER'},
 {'DATUM_VERBUECHERUNG': '0',
  'DATUM_VERTRAG': '1',
  'FLAECHE': '2',
  'GESAMTPREIS': '3',
  'IMMO_TYP': '4',
  'KAEUFER': '5',
  'O': '6',
  'ORT': '7',
  'QMPREIS': '8',
  'STRASSE': '9',
  'TERRASSENGROESSE': '10',
  'VERKAEUFER': '11'})

In [8]:
model = AutoModelForTokenClassification.from_pretrained(
    checkpoint,
    id2label=id2label,
    label2id=label2id
)
model.config.num_labels

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

12

In [9]:
# https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.compute_metrics
def compute_metrics(eval_preds):
    logits, labels = eval_preds
    labels = np.array(labels)
    #print(labels.shape)
    predictions = np.argmax(logits, axis=-1)
    label_names = ner_tag_names
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels] # list of lists
    true_predictions = [                                                             # list of lists
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    flat_true_labels = [label for true_labels_i in true_labels for label in true_labels_i]                   # list
    flat_true_predictions = [pred for true_predictions_i in true_predictions for pred in true_predictions_i] # list
    macroF1 = f1_score(flat_true_labels, flat_true_predictions, average="macro")
    return {"F1 macro": macroF1}

## Zero-shot baseline
https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378

In [10]:
# https://huggingface.co/transformers/v4.12.5/main_classes/trainer.html#trainingarguments
output_dir = f"gNER_{checkpoint}"
# using the model without further training
# arguments for Trainer
test_args = TrainingArguments(
    output_dir=output_dir,
    do_train=False,
    do_predict=True,
    dataloader_drop_last=False
)
# init trainer
trainer = Trainer(
    model=model, 
    args=test_args,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)
test_results = trainer.predict(gNerDataset["valid"])
print(f"ZERO-SHOT BASELINE\nmacro averaged F_1 score:\n{test_results.metrics['test_F1 macro']}")

***** Running Prediction *****
  Num examples = 26
  Batch size = 8


ZERO-SHOT BASELINE
macro averaged F_1 score:
0.01803483295190192


## Training

Use trained model for inference: https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378<br>
Save and reload the best model: https://discuss.huggingface.co/t/save-only-best-model-in-trainer/8442

Hyperparameter search:
- hyperparameter search 1: https://discuss.huggingface.co/t/using-hyperparameter-search-in-trainer/785/10
- hyperparameter search 2: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer.hyperparameter_search

Load best model at end of training: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.load_best_model_at_end

Summary of the HF forum [thread](https://discuss.huggingface.co/t/save-only-best-model-in-trainer/8442) on saving and reloading the best model: Use `TrainingArguments` with arguments as listed below:

In [11]:
# training arguments
train_args = TrainingArguments(
    output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True, # loads checkpoint with lowest loss on validation set
    learning_rate=5e-5,
    # https://github.com/huggingface/transformers/blob/main/src/transformers/trainer_utils.py#L356
    # "linear" "cosine" "cosine_with_restarts" "polynomial", "constant", "constant_with_warmup"
    lr_scheduler_type="cosine",
    num_train_epochs=1, # 15
    weight_decay=0.03,
    push_to_hub=False
)
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=gNerDataset["train"], # "train"
    eval_dataset=gNerDataset["valid"],  # "valid"
    # https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.compute_metrics
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=tokenizer
)
trainer.train()
# learning rate, weight decay, lr_scheduler_type,   valid F1  valid loss
# 5e-5           0.001         linear               0.868     0.207
# 5e-5           0.001         cosine               0.901     0.192
# 5e-5           0.001         cosine_with_restarts 0.895     0.239
# 5e-5           0.001         polynomial           0.885     0.212
# 5e-5           0.001         constant             0.873     0.195
# 5e-5           0.001         constant_with_warmup 0.875     0.193

# 5e-5           0.005         cosine               0.885     0.208
# 5e-5           0.01          cosine               0.890     0.208
# 5e-5           0.03          cosine               0.894     0.189

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 78
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10


Epoch,Training Loss,Validation Loss,F1 macro
1,No log,1.334531,0.068033


***** Running Evaluation *****
  Num examples = 26
  Batch size = 8
Saving model checkpoint to gNER_bert-base-german-cased/checkpoint-10
Configuration saved in gNER_bert-base-german-cased/checkpoint-10/config.json
Model weights saved in gNER_bert-base-german-cased/checkpoint-10/pytorch_model.bin
tokenizer config file saved in gNER_bert-base-german-cased/checkpoint-10/tokenizer_config.json
Special tokens file saved in gNER_bert-base-german-cased/checkpoint-10/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from gNER_bert-base-german-cased/checkpoint-10 (score: 1.334531307220459).


TrainOutput(global_step=10, training_loss=1.573901081085205, metrics={'train_runtime': 250.8118, 'train_samples_per_second': 0.311, 'train_steps_per_second': 0.04, 'total_flos': 8669507776848.0, 'train_loss': 1.573901081085205, 'epoch': 1.0})

Resulting score

In [12]:
# https://discuss.huggingface.co/t/using-trainer-at-inference-time/9378/7
test_results = trainer.predict(gNerDataset["valid"]) # use "test" instead of "valid" when done with development
test_results.metrics["test_F1 macro"]

***** Running Prediction *****
  Num examples = 26
  Batch size = 8


0.0680331443523768

In [13]:
# https://discuss.huggingface.co/t/model-inference-on-tokenized-dataset/14820

$\checkmark$