In [1]:
import json

from datasets import Dataset
from transformers import DataCollatorForTokenClassification
from extr_ds.manager.utils.filesystem import load_document
import numpy
import evaluate
from transformers.keras_callbacks import KerasMetricCallback
import tensorflow as tf
from transformers import AutoTokenizer, \
    TFAutoModelForTokenClassification
from transformers import pipeline

import warnings
warnings.filterwarnings('ignore')



In [2]:
# Training configuration shared by the cells below.
epochs = 3
model_checkpoint = 'bert-base-cased'
model_output_checkpoint = 'transformers/nfl_pbp_token_classifier'

# Tag inventory. A tag's position in this list is its integer class id.
labels = [
    'abbr', 'orth', 'MorfDef', 'gramGrp', 'label', 'citRange',
    'quote', 'citedRange', 'hi', 'RegDef', 'def', 'cit',
    'term', 'norm', 'form', 'bibl', 'usg',
]

# Both lookup directions are derived from the list order above.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [3]:
def align_labels(tokenized_inputs, label_list, label_to_id=None):
    """Expand word-level labels to one label id per tokenizer output token.

    Every token that maps back to a word receives that word's label id, so
    all sub-word pieces of one word share the same id. Tokens for which
    ``word_ids`` yields ``None`` receive -100, the sentinel that
    ``compute_metrics`` filters out.

    Args:
        tokenized_inputs: tokenizer output exposing
            ``word_ids(batch_index=0)``.
        label_list: label names indexed by word position.
        label_to_id: optional mapping from label name to integer id. When
            omitted, the notebook-level ``label2id`` is used, which keeps
            existing two-argument calls working unchanged.

    Returns:
        A list with one integer id per entry returned by ``word_ids``.

    Raises:
        KeyError: if a label name is missing from the mapping.
        IndexError: if a word index falls outside ``label_list``.
    """
    # Resolve the mapping lazily so the global is only touched when needed.
    mapping = label2id if label_to_id is None else label_to_id

    return [
        -100 if word_idx is None else mapping[label_list[word_idx]]
        for word_idx in tokenized_inputs.word_ids(batch_index=0)
    ]


def get_dataset(tokenizer, model):
    """Build the training and evaluation TensorFlow datasets.

    Reads the records stored in 'dataset/dataset.json', splits them
    positionally (first 80% for training, the rest for evaluation; the
    records are not shuffled before the split), tokenizes each record and
    attaches token-level label ids via ``align_labels``.

    Args:
        tokenizer: tokenizer called on ``record['tokens']`` with
            ``is_split_into_words=True``.
        model: model whose ``prepare_tf_dataset`` builds the batched datasets.

    Returns:
        tuple: ``(tf_train_set, tf_test_set)``.
    """
    def tokenize_and_align_labels(record):
        # Each record supplies pre-split words in 'tokens' and one label
        # name per word in 'labels'.
        tokenized_inputs = tokenizer(
            record['tokens'],
            truncation=True,
            is_split_into_words=True
        )

        tokenized_inputs['labels'] = align_labels(
            tokenized_inputs,
            record['labels']
        )

        return tokenized_inputs

    def build_tf_dataset(records, shuffle):
        # Shared construction path for both splits.
        return model.prepare_tf_dataset(
            Dataset.from_list(records).map(
                tokenize_and_align_labels,
                batched=False
            ),
            shuffle=shuffle,
            collate_fn=data_collator,
        )

    ents_dataset = json.loads(
        load_document('dataset/dataset.json')
    )

    pivot = int(len(ents_dataset) * .8)
    data_collator = DataCollatorForTokenClassification(
        tokenizer,
        return_tensors='tf'
    )

    tf_train_set = build_tf_dataset(ents_dataset[:pivot], shuffle=True)

    # The evaluation split must not be shuffled: prepare_tf_dataset defaults
    # drop_remainder to the value of shuffle, so shuffle=True would drop the
    # final partial batch and leave those examples out of validation loss
    # and metrics.
    tf_test_set = build_tf_dataset(ents_dataset[pivot:], shuffle=False)

    return tf_train_set, tf_test_set


# Metric object loaded once at module level; compute_metrics calls its
# compute() method on every evaluation.
seqeval = evaluate.load('seqeval')


def compute_metrics(preds):
    """Score predictions against reference label ids with seqeval.

    Args:
        preds: a pair ``(predictions, actuals)``. Predictions are reduced
            with ``argmax`` over axis 2 to obtain class ids.
            # assumes predictions is (batch, sequence, num_labels) - TODO confirm

    Returns:
        dict with keys 'precision', 'recall', 'f1' and 'accuracy', taken
        from the corresponding ``overall_*`` entries of the seqeval result.
    """
    raw_scores, gold_ids = preds
    predicted_ids = numpy.argmax(raw_scores, axis=2)

    predicted_tags = []
    reference_tags = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only positions whose reference id is not the -100 sentinel.
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, gold_row)
            if gold_id != -100
        ]
        predicted_tags.append([labels[pred_id] for pred_id, _ in kept])
        reference_tags.append([labels[gold_id] for _, gold_id in kept])

    results = seqeval.compute(
        predictions=predicted_tags,
        references=reference_tags
    )

    scores = {}
    for key in ['precision', 'recall', 'f1', 'accuracy']:
        scores[key] = results[f'overall_{key}']
    return scores

In [4]:
# Pretrained tokenizer plus a token-classification model sized to the tag set.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
)

tf_train_set, tf_test_set = get_dataset(tokenizer, model)

# Metrics from compute_metrics are evaluated on the held-out set.
metric_callback = KerasMetricCallback(
    metric_fn=compute_metrics,
    eval_dataset=tf_test_set,
)
# NOTE(review): patience=3 together with epochs = 3 looks unable to ever
# trigger an early stop - confirm the intended values.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
)
callbacks = [metric_callback, early_stopping]

# No loss argument is passed to compile(); only the optimizer is configured.
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(optimizer=optimizer)



All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/529 [00:00<?, ? examples/s]

Map:   0%|          | 0/133 [00:00<?, ? examples/s]

In [5]:
# Train on tf_train_set for `epochs` epochs, validating against tf_test_set.
model.fit(
    tf_train_set,
    epochs=epochs,
    validation_data=tf_test_set,
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1828308aa40>

In [6]:
# Write the tokenizer and the model to the same output directory, tokenizer first.
tokenizer.save_pretrained(model_output_checkpoint)
model.save_pretrained(model_output_checkpoint)

In [7]:
# Sample dictionary entry used as a smoke test for the saved checkpoint.
examples = [
  'DISFAVORITOR, -OÂRE adj. (învechit, rar) Care defavorizează. A tractarisi şi a-şi dobândi oarecare tocmele mai puţin disfavoritoare. AR (1829), 1071/34.     - Pl.: disfavoritori, -oare. — De la disfavoare.',
]

# Build the NER pipeline from the directory written by the save step.
classifier = pipeline(
    task='ner',
    model=model_output_checkpoint,
    aggregation_strategy='simple',
)

responses = classifier(examples)
print(responses)

Some layers from the model checkpoint at transformers/nfl_pbp_token_classifier were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at transformers/nfl_pbp_token_classifier.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


[[{'entity_group': 'orth', 'score': 0.98880124, 'word': 'DISFAVORITOR, - OÂRE', 'start': 0, 'end': 19}, {'entity_group': 'gramGrp', 'score': 0.9227743, 'word': 'adj', 'start': 20, 'end': 23}, {'entity_group': 'MorfDef', 'score': 0.9345763, 'word': '.', 'start': 23, 'end': 24}, {'entity_group': 'usg', 'score': 0.9205247, 'word': '( învechit, rar )', 'start': 25, 'end': 40}, {'entity_group': 'RegDef', 'score': 0.80014676, 'word': 'Care defavorizează. A tractarisi şi a', 'start': 41, 'end': 78}, {'entity_group': 'quote', 'score': 0.35433328, 'word': '-', 'start': 78, 'end': 79}, {'entity_group': 'RegDef', 'score': 0.49321067, 'word': 'şi dobândi oarecare tocmele mai puţin disfavoritoare.', 'start': 79, 'end': 132}, {'entity_group': 'abbr', 'score': 0.89284194, 'word': 'AR ( 1829 ), 1071 / 34.', 'start': 133, 'end': 152}, {'entity_group': 'form', 'score': 0.90721315, 'word': '- Pl. : dis', 'start': 157, 'end': 167}, {'entity_group': 'hi', 'score': 0.72690976, 'word': '##favoritori,', 'star