In [1]:
%pip install datasets==2.12.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datasets import load_dataset
from collections import Counter
from conlleval import evaluate
import tensorflow_hub as hub
import tensorflow_text as text


  from .autonotebook import tqdm as notebook_tqdm


## Bert Model

In [3]:
tfhub_handle_encoder="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
tfhub_handle_preprocess="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)

2023-06-08 17:44:17.492992: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-08 17:44:17.633435: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-08 17:44:17.635191: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-06-08 17:44:17.659187: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [4]:
class NERModelBert(keras.Model):
    def __init__(self, 
                 num_tags,
                 dropout_rate=0.1, 
                 tfhub_handle_encoder="https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1",
                 tfhub_handle_preprocess="https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3",
                ):
        super().__init__()
        # text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name="text")
        self.preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name="preprocessing")
        self.encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name="BERT_encoder")
        self.dropout = layers.Dropout(dropout_rate)
        self.ff_final = layers.Dense(num_tags, activation="softmax")                                                  
    
    def call(self, text_input):
        encoder_inputs = self.preprocessing_layer(text_input)
        outputs = self.encoder(encoder_inputs)
        net = outputs["sequence_output"]
        net = self.dropout(net)
        net = self.ff_final(net)
        return net




## Load the CoNLL 2003 dataset from the datasets library and process it


In [5]:
conll_data = load_dataset("conll2003")


100%|██████████| 3/3 [00:00<00:00,  7.49it/s]


We will export this data to a tab-separated file format which will be easy to read as a
`tf.data.Dataset` object.

In [6]:
def export_to_file(export_file_path, data):
    with open(export_file_path, "w") as f:
        for record in data:
            ner_tags = record["ner_tags"]
            N = 128
            ner_tags += ['-1'] * (N - len(ner_tags))
            
            tokens = record["tokens"]
            if len(tokens) > 0:
                f.write(
                    " ".join(tokens)
                    + "\t"
                    + "\t".join(map(str, ner_tags))
                    + "\n"
                )

In [8]:
os.mkdir("data1")
export_to_file("./data1/conll_train.txt", conll_data["train"])
export_to_file("./data1/conll_val.txt", conll_data["validation"])

In [9]:
train_data = tf.data.TextLineDataset("./data1/conll_train.txt")
val_data = tf.data.TextLineDataset("./data1/conll_val.txt")

In [10]:
print(list(train_data.take(1).as_numpy_iterator()))


[b'EU rejects German call to boycott British lamb .\t3\t0\t7\t0\t0\t0\t7\t0\t0\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1\t-1']


In [15]:
def make_tag_lookup_table():
    iob_labels = ["B", "I"]
    ner_labels = ["PER", "ORG", "LOC", "MISC"]
    all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels]
    all_labels = ["-".join([a, b]) for a, b in all_labels]
    all_labels = ["[PAD]", "O"] + all_labels
    return dict(zip(range(0, len(all_labels) + 1), all_labels))

In [16]:
mapping = make_tag_lookup_table()
print(mapping)
num_tags = len(mapping)

{0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'B-MISC', 9: 'I-MISC'}


We will be using the following map function to transform the data in the dataset:


In [9]:
def map_record_to_training_data_with_bert(record_input):
    records = tf.strings.split(record_input, sep="\t")
    input = records[0:1][0]
    tags = records[1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int32)
    tags += 1
    return input, tags

In [11]:
batch_size = 32

In [12]:
train_dataset = (
    train_data.map(map_record_to_training_data_with_bert)
    .batch(batch_size)
)

In [13]:
val_dataset = (
    val_data.map(map_record_to_training_data_with_bert)
    .batch(batch_size)
)

In [17]:
ner_model_bert = NERModelBert(num_tags)

We will be using a custom loss function that will ignore the loss from padded tokens.

In [254]:
class CustomNonPaddingTokenLoss(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)
    
    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)
    

In [255]:
loss = CustomNonPaddingTokenLoss()

## Compile and fit the model

In [256]:
ner_model_bert.compile(optimizer="adam", loss=loss)
ner_model_bert.fit(train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f60df5d7490>

Sample inference using the trained model

In [243]:
sample_input = ["eu rejects german call to boycott british lamb"]
sample_output = [2,1,2,0,0,]

In [244]:
output = ner_model_bert.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [mapping[i] for i in prediction]

In [245]:
print(prediction)

['[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 

In [230]:
output

array([[[5.8023623e-05, 6.4230782e-01, 5.8863591e-02, ...,
         8.1768315e-03, 2.4920182e-02, 5.8613606e-03],
        [4.3413165e-05, 7.6970142e-01, 4.0450260e-02, ...,
         5.3748270e-03, 2.3849668e-02, 4.4001639e-03],
        [3.4169167e-05, 8.3732164e-01, 2.9337905e-02, ...,
         3.8941987e-03, 2.0997340e-02, 3.3079602e-03],
        ...,
        [3.4174427e-05, 8.3730841e-01, 2.9340724e-02, ...,
         3.8945919e-03, 2.0996951e-02, 3.3082303e-03],
        [3.4176028e-05, 8.3731008e-01, 2.9340776e-02, ...,
         3.8945868e-03, 2.0996531e-02, 3.3083595e-03],
        [3.4178611e-05, 8.3728075e-01, 2.9345147e-02, ...,
         3.8951596e-03, 2.0998575e-02, 3.3087248e-03]]], dtype=float32)

In [225]:
ner_model_bert.compute_loss()

TypeError: 'NoneType' object is not callable

## Metrics calculation

Here is a function to calculate the metrics. The function calculates F1 score for the
overall NER dataset as well as individual scores for each NER tag.

In [39]:
def calculate_metrics(dataset):
    all_true_tag_ids, all_predicted_tag_ids = [], []

    for x, y in dataset:
        output = ner_model_bert.predict(x)
        predictions = np.argmax(output, axis=-1)
        predictions = np.reshape(predictions, [-1])

        true_tag_ids = np.reshape(y, [-1])

        mask = (true_tag_ids > 0) & (predictions > 0)
        true_tag_ids = true_tag_ids[mask]
        predicted_tag_ids = predictions[mask]

        all_true_tag_ids.append(true_tag_ids)
        all_predicted_tag_ids.append(predicted_tag_ids)

    all_true_tag_ids = np.concatenate(all_true_tag_ids)
    all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids)

    predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids]
    real_tags = [mapping[tag] for tag in all_true_tag_ids]

    evaluate(real_tags, predicted_tags)

In [38]:
calculate_metrics(val_dataset)

processed 51362 tokens with 5942 phrases; found: 5295 phrases; correct: 3855.
accuracy:  62.69%; (non-O)
accuracy:  93.39%; precision:  72.80%; recall:  64.88%; FB1:  68.61
              LOC: precision:  83.45%; recall:  79.86%; FB1:  81.61  1758
             MISC: precision:  74.45%; recall:  65.73%; FB1:  69.82  814
              ORG: precision:  65.34%; recall:  61.00%; FB1:  63.09  1252
              PER: precision:  65.53%; recall:  52.33%; FB1:  58.19  1471


In [40]:
calculate_metrics(val_dataset)

processed 51362 tokens with 5942 phrases; found: 5947 phrases; correct: 4056.
accuracy:  65.99%; (non-O)
accuracy:  93.18%; precision:  68.20%; recall:  68.26%; FB1:  68.23
              LOC: precision:  74.17%; recall:  83.94%; FB1:  78.75  2079
             MISC: precision:  75.56%; recall:  61.71%; FB1:  67.94  753
              ORG: precision:  58.39%; recall:  61.74%; FB1:  60.02  1418
              PER: precision:  65.82%; recall:  60.64%; FB1:  63.13  1697


## Conclusions

In this exercise, we created a simple transformer based named entity recognition model.
We trained it on the CoNLL 2003 shared task data and got an overall F1 score of around 70%.
State of the art NER models fine-tuned on pretrained models such as BERT or ELECTRA can easily
get much higher F1 score -between 90-95% on this dataset owing to the inherent knowledge
of words as part of the pretraining process and the usage of subword tokenization.

You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ner-with-transformers)
and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ner_with_transformers)."""


In [41]:
tf.keras.utils.plot_model(ner_model, to_file="a.png", show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [42]:
vocab_size

20000

In [43]:
num_tags

10

In [233]:
def map_record_to_training_data_with_bert(record_input):
    records = tf.strings.split(record_input, sep="\t")
    input = records[0:1][0]
    tags = records[1 :]
    tags = tf.strings.to_number(tags, out_type=tf.float32)
    tags += 1
    tags /=10
    return input, tags

def m2(inp1,inp2):
    return bert_preprocess_model(inp1),inp2

In [234]:
# for x,y in train_data.map(map_record_to_training_data_with_bert).batch(4).take(2).as_numpy_iterator():
#     print(y)
    
for x,y in train_data.map(map_record_to_training_data_with_bert).batch(4).map(m2).take(1).as_numpy_iterator():
    print(len(x['input_word_ids'][0]))
    print(y)

128
[[0.4 0.1 0.8 0.1 0.1 0.1 0.8 0.1 0.1 0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0. ]
 [0.2 0.3 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.

In [None]:
for x  in train_data.map(map_record_to_training_data_with_bert).batch(1).take(2).as_numpy_iterator():
    print(x)

In [63]:
for x  in train_data.take(1).as_numpy_iterator():
    print(x)

b'9\tEU\trejects\tGerman\tcall\tto\tboycott\tBritish\tlamb\t.\t3\t0\t7\t0\t0\t0\t7\t0\t0'


In [99]:
value = tf.constant([[[b'a',b'b']]])
split = tf.split(value, num_or_size_splits = value.shape[1], axis = 1)
string = tf.strings.join(split," ")

In [100]:
string

<tf.Tensor: shape=(1, 1, 2), dtype=string, numpy=array([[[b'a', b'b']]], dtype=object)>

In [60]:
N = 5
a = [1]
b = map(lambda x, y: y if x is None else x, a, ['']*N)


In [61]:
b

<map at 0x7f64a788c4d0>

In [62]:
a += [''] * (N - len(a))

In [63]:
a

[1, '', '', '', '']