##### Copyright 2021 The TensorFlow Authors.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# TensorFlow Addons Layers: CRF

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/addons/tutorials/layers_crf"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/addons/blob/master/docs/tutorials/layers_crf.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/addons/blob/master/docs/tutorials/layers_crf.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
      <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/addons/docs/tutorials/layers_crf.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
</table>

## Overview

This notebook will demonstrate how to use the CRF (Conditional Random Field) layer in TensorFlow Addons.

You will learn how to use the CRF layer in two ways by building NER models.

## Setup

In [None]:
# Install the notebook's dependencies quietly (`-q`): TensorFlow Addons supplies
# the CRF layer used below, and `datasets` supplies the CoNLL 2003 corpus.
!pip install -q tensorflow-addons  # version >= 0.15.0 is required
!pip install -q tensorflow
!pip install -q datasets

In [None]:
import copy

import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
import datasets

## Training data

Loading the CoNLL 2003 dataset by using the datasets library.

In [None]:
# Load the CoNLL 2003 corpus; it is downloaded on first use and read from the
# local cache on subsequent calls.
conll_data = datasets.load_dataset("conll2003")

Downloading:   0%|          | 0.00/2.60k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 4.63 MiB, generated: 9.78 MiB, post-processed: Unknown size, total: 14.41 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6...


  0%|          | 0/3 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/650k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/146k [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?it/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/40e7cb6bcc374f7c349c83acd1e9352a4f09474eb691f64f364ee62eb65d0ca6. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Inspect the data splits and features:

In [None]:
# Display the dataset dictionary: its splits, their feature columns and row counts.
conll_data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

Get a sample of train data and print it out:

In [None]:
# Take the first training example by index instead of starting a loop over the
# whole split and breaking out of it after one iteration.
item = conll_data["train"][0]
sample_tokens = item["tokens"]      # list of token strings
sample_tag_ids = item["ner_tags"]   # list of integer tag ids, one per token

print(sample_tokens)
print(sample_tag_ids)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
[3, 0, 7, 0, 0, 0, 7, 0, 0]


For our NER model, the inputs are the tokens, which are a list of strings. The outputs are the NER tags, which are stored in the dataset as tag ids.

The dataset also provides the mapping between NER tags and their ids.

In [None]:
# The dataset builder exposes the dataset's metadata. The `ner_tags` feature
# carries the list of tag names; a tag id is an index into that list.
dataset_builder = datasets.load_dataset_builder('conll2003')
raw_tags = dataset_builder.info.features['ner_tags'].feature.names
print(raw_tags)

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


Let us decode the NER tag ids to tags.

In [None]:
# Translate each tag id of the sample sentence into its tag name.
sample_tags = [raw_tags[tag_id] for tag_id in sample_tag_ids]

print(sample_tokens)
print(sample_tags)

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


These tags encode the named entities using a tagging scheme. In this dataset, tags are encoded in the [IOB](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) format.

Add a special tag `<PAD>` to the tag set, which is used to represent padding in the sequence. In NLP, 0 is usually used to mark padding. This is the default setting for many functions in machine learning software (including TensorFlow).

Create a list to convert tag ids to tag text.

In [None]:
# Put `<PAD>` at index 0 and shift every original tag one position to the right.
tags = ['<PAD>', *raw_tags]
print(tags)

['<PAD>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']


Define some constants which will be used later.

In [None]:
# Number of tags, including the `<PAD>` tag at index 0.
TAG_SIZE = len(tags)
# Maximum vocabulary size of the token lookup layer (passed as `max_tokens`);
# also passed as the embedding layer's input dimension.
VOCAB_SIZE = 20000

Building vocabulary lookup layer for tokens.

In [None]:
# Sentences have different lengths, so the training tokens form a ragged tensor.
train_tokens = tf.ragged.constant(conll_data["train"]["tokens"])
# `tf.strings.lower` is an element-wise op that accepts ragged tensors directly,
# so the whole corpus is lower-cased in one call rather than one `tf.map_fn`
# step per sentence.
train_tokens = tf.strings.lower(train_tokens)

# Token -> id lookup: index 0 is the mask token and index 1 the out-of-vocabulary
# token (see the printed vocabulary); the remaining entries are learned from the
# lower-cased training tokens, capped at VOCAB_SIZE entries in total.
lookup_layer = tf.keras.layers.StringLookup(
    max_tokens=VOCAB_SIZE, mask_token="[MASK]", oov_token="[UNK]"
)
lookup_layer.adapt(train_tokens)

print(len(lookup_layer.get_vocabulary()))
print(lookup_layer.get_vocabulary()[:10])

20000
['[MASK]', '[UNK]', 'the', '.', ',', 'of', 'in', 'to', 'a', 'and']


Creating the raw (not yet preprocessed) training dataset.

In [None]:
def create_data_generator(dataset):
  """Return a zero-argument generator function over `dataset`.

  Every call to the returned function iterates `dataset` from the beginning
  and yields one `(tokens, ner_tags)` pair per item, read from the item's
  `'tokens'` and `'ner_tags'` keys.
  """
  def data_generator():
    yield from ((example['tokens'], example['ner_tags']) for example in dataset)

  return data_generator

# Every generated example is a pair of variable-length 1-D tensors:
# the string tokens and the int32 tag ids.
data_signature = (
    tf.TensorSpec(shape=(None,), dtype=tf.string),
    tf.TensorSpec(shape=(None,), dtype=tf.int32),
)

# Raw training examples, streamed from the `train` split.
train_data = tf.data.Dataset.from_generator(
    create_data_generator(conll_data["train"]), output_signature=data_signature
)

Creating the preprocessed and batched training dataset that is used for training.

In [None]:
def dataset_preprocess(tokens, tag_ids):
    """Turn one raw `(tokens, tag_ids)` example into model-ready ids.

    Returns the pair `(token ids, shifted tag ids)`: tokens go through
    `preprecess_tokens`, and each tag id is increased by 1.
    """
    # Tag ids move up by one because `<PAD>` was inserted as the first
    # element of the `tags` list.
    return preprecess_tokens(tokens), tag_ids + 1

def preprecess_tokens(tokens):
    """Lower-case `tokens` and map them to vocabulary ids with `lookup_layer`."""
    return lookup_layer(tf.strings.lower(tokens))

BATCH_SIZE = 2048

# Preprocess every example, then batch with padding and cache the batches.
# `padded_batch()` pads within each batch, so the sequence length may differ
# from batch to batch -- batch shape: (batch_size, None)
train_dataset = train_data.map(dataset_preprocess)
train_dataset = train_dataset.padded_batch(batch_size=BATCH_SIZE)
train_dataset = train_dataset.cache()

## Method one: Using the CRF layer in a custom training loop

### Creating model

Define the BiLSTM+CRF model by using the `tfa.layers.CRF` layer.
The CRF layer outputs not only the CRF decoding result (`decode_sequence`) but also some internal variables (`potentials`, `sequence_length` and `kernel`). You will use those internal variables to compute the loss value later.

In [None]:
# Build the model
def build_embedding_bilstm_crf_model(
    vocab_size: int, embed_dims: int, lstm_unit: int, tag_size: int
) -> tf.keras.Model:
    """Assemble an Embedding -> bidirectional LSTM -> CRF Keras model.

    Args:
        vocab_size: input dimension of the embedding layer.
        embed_dims: output dimension of the embedding layer.
        lstm_unit: number of units of the LSTM wrapped by `Bidirectional`.
        tag_size: number of tags handed to the CRF layer.

    Returns:
        A model taking int32 token ids of shape (batch, None) and returning
        the CRF layer's four outputs:
        `[decode_sequence, potentials, sequence_length, kernel]`.
    """
    token_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="x")

    # `mask_zero=True` makes the embedding treat id 0 as padding.
    embedding = tf.keras.layers.Embedding(vocab_size, embed_dims, mask_zero=True)
    bilstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_unit, return_sequences=True)
    )
    crf = tfa.layers.CRF(tag_size)

    crf_outputs = crf(bilstm(embedding(token_ids)))
    decode_sequence, potentials, sequence_length, kernel = crf_outputs

    return tf.keras.Model(
        inputs=token_ids,
        outputs=[decode_sequence, potentials, sequence_length, kernel],
    )


# Instantiate the model with 32-dimensional embeddings and a 64-unit LSTM
# inside the bidirectional wrapper.
model = build_embedding_bilstm_crf_model(VOCAB_SIZE, 32, 64, TAG_SIZE)


Run the model on a single batch of data, and inspect the output:

In [None]:
# Convert the sample sentence's tokens to vocabulary ids
preprecessd_tokens = preprecess_tokens(sample_tokens)

# Add a leading batch dimension so the tensor has shape [1, sequence_length]
inputs = tf.expand_dims(preprecessd_tokens, axis=0)

# The model returns four outputs; keep only the first one (the decoded tag ids)
# and print the sequence decoded for the single sentence in the batch.
outputs, *_ = model(inputs)
print(outputs[0])

tf.Tensor([3 6 5 3 6 5 3 6 5], shape=(9,), dtype=int32)


### Define CRF loss function

By using the real y and some internal variables of the CRF layer, you can compute the log likelihood of the real y. Use the negative log likelihood as the loss to optimize.

In [None]:
@tf.function
def crf_loss_func(potentials, sequence_length, kernel, y):
    """Return the mean negative CRF log-likelihood of the true tags `y`.

    `potentials`, `sequence_length` and `kernel` are the internal outputs of
    the CRF layer; `y` holds the true tag ids.
    """
    log_likelihood, _ = tfa.text.crf_log_likelihood(
        potentials, y, sequence_length, kernel
    )
    # Negate the log-likelihood to obtain a loss, then average it.
    return tf.reduce_mean(-log_likelihood)

### Define optimizer, metrics and train_step function

In [None]:
# Adam optimizer, constructed with 0.02 as its first argument (the learning rate).
optimizer = tf.keras.optimizers.Adam(0.02)

# Running mean of the loss values fed to it by `train_step`.
train_loss = tf.keras.metrics.Mean(name="train_loss")

@tf.function(experimental_relax_shapes=True)
def train_step(x, y):
    """Run one optimization step on a batch and record its loss.

    Args:
        x: batch of token ids fed to `model`.
        y: batch of true tag ids, aligned with `x`.

    Side effects: updates the trainable variables of `model` through
    `optimizer` and adds the batch loss to the `train_loss` metric.
    """
    with tf.GradientTape() as tape:
        # Run the forward pass in training mode, so any layer whose behaviour
        # differs between training and inference takes the training path.
        # The decoded tag sequence (first output) is not needed for the loss.
        _, potentials, sequence_length, kernel = model(x, training=True)
        crf_loss = crf_loss_func(potentials, sequence_length, kernel, y)
        # Add any extra losses registered on the model (e.g. regularizers).
        loss = crf_loss + tf.reduce_sum(model.losses)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    train_loss(loss)

### Training model

In [None]:
EPOCHS = 10

for epoch in range(EPOCHS):
    # Clear the loss accumulator at the start of every epoch so the reported
    # value covers this epoch only. `reset_state()` is the current name of the
    # deprecated `reset_states()` alias.
    train_loss.reset_state()

    for x, y in train_dataset:
        train_step(x, y)

    print(f"Epoch {epoch + 1}, " f"Loss: {train_loss.result()}")


Epoch 1, Loss: 18.15883445739746
Epoch 2, Loss: 8.863914489746094
Epoch 3, Loss: 5.798288822174072
Epoch 4, Loss: 4.2376203536987305
Epoch 5, Loss: 3.174699068069458
Epoch 6, Loss: 2.0829579830169678
Epoch 7, Loss: 1.3333663940429688
Epoch 8, Loss: 0.9537926912307739
Epoch 9, Loss: 0.7511987090110779
Epoch 10, Loss: 0.6272516250610352


### Making inference

Inspect the prediction result.

In [None]:
# Print the raw input tokens of the sample sentence
print("raw inputs: ", sample_tokens)

# Convert the tokens to vocabulary ids
preprocessed_inputs = preprecess_tokens(
    sample_tokens
)
# Expand the batch dimension: reshape to [1, sequence_length]
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# The model returns four outputs; only the first one (decoded tag ids) is kept
outputs, *_ = model.predict(inputs)
# Map the predicted tag ids of the single sentence back to tag names
prediction = [tags[i] for i in outputs[0]]

# Keypoint: EU -> B-ORG, German -> B-MISC, British -> B-MISC
print("ground true tags: ", sample_tags)
print("predicted tags: ", prediction)

raw inputs:  ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
ground true tags:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
predicted tags:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']


## Method two: Using the CRF layer via CRF model wrapper

### Creating the base model

Define the BiLSTM model as the base model

In [None]:
# Build the model
def build_embedding_bilstm_crf_model(
    vocab_size: int, embed_dims: int, lstm_unit: int
) -> tf.keras.Model:
    """Assemble the Embedding -> bidirectional LSTM base model (no CRF layer).

    NOTE: this definition reuses the name of the method-one builder, so from
    this cell on the name refers to this three-argument version whose model
    outputs the BiLSTM features rather than CRF results.

    Args:
        vocab_size: input dimension of the embedding layer.
        embed_dims: output dimension of the embedding layer.
        lstm_unit: number of units of the LSTM wrapped by `Bidirectional`.

    Returns:
        A model taking int32 token ids of shape (batch, None) and returning
        the bidirectional LSTM's per-token output sequence.
    """
    token_ids = tf.keras.layers.Input(shape=(None,), dtype=tf.int32, name="x")

    # `mask_zero=True` makes the embedding treat id 0 as padding.
    embedding = tf.keras.layers.Embedding(vocab_size, embed_dims, mask_zero=True)
    bilstm = tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(lstm_unit, return_sequences=True)
    )
    features = bilstm(embedding(token_ids))

    return tf.keras.Model(inputs=token_ids, outputs=features)


# Instantiate the base model with 32-dimensional embeddings and a 64-unit LSTM
# inside the bidirectional wrapper (no tag size: this model has no CRF layer).
base_model = build_embedding_bilstm_crf_model(VOCAB_SIZE, 32, 64)


Run the model on a single batch of data, and inspect the output:

In [None]:
# Convert the sample sentence's tokens to vocabulary ids
preprecessd_tokens = preprecess_tokens(sample_tokens)

# Add a leading batch dimension so the tensor has shape [1, sequence_length]
inputs = tf.expand_dims(preprecessd_tokens, axis=0)

# The base model has a single output: one feature vector per token.
# Print the features of the single sentence in the batch.
outputs = base_model(inputs)
print(outputs[0])

tf.Tensor(
[[-7.1890494e-03 -2.7290669e-03  4.7515053e-03 ... -8.8025322e-03
  -4.7782061e-04  2.0253838e-03]
 [ 3.0374343e-03  7.1039307e-05  4.4372356e-03 ... -2.0702709e-03
   8.3299953e-04 -1.6195974e-03]
 [ 7.4774139e-03  2.0081124e-03 -3.7772139e-04 ...  1.1178317e-03
  -1.2569444e-03 -5.9724711e-03]
 ...
 [ 1.4448365e-03  1.6089321e-04  2.8041308e-04 ... -7.1640400e-04
   1.8281980e-03  6.0155179e-04]
 [ 3.3373178e-03 -4.6984632e-03 -1.7297380e-03 ... -2.2621830e-03
  -1.2909365e-03 -3.4608533e-05]
 [ 7.3946593e-03 -4.0191775e-03  3.0159338e-03 ... -3.6100592e-03
  -1.1341731e-03 -2.9479943e-03]], shape=(9, 128), dtype=float32)


### CRF model wrapper

Import CRF model wrapper from TensorFlow Addons

In [None]:
from tensorflow_addons.text.crf_wrapper import CRFModelWrapper

### Wrap the base model with the CRF model wrapper

The CRF model wrapper wraps the base model. It will apply the CRF layer to the output of the base model and compute the CRF loss. The wrapper takes the base model and number of tags (for initializing the CRF layer) as the initialization parameters.

In [None]:
# Wrap the base model; TAG_SIZE is the number of tags used to initialize the
# wrapper's CRF layer. This rebinds `model`, replacing the method-one model.
model = CRFModelWrapper(base_model, TAG_SIZE)

### Training model

The compilation of the wrapped model is exactly the same as that of a regular Keras model, except that there is no need to provide a loss function (the wrapped model computes the CRF loss internally).

In [None]:
# No `loss` argument is passed: the wrapper computes the CRF loss internally.
model.compile(optimizer=tf.keras.optimizers.Adam(0.02))

In [None]:
# Train for 10 epochs on the padded training batches; with `verbose=2` the
# progress is reported as one summary line per epoch.
model.fit(train_dataset, epochs=10, verbose=2)

Epoch 1/10
7/7 - 13s - loss: 10.5281 - crf_loss: 10.5281
Epoch 2/10
7/7 - 2s - loss: 7.7172 - crf_loss: 7.7172
Epoch 3/10
7/7 - 2s - loss: 5.1699 - crf_loss: 5.1699
Epoch 4/10
7/7 - 1s - loss: 4.0827 - crf_loss: 4.0827
Epoch 5/10
7/7 - 2s - loss: 2.9421 - crf_loss: 2.9421
Epoch 6/10
7/7 - 1s - loss: 1.8043 - crf_loss: 1.8043
Epoch 7/10
7/7 - 1s - loss: 1.2140 - crf_loss: 1.2140
Epoch 8/10
7/7 - 2s - loss: 0.8861 - crf_loss: 0.8861
Epoch 9/10
7/7 - 2s - loss: 0.6535 - crf_loss: 0.6535
Epoch 10/10
7/7 - 1s - loss: 0.5103 - crf_loss: 0.5103


<keras.callbacks.History at 0x7f8639e039d0>

### Making inference

Inspect the prediction result.

In [None]:
# Print the raw input tokens of the sample sentence
print("raw inputs: ", sample_tokens)

# Convert the tokens to vocabulary ids
preprocessed_inputs = preprecess_tokens(
    sample_tokens
)
# Expand the batch dimension: reshape to [1, sequence_length]
inputs = tf.reshape(preprocessed_inputs, shape=[1, -1])

# The wrapped model's prediction is indexed per sentence below
outputs = model.predict(inputs)
# Map the predicted tag ids of the single sentence back to tag names
prediction = [tags[i] for i in outputs[0]]

# Keypoint: EU -> B-ORG, German -> B-MISC, British -> B-MISC
print("ground true tags: ", sample_tags)
print("predicted tags: ", prediction)

raw inputs:  ['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']
ground true tags:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
predicted tags:  ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']
