# Token Classification
This includes any problem that can be formulated as "attributing a label to each token in a sentence", e.g.:
- Named Entity Recognition (NER)
- Part-of-speech tagging (POS)
- Chunking: finding the tokens that belong to the same entity

In [1]:
# For training in colab

# import os
# import sys


# # Connect to google drive
# from google.colab import drive
# os.chdir("/content")
# drive.mount("/content/gdrive")

# # Load colab_utils functions
# sys.path.append(f"/content/gdrive/MyDrive/repos/colab-utils")
# import colab_utils

# colab_utils.load_env_vars()
# colab_utils.git_set_config()

# PARENT_FOLDER = "/content/gdrive/MyDrive/repos"
# os.chdir(PARENT_FOLDER)

# git_repo = 'trevorki/huggingface-nlp' # replace with actual values
# colab_utils.git_clone_repo(git_repo)

# REPO_FOLDER = f"{PARENT_FOLDER}/{git_repo.split('/')[1]}"
# os.chdir(REPO_FOLDER)

# # !pip install -r requirements.txt

## Dataset
Use the `CoNLL-2003 dataset`, which contains news stories from Reuters. It contains labels for the three tasks we mentioned earlier: NER, POS, and chunking.

A big difference from other datasets is that the input texts are not presented as sentences or documents, but lists of words (the last column is called tokens, but it contains words in the sense that these are pre-tokenized inputs that still need to go through the tokenizer for subword tokenization).



In [2]:
from datasets import load_dataset

# Load the CoNLL-2003 dataset; the bare expression below displays its splits
# (train / validation / test) together with their column names and row counts.
raw_datasets = load_dataset("conll2003")
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [3]:
# Despite the column name, "tokens" holds whole words (pre-tokenized input),
# not the subword tokens a model tokenizer would produce.
raw_datasets["train"][0]["tokens"]

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

### NER labels

In [None]:
# Integer NER class ids for the first training example, one per word
raw_datasets["train"][0]["ner_tags"]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [None]:
# The dataset's feature metadata carries the mapping from integer NER ids to
# their string names; the position in `label_names` is the class id.
ner_feature = raw_datasets["train"].features["ner_tags"]
label_names = ner_feature.feature.names
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["ner_tags"]

# Pair every word with its NER label name, then pad both to the wider of the two
# (plus one separating space) so the two printed rows line up column by column.
pairs = [(word, label_names[label]) for word, label in zip(words, labels)]
line1 = "".join(word.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)
line2 = "".join(tag.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)

print(line1)
print(line2)

EU    rejects German call to boycott British lamb . 
B-ORG O       B-MISC O    O  O       B-MISC  O    O 


### POS Labels

In [None]:
# Integer part-of-speech class ids for the first training example, one per word
raw_datasets["train"][0]["pos_tags"]

[22, 42, 16, 21, 35, 37, 16, 21, 7]

In [None]:
# Look up the string names of the POS classes from the dataset's feature
# metadata; the position in `pos_names` is the class id.
pos_feature = raw_datasets["train"].features["pos_tags"]
pos_names = pos_feature.feature.names
print(pos_names)

['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']


In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["pos_tags"]

# Pair every word with its POS tag name, then pad both to the wider of the two
# (plus one separating space) so the two printed rows line up column by column.
pairs = [(word, pos_names[label]) for word, label in zip(words, labels)]
line1 = "".join(word.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)
line2 = "".join(tag.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)

print(line1)
print(line2)

EU  rejects German call to boycott British lamb . 
NNP VBZ     JJ     NN   TO VB      JJ      NN   . 


### Chunking labels

In [None]:
# Integer chunking class ids for the first training example, one per word
raw_datasets["train"][0]["chunk_tags"]

[11, 21, 11, 12, 21, 22, 11, 12, 0]

In [None]:
# Look up the string names of the chunking classes from the dataset's feature
# metadata; the position in `chunk_names` is the class id.
chunk_feature = raw_datasets["train"].features["chunk_tags"]
chunk_names = chunk_feature.feature.names
print(chunk_names)

['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP']


In [None]:
words = raw_datasets["train"][0]["tokens"]
labels = raw_datasets["train"][0]["chunk_tags"]

# Pair every word with its chunk tag name, then pad both to the wider of the two
# (plus one separating space) so the two printed rows line up column by column.
pairs = [(word, chunk_names[label]) for word, label in zip(words, labels)]
line1 = "".join(word.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)
line2 = "".join(tag.ljust(max(len(word), len(tag))) + " " for word, tag in pairs)

print(line1)
print(line2)

EU   rejects German call to   boycott British lamb . 
B-NP B-VP    B-NP   I-NP B-VP I-VP    B-NP    I-NP O 


# Tokenize
Since the tokenizer splits some words into multiple tokens, we have to keep track of the `word_ids` to map the tokens back to their words later.

In [4]:
from transformers import AutoTokenizer

# Checkpoint name shared by the tokenizer here and the model loaded later,
# so both come from the same pretrained BERT (cased) weights/vocabulary.
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
# The input is already a list of words, hence is_split_into_words=True.
# word_ids() gives, for every produced token, the index of the word it came
# from (None for the special tokens such as [CLS] / [SEP]).
inputs = tokenizer(raw_datasets["train"][0]["tokens"], is_split_into_words=True)
print(f"tokens: {inputs.tokens()}")
print(f"word_ids: {inputs.word_ids()}")

tokens: ['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']
word_ids: [None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]


Since the labels don't cover the special tokens or the extra pieces of words that were split into multiple tokens, we must align the labels with the tokenized inputs.
To see the mismatch, compare the number of labels with the number of tokens:

In [6]:
# Show the length mismatch: one label per word vs. one entry per token
# (the token list also contains the special tokens and split word pieces).
print(f"{len(raw_datasets['train'][0]['ner_tags'])} dataset labels:\n\t{raw_datasets['train'][0]['ner_tags']}")
print(f"{len(inputs.tokens())} tokenized inputs:\n\t{inputs.tokens()}")

9 dataset labels:
	[3, 0, 7, 0, 0, 0, 7, 0, 0]
12 tokenized inputs:
	['[CLS]', 'EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'la', '##mb', '.', '[SEP]']


In [7]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    ARGS:
        labels (list[int]): one class id per word.
        word_ids (list[int | None]): for each token, the index of the word it
            belongs to, or None for a special token.

    Each token gets a label as follows:
    - a special token (word id None) gets -100, so that it is ignored by the
      cross-entropy loss;
    - the first token of a word gets that word's label;
    - any further token of the same word gets the word's label, except that an
      odd label id is bumped by one (in the label scheme used here the odd ids
      are the B-XXX tags and the following even id is the matching I-XXX tag).

    RETURNS:
        list: the new labels, same length as word_ids"""
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token
            tag = -100
        else:
            tag = labels[word_id]
            # Continuation of the previous token's word: turn B-XXX into I-XXX
            if word_id == previous_word and tag % 2 == 1:
                tag += 1
        previous_word = word_id
        aligned.append(tag)

    return aligned

In [8]:
# Sanity check on the first training example: print the word-level labels,
# then the token-level labels produced by align_labels_with_tokens.
labels = raw_datasets["train"][0]["ner_tags"]
word_ids = inputs.word_ids()
print(labels)
print(align_labels_with_tokens(labels, word_ids))

[3, 0, 7, 0, 0, 0, 7, 0, 0]
[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]


In [9]:
# Tokenize and align labels for whole dataset (We will pad it later)
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-level NER labels.

    ARGS:
        examples: a batch with a "tokens" entry (lists of words) and a
            "ner_tags" entry (the matching lists of word-level label ids).

    RETURNS:
        the tokenizer output for the batch, with an added "labels" entry
        holding, for each example, the labels aligned to its tokens by
        align_labels_with_tokens. No padding is applied here."""
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(i) maps the tokens of the i-th example back to its words
    tokenized_inputs["labels"] = [
        align_labels_with_tokens(word_labels, tokenized_inputs.word_ids(i))
        for i, word_labels in enumerate(examples["ner_tags"])
    ]
    return tokenized_inputs

In [10]:
# Apply the tokenization/alignment to every split in batches. The original
# columns are removed so that only the tokenizer outputs and "labels" remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

# Collating data
We can't use `DataCollatorWithPadding` because we must now pad both the inputs AND the labels, so we must use `DataCollatorForTokenClassification`, which takes the tokenizer as a parameter.

In [11]:
from transformers import DataCollatorForTokenClassification

# Collator that pads inputs and labels together; return_tensors="tf" makes it
# produce TensorFlow tensors (the batch below is inspected with .numpy()).
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                   return_tensors="tf")




In [12]:
# Collate the first two training examples into one batch and look at the
# padded label matrix.
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])
batch["labels"].numpy()

array([[-100,    3,    0,    7,    0,    0,    0,    7,    0,    0,    0,
        -100],
       [-100,    1,    2, -100, -100, -100, -100, -100, -100, -100, -100,
        -100]], dtype=int64)

In [13]:
# Compare to just the (unpadded) labels of the same two examples: in the batch
# above, the shorter example was padded with -100 up to the longer one's length.
for i in range(2):
    print(tokenized_datasets["train"][i]["labels"])

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]
[-100, 1, 2, -100]


# Build TF Dataset

In [14]:
# Both splits expose the same columns, use the same padding collator and the
# same batch size; only the training split is shuffled.
tf_dataset_options = dict(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    batch_size=16,
)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    shuffle=True, **tf_dataset_options
)
tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    shuffle=False, **tf_dataset_options
)

# The Model
The model was trained on a GPU in Google Colab and saved to Google Drive.

In [None]:
!pwd

/content/gdrive/MyDrive/repos/huggingface-nlp


In [None]:
from transformers import TFAutoModelForTokenClassification

# Translation dictionaries between integer class ids and NER label names,
# passed to the model so it is configured with the label set of this task.
id2label = dict(enumerate(label_names))
label2id = {label: idx for idx, label in id2label.items()}

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint, id2label=id2label, label2id=label2id
)




All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Train in mixed-precision float16
# Comment this line out if you're using a GPU that will not benefit from this
tf.keras.mixed_precision.set_global_policy("mixed_float16")

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

# create_optimizer returns both the optimizer and its learning-rate schedule;
# only the optimizer is needed for compile() below.
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
# No loss argument is given: the model then trains with its internal loss,
# using the "labels" supplied in the input batches.
model.compile(optimizer=optimizer)

Note also that we don’t supply a loss argument to compile(). This is because the models can actually compute loss internally — if you compile without a loss and supply your labels in the input dictionary (as we do in our datasets), then the model will train using that internal loss, which will be appropriate for the task and model type you have chosen.

In [None]:

# from transformers.keras_callbacks import PushToHubCallback

# callback = PushToHubCallback(output_dir="bert-finetuned-ner", tokenizer=tokenizer)

from datetime import datetime
# Timestamp before training, so the run's duration can be read off the output
print(datetime.now())

# Fine-tune on the training batches, evaluating on the validation batches
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    # callbacks=[callback],
    epochs=num_epochs,
    verbose=1
)
# Timestamp after training has finished
print(datetime.now())

2024-02-09 02:02:49.216609
Epoch 1/3

In [None]:
/content/gdrive/MyDrive/repos/huggingface-nlp