### Token Classification

#### Predicting class label for every token present in the input
Example - POS Tagging,

Transformer library tokenize words into sub-words. But POS tags are provided on per-word-basis.
Tokenizer works on whole strings mbut NER /POS datasets are already tokenized into words/punctuation.
One more preprocessing technique : Data Collator. 
Target will be in sequences now in this task

### How can we tokenize the input if there are already tokenized into words
This is handled in tokenizer class is_split_into_words=True func

## Task : NER

In [3]:
from datasets import load_dataset



In [5]:
data = load_dataset("conll2003")

Downloading builder script:   0%|          | 0.00/9.57k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/12.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14041 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3250 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [7]:
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [12]:
data['train'].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'pos_tags': Sequence(feature=ClassLabel(names=['"', "''", '#', '$', '(', ')', ',', '.', ':', '``', 'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'NN|SYM', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB'], id=None), length=-1, id=None),
 'chunk_tags': Sequence(feature=ClassLabel(names=['O', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'B-LST', 'I-LST', 'B-NP', 'I-NP', 'B-PP', 'I-PP', 'B-PRT', 'I-PRT', 'B-SBAR', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-VP', 'I-VP'], id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)}

In [13]:
data['train'].features['ner_tags']

Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC'], id=None), length=-1, id=None)

In [15]:
label_names=data['train'].features['ner_tags'].feature.names

In [16]:
label_names

['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']

In [20]:
from transformers import AutoTokenizer

In [19]:
checkpoint = 'distilbert-base-cased'

In [21]:
tokenizer= AutoTokenizer.from_pretrained(checkpoint)

In [23]:
data['train'][0]['tokens']

['EU', 'rejects', 'German', 'call', 'to', 'boycott', 'British', 'lamb', '.']

### Tokenize the inputs

In [25]:
# Now tokenize the inputs 
idx = 0
t = tokenizer(data['train'][idx]['tokens'], is_split_into_words=True)

In [27]:
t.tokens()


['[CLS]',
 'EU',
 'rejects',
 'German',
 'call',
 'to',
 'boycott',
 'British',
 'la',
 '##mb',
 '.',
 '[SEP]']

In [28]:
t.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 7, 8, None]

### Aligining Targets to Tokens

For any word split into multiple tokens, we assign the same target

Use coding to align the targets to tokens

In [32]:
def align_targets(labels,word_ids):
    aligned_labels = []
    last_word = None
    for word in word_ids:
        if word is None:
            label = -100 # It is a token like [CLS] # Special word token
        elif word != last_word:
            label = labels[word]  # means it is a new word
        else:
            label = labels[word]
            # change b-tags to i-tags
            # if label in begin2inside:
            #     label = begin2inside[label]
        aligned_labels.append(label)
        last_word = word # update last word
    return aligned_labels

In [41]:
# try aligining the targets
labels = data['train'][idx]['ner_tags']
word_ids = t.word_ids()
aligned_targets = align_targets(labels=labels,word_ids=word_ids,)
aligned_targets

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, 0, -100]

### Mapping tokenizer to Dataset

In [34]:
def tokenize_func(batch):
    tokenized_input = tokenizer(
        batch['tokens'],truncation=True,is_split_into_words=True
    )
    labels_batch = batch['ner_tags'] # original targets
    aligned_labels_batch = [] # aligned targets
    for i,labels in enumerate(labels_batch):
        word_ids = tokenized_input.word_ids(i)
        aligned_labels_batch.append(align_targets(labels,word_ids))
    tokenized_input['labels'] = aligned_labels_batch
    return tokenized_input



In [36]:
tokenized_datasets = data.map(tokenize_func,batched=True,remove_columns=data['train'].column_names)

Map:   0%|          | 0/14041 [00:00<?, ? examples/s]

Map:   0%|          | 0/3250 [00:00<?, ? examples/s]

Map:   0%|          | 0/3453 [00:00<?, ? examples/s]

In [42]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3453
    })
})

### Use DataCollator Now 

In [43]:
from transformers import DataCollatorForTokenClassification

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [48]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [49]:
tokenized_datasets['train']

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 14041
})

In [50]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(2)])

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Compute Metric For Token Classification Tasks

In [53]:
from datasets import load_metric

In [54]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [55]:
import numpy as np

In [85]:
def compute_metric(logits_and_labels):
    logits,labels = logits_and_labels
    preds = np.argmax(logits,axis=-1)

    # remove -100 from labels and predictions
    # convert label ids to label names
    str_labels = [
        [label_names[t] for t in label if t!=-100] for label in labels
    ]

    str_preds = [
        [label_names[p] for p,t in zip(pred,targ) if t!=-100] for pred,targ in zip(preds,labels)
    ]

    the_metrics = metric.compute(predictions=str_preds,references=str_labels)

    return {
        "precision": the_metrics['overall_precision'],
        "recall": the_metrics['overall_recall'],
        "f1":the_metrics["overall_f1"],
        "accuracy": the_metrics['overall_accuracy'],
        
    }

In [86]:
id2label={k:v for k,v in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}

In [87]:
from transformers import AutoModelForTokenClassification

In [88]:
model = AutoModelForTokenClassification.from_pretrained(checkpoint,
                                                        id2label=id2label,
                                                        label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [89]:
from transformers import TrainingArguments,Trainer

In [90]:
training_args = TrainingArguments(
    output_dir="distilbert-finetuned-ner",
    evaluation_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    num_train_epochs=1,
    weight_decay=0.01

)

In [91]:
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metric
)

In [93]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.049,0.093633,0.855654,0.868355,0.861958,0.96007


TrainOutput(global_step=1756, training_loss=0.08058667780326548, metrics={'train_runtime': 536.7242, 'train_samples_per_second': 26.161, 'train_steps_per_second': 3.272, 'total_flos': 154190936839842.0, 'train_loss': 0.08058667780326548, 'epoch': 1.0})

In [None]:
trainer.save_model("ner_model_v0.1")