# Named-Entity Recognition (NER) - Fine-tuning BERT
In this notebook we'll take a look at the process of fine-tuning the [DistilBERT](https://huggingface.co/distilbert-base-multilingual-cased) model to recognize people, organisations, and locations in text. This will be done using the [conll2003](https://huggingface.co/datasets/conll2003) dataset. The techniques discussed apply to general NER applications.

First things first, let's make sure we have a GPU instance in this Colab session:
*   `Edit -> Notebook settings -> Hardware accelerator` must be set to **GPU**.
*   If needed, reinitialize the session by clicking **Connect** in the top right corner.

After the session is initialized, we can check our assigned GPU with the following command:

In [None]:
!nvidia-smi

Install the Transformers, Datasets, Evaluate and seqeval libraries.

In [None]:
%%capture
!pip install datasets evaluate transformers[sentencepiece] seqeval

# Dataset

In [2]:
from datasets import load_dataset

In [None]:
raw_datasets = load_dataset("conll2003")

In [None]:
raw_datasets

In [None]:
raw_datasets["train"][50]["tokens"]

In [None]:
raw_datasets["train"].features["pos_tags"]

In [8]:
label_names = raw_datasets["train"].features["ner_tags"].feature.names

## Preprocessing (tokenization and alignment)

In [9]:
from transformers import AutoTokenizer

model_checkpoint = "distilbert-base-multilingual-cased"

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
inputs = tokenizer(raw_datasets["train"][50]["tokens"], is_split_into_words=True)
inputs

In [13]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per token.

    Args:
        labels: Sequence of integer label ids, indexed by word position.
        word_ids: For each token, the index of the word it came from, or
            ``None`` for a special token.

    Returns:
        A list with one entry per element of ``word_ids``: ``-100`` for
        special tokens, the word's label for the first token of a word, and
        for continuation tokens the word's label bumped by one when it is odd
        (B-XXX becomes I-XXX; this relies on each odd id being a B- tag that
        is immediately followed by its I- counterpart).
    """
    aligned = []
    previous_word = None
    for word in word_ids:
        if word is None:
            # Special token: marked with -100.
            aligned.append(-100)
        elif word != previous_word:
            # First token of a new word keeps the word's label unchanged.
            aligned.append(labels[word])
        else:
            # Continuation token of the same word: B-XXX turns into I-XXX.
            tag = labels[word]
            aligned.append(tag + 1 if tag % 2 == 1 else tag)
        previous_word = word
    return aligned

In [14]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level labels.

    Args:
        examples: Batch with a ``"tokens"`` entry (word lists) and a
            ``"ner_tags"`` entry (per-word label ids).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding, for each sentence, the labels aligned to its tokens by
        ``align_labels_with_tokens``.
    """
    encoded = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    # word_ids(idx) maps each token of sentence `idx` back to its source word.
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(idx))
        for idx, word_labels in enumerate(examples["ner_tags"])
    ]
    return encoded

In [None]:
# Run tokenize_and_align_labels over raw_datasets in batched mode and drop the
# columns listed for the train split, so only the function's outputs remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

In [None]:
tokenized_datasets

In [None]:
tokenized_datasets["train"][0]["labels"]

# Fine-tuning (Training)

In [18]:
import evaluate

import numpy as np

from transformers import DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer

In [None]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [19]:
# Two-way lookup between integer class ids and NER tag names.
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Load the checkpoint for token classification, passing both label maps so the
# model is configured with this dataset's tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

In [21]:
# Training configuration: evaluate and save once per epoch, 3 epochs,
# learning rate 2e-5, train batch size 32 per device, weight decay 0.01.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
args = TrainingArguments(
    "distilbert-multilingual-cased-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    weight_decay=0.01,
)

In [22]:
metric = evaluate.load("seqeval")

In [23]:
def compute_metrics(eval_preds):
    """Score predictions against gold labels with the loaded metric.

    Args:
        eval_preds: Pair ``(logits, labels)``. The predicted class of each
            position is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with ``precision``, ``recall``, ``f1`` and ``accuracy``, taken
        from the metric's ``overall_*`` entries. Positions whose gold label is
        ``-100`` are left out of both predictions and references.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    # Gold tag names, skipping the -100 positions.
    references = []
    for gold_row in labels:
        references.append([label_names[g] for g in gold_row if g != -100])

    # Predicted tag names at exactly the positions kept above.
    hypotheses = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        hypotheses.append(
            [label_names[p] for p, g in zip(pred_row, gold_row) if g != -100]
        )

    scores = metric.compute(predictions=hypotheses, references=references)
    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {short: scores[full] for short, full in reported}

In [24]:


# Wire the model, training arguments, tokenized splits, collator and metric
# function into a Trainer; evaluation runs on the "validation" split.
# NOTE(review): recent transformers releases appear to deprecate `tokenizer=`
# in favour of `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

# Evaluation

In [None]:
predictions = trainer.predict(tokenized_datasets["validation"])

In [None]:
compute_metrics([predictions.predictions, predictions.label_ids])

# Example

In [44]:
# filler code
import torch

from termcolor import colored

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def align_predictions_and_labels(predictions, labels, id2label):
  """Reduce token-level predictions to one tag per word.

  Args:
    predictions: Predicted class id for each token position.
    labels: Per-token word ids (despite the name): the index of the word each
      token came from, or ``None`` for a special token.
    id2label: Mapping from class id to tag name.

  Returns:
    List of tag names, one for each run of tokens sharing a word id, taken
    from the prediction at the first token of that run. Tokens whose word id
    is ``None`` are skipped.
  """
  aligned = []
  prev_word_id = -1
  for position, word_id in enumerate(labels):
    # Identity check for None (PEP 8); also skip continuation tokens, which
    # repeat the word id of the token before them.
    if word_id is None or word_id == prev_word_id:
      continue
    aligned.append(id2label[predictions[position]])
    prev_word_id = word_id

  return aligned

def show_me_some_tags(text, show_legend=True):
  """Print `text` with each word tagged as an entity suffixed by `_<TYPE>`.

  The text is split on whitespace, run through the model, and each word gets
  the tag predicted for its first token. Words tagged "O" are printed
  unchanged; others are printed as `word_TYPE`, where TYPE is the part of the
  tag after the "-".

  Args:
    text: Raw input string.
    show_legend: Currently unused; kept so existing calls keep working.
  """
  model.eval()
  words = text.split()
  tokenized_inputs = tokenizer(words, truncation=True, is_split_into_words=True)
  # Put the inputs on whatever device the model is on, rather than relying on
  # a separately computed global that may not match it.
  target_device = model.device
  input_ids = torch.tensor([tokenized_inputs["input_ids"]]).to(target_device)
  attention_mask = torch.tensor([tokenized_inputs["attention_mask"]]).to(target_device)
  with torch.no_grad():
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

  predictions = np.argmax(logits.cpu().numpy(), axis=2)
  predictions = align_predictions_and_labels(predictions[0], tokenized_inputs.word_ids(), id2label)

  marked_words = []
  for i, word in enumerate(words):
    # With truncation=True, trailing words can be dropped and therefore have
    # no prediction; print them unmarked instead of raising IndexError.
    tag = predictions[i] if i < len(predictions) else "O"
    if tag == "O":
      marked_words.append(word)
    else:
      marked_words.append("_".join([word, tag.split("-")[1]]))
  print(" ".join(marked_words))

In [None]:
show_me_some_tags("My name is Luka. I live in Ljubljana.")

In [None]:
show_me_some_tags("And this is Andrej. He likes to visit the Faculty of Computer and Infromation Science in Ljubljana")

In [None]:
show_me_some_tags("Košarkarji Dallasa v letošnji sezoni veliko bolje igrajo proti močnim tekmecem. LA Clippersi so stari znanci in motiva zagovoto ne bo manjkalo.")

In [None]:
show_me_some_tags("Luka Dončić je najboljši strelec lige s povprečjem 34,3 točke na tekmo. Ko ima Ljubljančan pravi strelski večer, je Maverickse zelo težko premagati.")