In [23]:
from datasets import get_dataset_config_names, load_dataset

# Collect the names of all configurations (subsets) of the "xtreme" dataset and show how many there are.
xtreme_subsets = get_dataset_config_names("xtreme")
len(xtreme_subsets)

183

In [24]:
# Keep only the subsets whose name begins with "PAN" and preview the first three.
panx_subsets = list(filter(lambda subset: subset.startswith("PAN"), xtreme_subsets))
panx_subsets[:3]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg']

In [25]:
# the datasets have a two-letter suffix matching ISO 639-1 language codes
# to make a realistic Swiss setup we will need de, fr, it, en

from collections import defaultdict
from datasets import DatasetDict

langs = ["de", "fr", "it", "en"]
fracs = [0.629, 0.229, 0.084, 0.059]
panx_ch = defaultdict(DatasetDict)

# For every language, load its PAN-X corpus and keep a shuffled `frac` share of each split.
for lang, frac in zip(langs, fracs):
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    for split, split_ds in ds.items():
        n_keep = int(frac * split_ds.num_rows)
        # A fixed seed is passed to shuffle before the first n_keep rows are selected.
        panx_ch[lang][split] = split_ds.shuffle(seed=0).select(range(n_keep))

In [26]:
import pandas as pd

# One column per language, a single row holding the size of its training split.
train_sizes = {}
for lang in langs:
    train_sizes[lang] = [panx_ch[lang]["train"].num_rows]
pd.DataFrame(train_sizes, index=["Number of training examples"])

Unnamed: 0,de,fr,it,en
Number of training examples,12580,4580,1680,1180


In [27]:
# By design we have more examples in German; we will use it as the starting point for zero-shot cross-lingual transfer to French, Italian and English.
# Let's examine the data for German.

# Print every field of the first German training example on its own line.
element = panx_ch["de"]["train"][0]
for key in element:
    value = element[key]
    print(f"{key}: {value}")

tokens: ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags: [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs: ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']


In [28]:
# The integer tags printed above are hard to read, so inspect the feature definitions of the split.
de_train_features = panx_ch["de"]["train"].features
for key in de_train_features:
    value = de_train_features[key]
    print(f"{key}: {value}")

# `tags` is the inner feature of the "ner_tags" sequence.
tags = de_train_features["ner_tags"].feature
print(tags)

def create_tag_names(batch):
    """Return a new "ner_tags_str" column with each id in batch["ner_tags"] passed through tags.int2str."""
    tag_names = list(map(tags.int2str, batch["ner_tags"]))
    return {"ner_tags_str": tag_names}

# Add the "ner_tags_str" column to every German split.
panx_de = panx_ch["de"].map(create_tag_names)

# Show the first training example with its tokens and tag names as two rows.
de_example = panx_de["train"][0]
example_rows = [de_example["tokens"], de_example["ner_tags_str"]]
pd.DataFrame(example_rows, index=['Tokens', 'Tags'])

tokens: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags: Sequence(feature=ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs: Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ClassLabel(names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
Tokens,2.000,Einwohnern,an,der,Danziger,Bucht,in,der,polnischen,Woiwodschaft,Pommern,.
Tags,O,O,O,O,B-LOC,I-LOC,O,O,B-LOC,B-LOC,I-LOC,O


In [29]:
# quick check that there isn't any unusual imbalance in the tags: let's calculate the frequency of each entity across each split

from collections import Counter

split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        for tag in row:
            # Count only tags that start with "B", keyed by the part after the dash.
            if not tag.startswith("B"):
                continue
            tag_type = tag.split("-")[1]
            split2freqs[split][tag_type] += 1
pd.DataFrame.from_dict(split2freqs, orient="index")

# The result below looks good: the tag frequencies are distributed roughly the same way in every split.

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


### Multilingual transformers
They involve a similar architecture and training procedure to monolingual models, except that the training corpus consists of documents in many languages. The remarkable thing is that, without any explicit information to differentiate among the languages, the resulting linguistic representations are able to generalize well across languages for a variety of downstream tasks.

### Tokenization

The tokenizer, called SentencePiece, was trained on raw text from 100+ languages; let's see how it compares to WordPiece.

In [30]:
from transformers import AutoTokenizer

bert_model_name, xlmr_model_name = "bert-base-cased", "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)

# Run the same sentence through both tokenizers so their token sequences can be compared.
text = "Jack Sparrow loves New York!"
bert_tokens, xlmr_tokens = (
    tok(text).tokens() for tok in (bert_tokenizer, xlmr_tokenizer)
)

print(f"BERT tokens: {bert_tokens}")
print(f"XLMR tokens: {xlmr_tokens}")

BERT tokens: ['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
XLMR tokens: ['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']


### The Tokenizer Pipeline
So far, we have treated tokenization as a single operation that transforms strings into integers. It's a bit more complex than that:
 - Normalization: makes the string cleaner
 - Pretokenization: splits the text into smaller objects
 - Tokenizer model: applies a subword splitting model to the words (this is the part that needs to be trained on your corpus)
 - Postprocessing: some additional transformations can be applied to the list of tokens, e.g. adding special begin/end tokens

### SentencePiece tokenizer

It is based on a type of subword segmentation called Unigram and encodes each input text as a sequence of Unicode characters.

## Transformers for Named Entity Recognition

Named Entity Recognition (NER) is often framed as a token classification task.

## The anatomy of the transformers model class

### Bodies and heads

The main concept behind Hugging Face Transformers is the split of the architecture into a body and a head. We have already seen that when we switch from the pretraining task to the downstream task, we need to replace the last layer of the model with one that is suitable for the task. This last layer is called the model head; it's the part that is task-specific. The rest of the model is called the body; it includes the token embeddings and transformer layers, which are task-agnostic. This structure is implemented in the HF Transformers code: the body of the model is implemented in a class such as BertModel or GPT2Model that returns the hidden states of the last layer. Task-specific models such as BertForMaskedLM or BertForSequenceClassification use the base model and add the necessary head on top of the hidden states.

### Creating a Custom Model for Token Classification

An exercise in building a custom token classification head for XLM-R. Since XLM-R uses the same architecture as RoBERTa, we will use RoBERTa as the base model, but augmented with settings specific to XLM-R. (Note that XLMRobertaForTokenClassification already exists in the HF Transformers library.)


In [31]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """Token-classification model: a RoBERTa body followed by dropout and a linear head.

    The body returns one hidden vector per token; the head turns each vector
    into `config.num_labels` logits.
    """

    # Tie the class to the XLM-R configuration imported alongside it, so the
    # RoBERTa body is set up with XLM-R settings (see the PreTrainedModel
    # `config_class` documentation).
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the body and the classification head from `config`.

        Args:
            config: model configuration; `num_labels`, `hidden_dropout_prob`
                and `hidden_size` are read from it.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        # Model body, created without the pooling layer.
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Token classification head: dropout followed by a per-token linear layer.
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize the weights of the layers created above.
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        **kwargs
    ):
        """Run the body and the head, optionally computing the loss.

        Args:
            input_ids: forwarded to the RoBERTa body.
            attention_mask: forwarded to the RoBERTa body.
            token_type_ids: forwarded to the RoBERTa body.
            labels: optional target ids; when given, a cross-entropy loss over
                the flattened logits and labels is returned as well.
            **kwargs: any extra keyword arguments, forwarded to the body.

        Returns:
            TokenClassifierOutput with `loss` (None when `labels` is None),
            `logits`, and the body's `hidden_states` and `attentions`.
        """
        # Encoder representations from the model body.
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            **kwargs
        )
        # The first element of the body output is fed through dropout and the classifier.
        sequence_output = self.dropout(outputs[0])
        logits = self.classifier(sequence_output)
        loss = None
        if labels is not None:
            # Flatten to (tokens, num_labels) vs (tokens,) for the loss; the
            # loss is created with its default settings.
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

### Loading a custom model

By just implementing the two functions above we can build our own custom transformer model. Since we inherit from a PreTrainedModel, we instantly get access to all the useful HF Transformers APIs like from_pretrained.

We are now ready to load our token classification model. 

In [32]:
# Besides the model name we need some additional information: the tags used to label each entity,
# plus the mapping of each tag to an id and vice versa.
index2tag = dict(enumerate(tags.names))
tag2index = {tag: idx for idx, tag in index2tag.items()}

In [33]:
from transformers import AutoConfig
import torch

# AutoConfig is the blueprint of the model architecture: it contains all the hyperparameters required to build the model.
# Some of them can be overridden, such as the number of labels and the label mappings, to match our task.
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=tags.num_classes,
    id2label=index2tag,
    label2id=tag2index,
)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(xlmr_model_name, config=xlmr_config)
xlmr_model = xlmr_model.to(device)

# A quick check that the tokenizer and model were initialized correctly.
input_ids = xlmr_tokenizer.encode(text, return_tensors="pt")
id_rows = [xlmr_tokens, input_ids[0].numpy()]
pd.DataFrame(id_rows, index=["Tokens", "Input IDs"])

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Input IDs,0,21763,37456,15555,5161,7,2356,5753,38,2


In [34]:
# Pass the inputs to the model, then take the argmax over the last dimension to get the most likely class per token.

model_input = input_ids.to(device)
outputs = xlmr_model(model_input).logits
predictions = outputs.argmax(dim=-1)
print("Here we see that the logits have the shape [batch_size, num_tokens, num_tags], with each token given a logit among the seven possible NER tags.")
n_tokens = len(xlmr_tokens)
print(f"Number of tokens in sequence: {n_tokens}")
print(f"Shape of outputs: {outputs.shape}")

Here we see that the logits have the shape [batch_size, num_tokens, num_tags], with each token given a logit among the seven possible NER tags.
Number of tokens in sequence: 10
Shape of outputs: torch.Size([1, 10, 7])


In [35]:
# Map each predicted class id back to its tag name to see which tag the model assigns to every token.
predicted_ids = predictions[0].tolist()
preds = [tags.names[label_id] for label_id in predicted_ids]
pd.DataFrame([xlmr_tokens, preds], index=["Tokens", "Tags"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Tokens,<s>,▁Jack,▁Spar,row,▁love,s,▁New,▁York,!,</s>
Tags,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC,B-LOC


In [36]:
# Our token classifier with random weights leaves a lot to be desired! Let's fine-tune it on some labeled data to make it better.
# Before continuing, let's wrap the previous steps into a function for later use.

def tag_text(text, tags, model, tokenizer):
    """Tag every token of `text` with the class the model scores highest.

    Args:
        text: the string to tag.
        tags: object whose `names` sequence maps a class id to its tag name.
        model: callable module; the first element of its output is used as
            the logits, and argmax is taken over dimension 2.
        tokenizer: tokenizer used for both the displayed tokens and the input
            ids, so the two rows always line up.

    Returns:
        A DataFrame with a "Tokens" row and a "Tags" row.
    """
    # Tokenize once with the tokenizer that was passed in (not a global one).
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    # Put the inputs on whatever device the model's parameters live on.
    model_device = next(model.parameters()).device
    input_ids = encoding.input_ids.to(model_device)
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(input_ids)[0]
    predictions = torch.argmax(outputs, dim=2)
    preds = [tags.names[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])

# Before we can train the model, we also need to tokenize the inputs and prepare the labels

In [37]:
# Tokenizing texts for NER

# Following the HF documentation, keep the example's words and labels as ordinary lists.
words = de_example["tokens"]
labels = de_example["ner_tags"]

# The text is already split into words, so pass is_split_into_words=True to the tokenizer.
tokenized_input = xlmr_tokenizer(words, is_split_into_words=True)
tokens = xlmr_tokenizer.convert_ids_to_tokens(tokenized_input.input_ids)
pd.DataFrame([tokens], index=["Tokens"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>


In [38]:
# In the example above, "Einwohnern" is split into two subwords. We follow the convention of labelling
# only the first subword of each word, so the subwords after the first one have to be masked.
# word_ids() gives the word index behind every token.
word_ids = tokenized_input.word_ids()
alignment_rows = [tokens, word_ids]
pd.DataFrame(alignment_rows, index=["Tokens", "Word IDs"])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,


In [39]:
# Above we can see that the subwords are mapped to the right words and that special tokens are mapped to None.
# Let's set -100 as the label for these special tokens and for the subwords we wish to mask during training.

# Read the integer tags from de_example rather than from `labels`: this cell rebinds `labels`
# to strings below, so indexing `labels` here would break when the cell is run a second time.
word_labels = de_example["ner_tags"]
previous_word_idx = None
labels_ids = []
for word_idx in word_ids:
    if word_idx is None or word_idx == previous_word_idx:
        # Special token, or a continuation subword of the previous word: mask it.
        labels_ids.append(-100)
    else:
        # First subword of a new word: it carries the word's label.
        labels_ids.append(word_labels[word_idx])
    previous_word_idx = word_idx

labels = [index2tag[l] if l != -100 else "IGN" for l in labels_ids]
index = ["Tokens", "Word IDs", "Label IDs", "Labels"]

pd.DataFrame([tokens, word_ids, labels_ids, labels], index=index)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
Tokens,<s>,▁2.000,▁Einwohner,n,▁an,▁der,▁Dan,zi,ger,▁Buch,...,▁Wo,i,wod,schaft,▁Po,mmer,n,▁,.,</s>
Word IDs,,0,1,1,2,3,4,4,4,5,...,9,9,9,9,10,10,10,11,11,
Label IDs,-100,0,0,-100,0,0,5,-100,-100,6,...,5,-100,-100,-100,6,-100,-100,0,-100,-100
Labels,IGN,O,O,IGN,O,O,B-LOC,IGN,IGN,I-LOC,...,B-LOC,IGN,IGN,IGN,I-LOC,IGN,IGN,O,IGN,IGN


In [40]:
# Why did we select -100? PyTorch's cross-entropy loss class has an attribute ignore_index whose default value is -100. Targets with this index are ignored during training.
# We can clearly see how the label IDs align with the tokens, so let's scale this out to the whole dataset

y] = labels
    return tokenized_inputs

# With all the ingredients in place, encode every split of a corpus in one go.
def encode_panx_dataset(corpus):
    """Map tokenize_and_align_labels over `corpus` in batches, dropping the three raw columns."""
    raw_columns = ["langs", "ner_tags", "tokens"]
    return corpus.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=raw_columns,
    )

# Encode the German corpus (every split held in panx_ch["de"]).
panx_de_encoded = encode_panx_dataset(panx_ch["de"])


Map:   0%|          | 0/12580 [00:00<?, ? examples/s]

NameError: name 'idx' is not defined

### Performance metrics
Evaluating a NER model is similar to evaluating a text classification model, and it is common to report results for precision, recall and F1-score. However, all words of an entity need to be predicted correctly in order for the prediction to be counted as correct. Luckily, there is a library called seqeval that is designed for these kinds of tasks.