In [1]:
import warnings

# Silence every warning for the rest of the session so that library
# deprecation notices do not clutter the cell outputs.
warnings.simplefilter("ignore")

# Multilingual Named Entity Recognition

This notebook uses XLM-RoBERTa to perform multilingual named entity recognition (NER) on WikiAnn (also known as PAN-X), a subset of the Cross-lingual TRansfer Evaluation of Multilingual Encoders (XTREME) benchmark. The dataset consists of texts from Wikipedia articles in many languages. Each article is annotated with `LOC` (location), `PER` (person), and `ORG` (organization) tags in the IOB2 format. In this format, a `B-` prefix marks the first token of an entity, and any following tokens that belong to the same entity are given an `I-` prefix. An `O` tag indicates that the token does not belong to any entity.

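XLM-RoBERTa belongs to a class of multilingual transformers that use masked language modeling as a pretraining objective and are trained jointly on texts in many languages, enabling *zero-shot cross-lingual transfer*: a model fine-tuned on one language can be applied to other languages without further training.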

# 1. Dataset

In [2]:
from datasets import get_dataset_config_names

# List every configuration (subset) name that the "xtreme" dataset exposes,
# then report how many there are.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

XTREME has 183 configurations


In [3]:
# Keep only the PAN-X configurations, i.e. the names that begin with "PAN".
panx_subsets = [
    config_name
    for config_name in xtreme_subsets
    if config_name.startswith("PAN")
]
# Preview the first five matches.
panx_subsets[:5]

['PAN-X.af', 'PAN-X.ar', 'PAN-X.bg', 'PAN-X.bn', 'PAN-X.de']

In [4]:
# Build an imbalanced multilingual dataset from the XTREME PAN-X subsets:
# each language keeps only a fraction of its rows, with German dominating.
from collections import defaultdict
from datasets import DatasetDict, load_dataset

# A missing language key is created on first access as an empty DatasetDict.
panx_ch = defaultdict(DatasetDict)

langs = ["de", "fr", "it", "en"]
fracs = [0.6, 0.2, 0.1, 0.1]
# zip() silently drops unmatched items, so fail loudly if the lists drift apart.
assert len(langs) == len(fracs), "langs and fracs must have the same length"

# Fixed seed: without it every kernel restart keeps a different random subset,
# so example rows and tag counts shown further down would change between runs.
SHUFFLE_SEED = 0

for lang, frac in zip(langs, fracs):
    # Load the monolingual corpus (all of its splits).
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # Shuffle reproducibly, then keep the first `frac` share of each split.
    for split in ds:
        panx_ch[lang][split] = (
            ds[split]
            .shuffle(seed=SHUFFLE_SEED)
            .select(range(int(frac * ds[split].num_rows))))

In [5]:
import pandas as pd

# Number of training rows kept for each language after downsampling,
# shown as a one-row table with one column per language.
train_sizes = {code: [panx_ch[code]["train"].num_rows] for code in langs}
pd.DataFrame(train_sizes, index=["num_training_examples"])

Unnamed: 0,de,fr,it,en
num_training_examples,12000,4000,2000,2000


In [None]:
# Print every field of the first German training example.
# A sentence presumably seen at this position in an earlier run (row order
# comes from the shuffle used to build panx_ch, so verify against the output):
#   "Das einzige Modell war eine viertürige Limousine."
#   English: "The only model was a four-door sedan."
de_train = panx_ch["de"]["train"]
element = de_train[0]
for key, value in element.items():
    print(f"{key}: {value}")

In [None]:
# List each column of the German training split next to its declared feature.
de_train_features = panx_ch["de"]["train"].features
for key, value in de_train_features.items():
    print(f"{key}: {value}")

In [None]:
# Pull out the per-token feature of the `ner_tags` column; its `int2str`
# method is what later cells use to turn tag ids into tag names.
ner_tags_column = panx_ch["de"]["train"].features["ner_tags"]
tags = ner_tags_column.feature
tags

In [None]:
def create_tag_names(example):
    """Build a `ner_tags_str` column with the tag name for each id in `ner_tags`.

    `map` is called without `batched=True` below, so this presumably receives
    one example at a time and `example["ner_tags"]` is that example's id list.
    """
    return {"ner_tags_str": [tags.int2str(tag_id) for tag_id in example["ner_tags"]]}


# Add the human-readable tag names to every split of the German corpus.
panx_de = panx_ch["de"].map(create_tag_names)

# Show the tokens of the first training example aligned with their tag names.
de_example = panx_de["train"][0]
pd.DataFrame(
    [de_example["tokens"], de_example["ner_tags_str"]],
    index=["tokens", "tags"],
)

In [None]:
from collections import Counter

# For each split, count how many entities of each type occur. Only `B-` tags
# are counted, so an entity spanning several tokens is counted once.
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    entity_counts = Counter(
        tag.split("-")[1]
        for row in dataset["ner_tags_str"]
        for tag in row
        if tag.startswith("B")
    )
    # Register a split only when it contains at least one entity.
    if entity_counts:
        split2freqs[split] = entity_counts

# One row per split, one column per entity type.
pd.DataFrame.from_dict(split2freqs, orient="index")

# 2. Multilingual Transformers

# 3. A Closer Look at Tokenization

# 4. Transformers for Named Entity Recognition

# 5. The Anatomy of the Transformers Model Class

# 6. Tokenizing Texts for NER

# 7. Performance Measures

# 8. Fine-Tuning XLM-RoBERTa

# 9. Error Analysis

# 10. Cross-Lingual Transfer