<a href="https://colab.research.google.com/github/tilaboy/nlp_transformer_tutorial/blob/main/learning_notes/ch4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install datasets --quiet
!pip install transformers --quiet
!pip install tensorflow --quiet
!pip install pandas --quiet
!pip install numpy --quiet
!pip install seqeval --quiet


[K     |████████████████████████████████| 325 kB 9.5 MB/s 
[K     |████████████████████████████████| 212 kB 55.9 MB/s 
[K     |████████████████████████████████| 136 kB 45.9 MB/s 
[K     |████████████████████████████████| 77 kB 6.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 53.6 MB/s 
[K     |████████████████████████████████| 127 kB 55.4 MB/s 
[K     |████████████████████████████████| 271 kB 58.3 MB/s 
[K     |████████████████████████████████| 144 kB 61.7 MB/s 
[K     |████████████████████████████████| 94 kB 3.4 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
[K     |████████████████████████████████| 4.0 MB 7.6 MB/s 
[K     |████████████████████████████████| 6.6 MB 50.6 MB/s 
[K     |████████████████████████████████| 895 kB 63.6 MB/s

In [4]:
from datasets import load_dataset, DatasetDict
from datasets import get_dataset_config_names, concatenate_datasets
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import torch.nn as nn
from torch.nn.functional import cross_entropy
import torch
from transformers import AutoTokenizer, AutoConfig, TrainingArguments
from transformers import XLMRobertaConfig, Trainer
from transformers import DataCollatorForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput 
from transformers.models.roberta.modeling_roberta import RobertaModel 
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from seqeval.metrics import classification_report, f1_score


In [5]:
# Ask the hub for every configuration name of the "xtreme" dataset.
xtreme_subsets = get_dataset_config_names("xtreme")
print(f"XTREME has {len(xtreme_subsets)} configurations")

# Keep only the configurations whose name starts with "PAN".
panx_subsets = list(filter(lambda config: config.startswith("PAN"), xtreme_subsets))
print(f'nr of languages in PAN dataset: {len(panx_subsets)}')

# Print the trailing two characters of each kept configuration name.
print([config[-2:] for config in panx_subsets])

Downloading builder script:   0%|          | 0.00/9.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/23.1k [00:00<?, ?B/s]

XTREME has 183 configurations
nr of languages in PAN dataset: 40
['af', 'ar', 'bg', 'bn', 'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'he', 'hi', 'hu', 'id', 'it', 'ja', 'jv', 'ka', 'kk', 'ko', 'ml', 'mr', 'ms', 'my', 'nl', 'pt', 'ru', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'ur', 'vi', 'yo', 'zh']


In [6]:
langs = ["de", "fr", "it", "en"]
# Fraction of every split to keep for the language at the same position in `langs`.
fracs = [0.629, 0.229, 0.084, 0.059]
# Accessing a missing language key creates an empty DatasetDict on the fly.
panx_ch = defaultdict(DatasetDict)
for lang, frac in zip(langs, fracs):
    # Fetch the monolingual PAN-X corpus for this language.
    ds = load_dataset("xtreme", name=f"PAN-X.{lang}")
    # For every split: shuffle with a fixed seed, then keep the first
    # `frac` share of the rows (truncated to a whole number).
    for split, split_ds in ds.items():
        nr_to_select = int(frac * split_ds.num_rows)
        print(f'{lang}-{split}: {nr_to_select} out of {ds[split].num_rows}')
        shuffled = split_ds.shuffle(seed=0)
        panx_ch[lang][split] = shuffled.select(range(nr_to_select))


Downloading and preparing dataset xtreme/PAN-X.de (download: 223.17 MiB, generated: 9.08 MiB, post-processed: Unknown size, total: 232.25 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.de/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

de-train: 12580 out of 20000
de-validation: 6290 out of 10000
de-test: 6290 out of 10000
Downloading and preparing dataset xtreme/PAN-X.fr (download: 223.17 MiB, generated: 6.37 MiB, post-processed: Unknown size, total: 229.53 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.fr/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

fr-train: 4580 out of 20000
fr-validation: 2290 out of 10000
fr-test: 2290 out of 10000
Downloading and preparing dataset xtreme/PAN-X.it (download: 223.17 MiB, generated: 7.35 MiB, post-processed: Unknown size, total: 230.52 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.it/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

it-train: 1680 out of 20000
it-validation: 840 out of 10000
it-test: 840 out of 10000
Downloading and preparing dataset xtreme/PAN-X.en (download: 223.17 MiB, generated: 7.30 MiB, post-processed: Unknown size, total: 230.47 MiB) to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e...


Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset xtreme downloaded and prepared to /root/.cache/huggingface/datasets/xtreme/PAN-X.en/1.0.0/349258adc25bb45e47de193222f95e68a44f7a7ab53c4283b3f007208a11bf7e. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

en-train: 1180 out of 20000
en-validation: 590 out of 10000
en-test: 590 out of 10000


In [9]:
de_train = panx_ch["de"]["train"]

# Field name and value for the first example of the German training split.
first_example = de_train[0]
for attr, attr_value in first_example.items():
    print(attr, attr_value)

# Field name and declared feature type for the same split.
feature_specs = de_train.features
for attr, attr_value in feature_specs.items():
    print(attr, attr_value)

tokens ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.']
ner_tags [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0]
langs ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de']
tokens Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)
ner_tags Sequence(feature=ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None), length=-1, id=None)
langs Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)


In [12]:
# Feature object describing a single NER tag; used below to turn ids into names.
tags = panx_ch["de"]["train"].features["ner_tags"].feature
print(tags)


def create_tag_names(batch):
    """Build a ``ner_tags_str`` column with the string name of every tag id.

    ``batch`` is the object handed over by ``map``; its ``'ner_tags'`` entry
    is iterated and each element is converted with ``tags.int2str``.
    Returns a dict with the single key ``'ner_tags_str'``.
    """
    tag_names = []
    for idx in batch['ner_tags']:
        tag_names.append(tags.int2str(idx))
    return {'ner_tags_str': tag_names}


# Add the readable tag column to every split of the German corpus.
panx_de = panx_ch["de"].map(create_tag_names)
de_example = panx_de["train"][0]
print(de_example)


ClassLabel(num_classes=7, names=['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'], id=None)


  0%|          | 0/12580 [00:00<?, ?ex/s]

  0%|          | 0/6290 [00:00<?, ?ex/s]

  0%|          | 0/6290 [00:00<?, ?ex/s]

{'tokens': ['2.000', 'Einwohnern', 'an', 'der', 'Danziger', 'Bucht', 'in', 'der', 'polnischen', 'Woiwodschaft', 'Pommern', '.'], 'ner_tags': [0, 0, 0, 0, 5, 6, 0, 0, 5, 5, 6, 0], 'langs': ['de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de', 'de'], 'ner_tags_str': ['O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O', 'O', 'B-LOC', 'B-LOC', 'I-LOC', 'O']}


In [13]:
# Per split, count how many entities of each type begin (tags starting with "B").
split2freqs = defaultdict(Counter)
for split, dataset in panx_de.items():
    for row in dataset["ner_tags_str"]:
        # The entity type is the part after the "-" in a tag such as "B-LOC".
        begun_types = [tag.split("-")[1] for tag in row if tag.startswith("B")]
        # Only touch the counter when something was found, so a split
        # without any entity does not get an empty entry.
        if begun_types:
            split2freqs[split].update(begun_types)

print(split2freqs)


defaultdict(<class 'collections.Counter'>, {'train': Counter({'LOC': 6186, 'PER': 5810, 'ORG': 5366}), 'validation': Counter({'LOC': 3172, 'PER': 2893, 'ORG': 2683}), 'test': Counter({'LOC': 3180, 'PER': 3071, 'ORG': 2573})})


In [15]:
# One row per split, one column per entity type; the bare last expression
# makes the notebook render the frame.
entity_freqs_df = pd.DataFrame.from_dict(split2freqs, orient="index")
entity_freqs_df

Unnamed: 0,LOC,ORG,PER
train,6186,5366,5810
validation,3172,2683,2893
test,3180,2573,3071


In [16]:
# Checkpoint names of the two models whose tokenizers are compared.
bert_model_name, xlmr_model_name = "bert-base-cased", "xlm-roberta-base"

# Load the tokenizer that belongs to each checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)


Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

In [17]:
text = "Jack Sparrow loves New York!"

# Encode the same sentence with both tokenizers and keep the token strings.
bert_encoding = bert_tokenizer(text)
xlmr_encoding = xlmr_tokenizer(text)
bert_tokens = bert_encoding.tokens()
xlmr_tokens = xlmr_encoding.tokens()

In [18]:
# Show both tokenizations, one per line, for a side-by-side comparison.
print(bert_tokens, xlmr_tokens, sep="\n")

['[CLS]', 'Jack', 'Spa', '##rrow', 'loves', 'New', 'York', '!', '[SEP]']
['<s>', '▁Jack', '▁Spar', 'row', '▁love', 's', '▁New', '▁York', '!', '</s>']
