In [None]:
#!python3 -m pip install --upgrade pip

In [1]:
!pip install datasets transformers
!pip install accelerate
# To run the training on TPU, you will need to uncomment the following line:
# !pip install cloud-tpu-client==0.10 torch==1.9.0 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
!apt install git-lfs
!pip install -U huggingface_hub
!pip install evaluate

Collecting datasets
  Using cached datasets-2.2.2-py3-none-any.whl (346 kB)
Collecting transformers
  Using cached transformers-4.19.4-py3-none-any.whl (4.2 MB)
Collecting xxhash
  Using cached xxhash-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (211 kB)
Collecting requests>=2.19.0
  Using cached requests-2.28.0-py3-none-any.whl (62 kB)
Collecting responses<0.19
  Using cached responses-0.18.0-py3-none-any.whl (38 kB)
Collecting pyarrow>=6.0.0
  Using cached pyarrow-8.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
Collecting dill<0.3.5
  Using cached dill-0.3.4-py2.py3-none-any.whl (86 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Using cached huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
Collecting multiprocess
  Using cached multiprocess-0.70.13-py310-none-any.whl (133 kB)
Collecting tqdm>=4.62.1
  Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Collecting fsspec[http]>=2021.05.0
  Using cached fsspec-2022.5.0-py3-none-any.whl (14

Installing collected packages: evaluate
Successfully installed evaluate-0.1.1
You should consider upgrading via the '/mount/arbeitsdaten61/studenten3/advanced-ml/2022/maurerknuples/experiments/myenv/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip list

Package              Version
-------------------- -----------
accelerate           0.9.0
aiohttp              3.8.1
aiosignal            1.2.0
argon2-cffi          21.3.0
argon2-cffi-bindings 21.2.0
asttokens            2.0.5
async-timeout        4.0.2
attrs                21.4.0
backcall             0.2.0
beautifulsoup4       4.11.1
bleach               5.0.0
certifi              2022.5.18.1
cffi                 1.15.0
charset-normalizer   2.0.12
datasets             2.2.2
debugpy              1.6.0
decorator            5.1.1
defusedxml           0.7.1
dill                 0.3.4
entrypoints          0.4
evaluate             0.1.1
executing            0.8.3
fastjsonschema       2.15.3
filelock             3.7.1
frozenlist           1.3.0
fsspec               2022.5.0
huggingface-hub      0.7.0
idna                 3.3
ipykernel            6.14.0
ipython              8.4.0
ipython-genutils     0.2.0
ipywidgets           7.7.0
jedi                 0.18.1
Jinja2               3.1.2
jsonsc

## Fine-tuning mBERT on SQuAD
We first have to fine-tune our mBERT model on the question answering (QA) task.

In [9]:
from datasets import load_dataset

# Load the SQuAD dataset; the result is indexed by split name ("train", "validation") below.
raw_datasets = load_dataset("squad")
# Optional: uncomment the two lines below to work on a small shard of each split
# for quick experiments.
#raw_datasets['train'] = raw_datasets['train'].shard(num_shards=40, index=0)
#raw_datasets['validation'] = raw_datasets['validation'].shard(num_shards=10, index=0)

Reusing dataset squad (/home/users1/knupleun/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

### Preprocessing the training dataset

In [10]:
from transformers import AutoTokenizer

# Checkpoint name of multilingual BERT (cased); the same name is reused later
# in the notebook to load the question-answering model.
model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [11]:
# Maximum number of tokens per tokenized feature (question + context chunk);
# passed as `max_length` to the tokenizer in both preprocessing functions.
max_length = 512
# Passed as `stride` to the tokenizer: the token overlap between consecutive
# chunks when a long context is split (see the tokenizer documentation).
stride = 128

def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD-style examples and label every chunk with its answer span.

    Contexts that do not fit into ``max_length`` tokens are split by the
    tokenizer into several overlapping chunks, so one input example can
    produce more than one output row.

    Args:
        examples: Batch with "question", "context" and "answers" columns.
            Each entry of "answers" holds parallel "text" and "answer_start"
            lists; only the first answer of each example is used.

    Returns:
        The tokenizer output with "start_positions" and "end_positions"
        added (one entry per chunk). A chunk that does not fully contain the
        answer, or whose example has no answer at all, is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]

    # Tokenize question and context together into one input; they are
    # separated by a special token. Only the context (second sequence) is
    # truncated, and the overflow is returned as additional chunks.
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # offset_mapping: per chunk, the (start_char, end_char) span of every token.
    # overflow_to_sample_mapping: per chunk, the index of the example it came from.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # Label each chunk: either with the token span of the answer, or with
    # (0, 0) when the answer is not (fully) inside this chunk.
    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_map[i]]

        # An example without any answer cannot be located in a chunk; label it
        # (0, 0) instead of failing on the empty list.
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token that belong to the context
        # (sequence id 1); question, special and padding tokens have other ids.
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        # The bounds check keeps the scan safe even if the context runs up to
        # the very last token of the chunk.
        while idx < len(sequence_ids) and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions:
            # last token starting at or before start_char ...
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # ... and first token ending at or after end_char.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [12]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of evaluation examples and keep what is needed to map chunks back.

    The tokenizer call is the same as in ``preprocess_training_examples``,
    but no answer labels are computed. Instead, every chunk records the id of
    the example it came from, and the offsets of all tokens that are not part
    of the context are replaced by ``None``.

    Args:
        examples: Batch with "question", "context" and "id" columns.

    Returns:
        The tokenizer output (still containing "offset_mapping") with an
        additional "example_id" entry per chunk.
    """
    stripped_questions = [question.strip() for question in examples["question"]]

    inputs = tokenizer(
        stripped_questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Index of the originating example for every produced chunk.
    chunk_to_example = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for chunk_idx in range(len(inputs["input_ids"])):
        example_ids.append(examples["id"][chunk_to_example[chunk_idx]])

        # Keep offsets only for context tokens (sequence id 1); everything
        # else is blanked out with None.
        seq_ids = inputs.sequence_ids(chunk_idx)
        inputs["offset_mapping"][chunk_idx] = [
            span if seq_ids[token_idx] == 1 else None
            for token_idx, span in enumerate(inputs["offset_mapping"][chunk_idx])
        ]

    inputs["example_id"] = example_ids
    return inputs

In [13]:
# Apply the preprocessing in batches. The original columns are removed, so the
# resulting datasets only contain what the preprocessing functions return.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Same for the validation split, using the validation-specific preprocessing.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

  0%|          | 0/88 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [14]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Switch the training features to the "torch" format for the DataLoader.
train_dataset.set_format("torch")
# Copy of the validation features without "example_id" and "offset_mapping"
# (the latter contains None entries), converted to the "torch" format as well.
# NOTE(review): validation_set is not used in the cells visible here.
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

# Shuffled training batches of 8 features each.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

### Setting up the model and training

In [15]:
from transformers import AutoModelForQuestionAnswering

# Load the pretrained checkpoint as a question-answering model. The warning in
# the cell output reports which checkpoint weights were not used and that some
# model weights were not initialized from the checkpoint.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

In [16]:
from torch.optim import AdamW

# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [18]:
from accelerate import Accelerator

# Enable fp16 mixed-precision training. `mixed_precision="fp16"` replaces the
# deprecated `fp16=True` keyword, which later accelerate releases removed.
accelerator = Accelerator(mixed_precision="fp16")
# Let accelerate wrap the model, optimizer and dataloader; the returned objects
# replace the originals for the rest of the notebook.
model, optimizer, train_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader
)

In [19]:
from transformers import get_scheduler

num_train_epochs = 2
# One optimizer update per batch, so the number of updates per epoch is the
# length of the (already prepared) dataloader.
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# "linear" learning-rate schedule without warmup, spanning all training steps.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [21]:
from tqdm.auto import tqdm
# NOTE(review): numpy and torch are imported here but not used in this cell.
import numpy as np
import torch

# One progress-bar tick per optimizer step, across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Directory the fine-tuned model and tokenizer are written to after training.
output_dir = './model'

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        # The batch carries start/end positions, so the loss is read directly
        # from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        # Backward pass goes through the accelerator rather than loss.backward().
        accelerator.backward(loss)

        # Update weights, advance the LR schedule, then clear the gradients
        # for the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/21942 [00:00<?, ?it/s]

In [22]:
# Synchronize before saving (relevant when training runs in several processes).
accelerator.wait_for_everyone()
# Get the underlying model back from the object returned by accelerator.prepare().
unwrapped_model = accelerator.unwrap_model(model)
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
# Store the tokenizer in the same directory; the directory is loaded as a
# pipeline model later in the notebook.
tokenizer.save_pretrained(output_dir)

('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json',
 './model/tokenizer.json')

## Testing the fine-tuned model

In [14]:
# Re-declares the model directory and imports that the training section already
# defined. The execution counter restarts at this point, so presumably this
# section was run in a fresh kernel without re-training (verify before relying
# on Run All order).
output_dir = './model'
from tqdm.auto import tqdm
# NOTE(review): numpy and torch are not used by the evaluation cells visible here.
import numpy as np
import torch

In [15]:
from transformers import pipeline

# Question-answering pipeline loaded from the directory the fine-tuned model
# and tokenizer were saved to.
question_answerer = pipeline("question-answering", model=output_dir)

## Validating using XQuAD

In [16]:
def get_predictions(dataset):
    """Run the question-answering pipeline on every example of ``dataset``.

    Args:
        dataset: Iterable of examples, each providing 'question' and 'context'.

    Returns:
        A list with one pipeline result per example, in dataset order.
    """
    # tqdm only wraps the iteration to display progress.
    return [
        question_answerer(question=example['question'], context=example['context'])
        for example in tqdm(dataset)
    ]

In [17]:
# Need to convert the variables so that they can be used by the evaluation.compute function
def convert_for_evaluation(predictions, examples):
    """Convert pipeline outputs and dataset examples into the metric's input format.

    Args:
        predictions: Sequence of pipeline results; the 'answer' entry of each
            is used as the predicted text. Must align index-wise with
            ``examples``.
        examples: Column-indexable collection providing 'id' and 'answers'
            columns (e.g. a dataset split).

    Returns:
        A ``(pred, ref)`` tuple of lists:
        ``pred`` holds ``{'prediction_text', 'id'}`` dicts and
        ``ref`` holds ``{'answers', 'id'}`` dicts, both in example order.
    """
    # Read each column once. Indexing a dataset by column name inside the loop
    # would rebuild the whole column on every iteration.
    example_ids = examples['id']
    answers = examples['answers']

    ref = []
    pred = []
    for i, example_id in enumerate(example_ids):
        ref.append({
            'answers': answers[i],
            'id': example_id
        })
        pred.append({
            'prediction_text': predictions[i]['answer'],
            'id': example_id
        })

    return pred, ref

In [18]:
from datasets import load_dataset
from evaluate import load

# SQuAD metric; the results shown below contain 'exact_match' and 'f1'.
squad_metric = load("squad")
# Maps language code -> metric result for that language.
results = {}

# Language codes used to build the XQuAD configuration names ("xquad.<lang>").
languages = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]
for lang in languages:
    dataset = load_dataset("xquad", 'xquad.' + lang)['validation']
    
    print('Running predictions for', lang)
    predictions = get_predictions(dataset)

    # `predictions` is rebound here: raw pipeline outputs before this line,
    # metric-formatted dicts after it.
    predictions, references = convert_for_evaluation(predictions, dataset) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results[lang] = res

Reusing dataset xquad (/home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for en


  0%|          | 0/1190 [00:00<?, ?it/s]

Reusing dataset xquad (/home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for es


  0%|          | 0/1190 [00:00<?, ?it/s]

Reusing dataset xquad (/home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.de/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for de


  0%|          | 0/1190 [00:00<?, ?it/s]

Reusing dataset xquad (/home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.el/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for el


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.ru (download: 13.32 MiB, generated: 2.04 MiB, post-processed: Unknown size, total: 15.35 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for ru


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.tr (download: 13.32 MiB, generated: 1.15 MiB, post-processed: Unknown size, total: 14.47 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for tr


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.ar (download: 13.32 MiB, generated: 1.64 MiB, post-processed: Unknown size, total: 14.96 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for ar


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.vi (download: 13.32 MiB, generated: 1.41 MiB, post-processed: Unknown size, total: 14.72 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for vi


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.zh (download: 13.32 MiB, generated: 961.17 KiB, post-processed: Unknown size, total: 14.25 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for zh


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.hi (download: 13.32 MiB, generated: 2.56 MiB, post-processed: Unknown size, total: 15.87 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for hi


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.ro (download: 13.32 MiB, generated: 1.24 MiB, post-processed: Unknown size, total: 14.55 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for ro


  0%|          | 0/1190 [00:00<?, ?it/s]

Downloading and preparing dataset xquad/xquad.th (download: 13.32 MiB, generated: 2.72 MiB, post-processed: Unknown size, total: 16.04 MiB) to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336...


Downloading data files:   0%|          | 0/12 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/12 [00:00<?, ?it/s]

Generating validation split:   0%|          | 0/1190 [00:00<?, ? examples/s]

Dataset xquad downloaded and prepared to /home/users1/knupleun/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Running predictions for th


  0%|          | 0/1190 [00:00<?, ?it/s]

In [19]:
results

{'en': {'exact_match': 72.52100840336135, 'f1': 84.63381393318429},
 'es': {'exact_match': 56.134453781512605, 'f1': 75.03973047509686},
 'de': {'exact_match': 55.378151260504204, 'f1': 71.83606003345174},
 'el': {'exact_match': 44.621848739495796, 'f1': 60.00397764273023},
 'ru': {'exact_match': 52.6890756302521, 'f1': 69.68082354979049},
 'tr': {'exact_match': 39.2436974789916, 'f1': 53.94504953241661},
 'ar': {'exact_match': 44.53781512605042, 'f1': 61.05297599918095},
 'vi': {'exact_match': 49.075630252100844, 'f1': 68.70494098227722},
 'zh': {'exact_match': 48.65546218487395, 'f1': 57.28618113912226},
 'hi': {'exact_match': 43.109243697478995, 'f1': 58.05880441958488},
 'ro': {'exact_match': 58.99159663865546, 'f1': 72.45369131036566},
 'th': {'exact_match': 36.134453781512605, 'f1': 44.38268640789652}}