## Evaluating XLM Roberta Base in MAD-X setting

### Setup

In [13]:
from transformers import AutoTokenizer

model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream tas

In [3]:
import transformers.adapters.composition as ac
import transformers.adapters.configuration as cf

adapter_checkpoint = "./adapter_qa_xlm_4ep/"
adapter_name = 'squad_adapter'
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model.load_adapter(adapter_checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream tas

'squad_adapter'

In [27]:
from tqdm import tqdm

In [28]:
def get_predictions(dataset):
    
    predictions = []
    for example in tqdm(dataset):
        question = example['question']
        context = example['context']
        prediction = question_answerer(question=question, context=context)

        predictions.append(prediction)
    
    return predictions

In [29]:
# Need to convert the variables so that they can be used by the evaluation.compute function
def convert_for_evaluation(predictions, examples):
    ref = []
    pred = []
    for i, id in enumerate(examples['id']):
        ref.append({
            'answers': examples['answers'][i],
            'id': examples['id'][i]
        })
        pred.append({
            'prediction_text': predictions[i]['answer'],
            'id': examples['id'][i]
        })
        
    return pred, ref

In [7]:
from transformers import pipeline
from datasets import load_dataset
import evaluate

squad_metric = evaluate.load("squad")
results = {}

languages = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]
for lang in languages:
    config = cf.AdapterConfig.load("pfeiffer", non_linearity="relu", reduction_factor=2)
    model.load_adapter(
        'en/wiki@ukp',
        config='pfeiffer',
        model_name='xlm-roberta-base',
        load_as=f'{lang}/wiki@ukp',
        source='ah'
    )
    model.active_adapters = ac.Stack(adapter_name, f"{lang}/wiki@ukp")
    dataset = load_dataset("xquad", 'xquad.' + lang)['validation']

    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    
    print('Running predictions for', lang)
    predictions = get_predictions(dataset)

    predictions, references = convert_for_evaluation(predictions, dataset) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results[lang] = res

Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 349.70it/s]


Running predictions for en


  tensor = as_tensor(value)
  p_mask = np.asarray(
100%|██████████| 1190/1190 [05:41<00:00,  3.48it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 573.38it/s]


Running predictions for es


100%|██████████| 1190/1190 [06:02<00:00,  3.28it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.de/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 192.16it/s]


Running predictions for de


100%|██████████| 1190/1190 [05:59<00:00,  3.31it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.el/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 181.20it/s]


Running predictions for el


100%|██████████| 1190/1190 [07:47<00:00,  2.55it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 173.93it/s]


Running predictions for ru


100%|██████████| 1190/1190 [06:34<00:00,  3.01it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 207.90it/s]


Running predictions for tr


100%|██████████| 1190/1190 [05:34<00:00,  3.56it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 690.88it/s]


Running predictions for ar


100%|██████████| 1190/1190 [06:16<00:00,  3.16it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 140.41it/s]


Running predictions for vi


100%|██████████| 1190/1190 [06:20<00:00,  3.12it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 247.85it/s]


Running predictions for zh


100%|██████████| 1190/1190 [05:25<00:00,  3.66it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 201.58it/s]


Running predictions for hi


100%|██████████| 1190/1190 [07:10<00:00,  2.76it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 100.91it/s]


Running predictions for ro


100%|██████████| 1190/1190 [06:06<00:00,  3.24it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 173.08it/s]


Running predictions for th


100%|██████████| 1190/1190 [06:00<00:00,  3.30it/s]


In [8]:
results

{'en': {'exact_match': 70.0, 'f1': 81.3260141492324},
 'es': {'exact_match': 55.04201680672269, 'f1': 73.43141574034739},
 'de': {'exact_match': 56.05042016806723, 'f1': 71.65397632156589},
 'el': {'exact_match': 52.60504201680672, 'f1': 69.2665514387606},
 'ru': {'exact_match': 55.79831932773109, 'f1': 72.17426785928329},
 'tr': {'exact_match': 50.252100840336134, 'f1': 66.15281654731778},
 'ar': {'exact_match': 38.99159663865546, 'f1': 57.27048573125125},
 'vi': {'exact_match': 51.34453781512605, 'f1': 71.26370816260918},
 'zh': {'exact_match': 43.02521008403362, 'f1': 51.89612208519768},
 'hi': {'exact_match': 47.563025210084035, 'f1': 64.48767852602755},
 'ro': {'exact_match': 60.588235294117645, 'f1': 74.63924105396464},
 'th': {'exact_match': 55.79831932773109, 'f1': 65.65526210484187}}

## Evaluating mBERT Base in MAD-X setting

In [24]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [25]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

In [26]:
import transformers.adapters.composition as ac
import transformers.adapters.configuration as cf

adapter_checkpoint = "./adapter_qa_4ep/"
adapter_name = 'squad_adapter'
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model.load_adapter(adapter_checkpoint)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

'squad_adapter'

In [30]:
from transformers import pipeline
from datasets import load_dataset
import evaluate

squad_metric = evaluate.load("squad")
results = {}

languages = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]
for lang in languages:
    config = cf.AdapterConfig.load("pfeiffer", non_linearity="relu", reduction_factor=2)
    model.load_adapter(
        'en/wiki@ukp',
        config='pfeiffer',
        model_name=model_checkpoint,
        load_as=f'{lang}/wiki@ukp',
        source='ah'
    )
    model.active_adapters = ac.Stack(adapter_name, f"{lang}/wiki@ukp")
    dataset = load_dataset("xquad", 'xquad.' + lang)['validation']

    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    
    print('Running predictions for', lang)
    predictions = get_predictions(dataset)

    predictions, references = convert_for_evaluation(predictions, dataset) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results[lang] = res

Downloading: 24.2kB [00:00, 9.40MB/s]                   
Downloading: 100%|██████████| 28.2M/28.2M [00:07<00:00, 4.18MB/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 471.64it/s]


Running predictions for en


  tensor = as_tensor(value)
  p_mask = np.asarray(
100%|██████████| 1190/1190 [05:09<00:00,  3.85it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 527.25it/s]


Running predictions for es


100%|██████████| 1190/1190 [06:00<00:00,  3.30it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.de/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 601.08it/s]


Running predictions for de


100%|██████████| 1190/1190 [05:59<00:00,  3.31it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.el/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 753.83it/s]


Running predictions for el


100%|██████████| 1190/1190 [12:49<00:00,  1.55it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 594.94it/s]


Running predictions for ru


100%|██████████| 1190/1190 [07:45<00:00,  2.56it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 553.48it/s]


Running predictions for tr


100%|██████████| 1190/1190 [07:29<00:00,  2.65it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 781.35it/s]


Running predictions for ar


100%|██████████| 1190/1190 [08:16<00:00,  2.40it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 464.49it/s]


Running predictions for vi


100%|██████████| 1190/1190 [06:19<00:00,  3.14it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 506.68it/s]


Running predictions for zh


100%|██████████| 1190/1190 [07:01<00:00,  2.83it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 693.73it/s]


Running predictions for hi


100%|██████████| 1190/1190 [09:33<00:00,  2.08it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 366.60it/s]


Running predictions for ro


100%|██████████| 1190/1190 [07:46<00:00,  2.55it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 727.93it/s]


Running predictions for th


100%|██████████| 1190/1190 [18:06<00:00,  1.10it/s]


In [31]:
results

{'en': {'exact_match': 70.92436974789916, 'f1': 82.46243639739903},
 'es': {'exact_match': 53.69747899159664, 'f1': 71.03978105417272},
 'de': {'exact_match': 53.78151260504202, 'f1': 68.61935031500029},
 'el': {'exact_match': 39.91596638655462, 'f1': 53.18143870056999},
 'ru': {'exact_match': 48.57142857142857, 'f1': 64.97603221042606},
 'tr': {'exact_match': 31.26050420168067, 'f1': 46.06795492089609},
 'ar': {'exact_match': 37.142857142857146, 'f1': 52.78504211048965},
 'vi': {'exact_match': 44.53781512605042, 'f1': 64.33508909038494},
 'zh': {'exact_match': 43.529411764705884, 'f1': 51.46198479391753},
 'hi': {'exact_match': 34.03361344537815, 'f1': 47.129948699019785},
 'ro': {'exact_match': 53.445378151260506, 'f1': 67.37462869784147},
 'th': {'exact_match': 25.630252100840337, 'f1': 31.068760837668403}}