## Fine-tuning mBERT on SQuAD with task adapters

In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("squad")
#raw_datasets['train'] = raw_datasets['train'].shard(num_shards=40, index=0)
#raw_datasets['validation'] = raw_datasets['validation'].shard(num_shards=10, index=0)

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset squad (/home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 746.58it/s]


### Preprocessing the training dataset

In [2]:
from transformers import AutoTokenizer

model_checkpoint = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
max_length = 512
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    
    # Tokenize question and context together into one input,
    # they will seperable with a special token between them.
    # Tokenizer will also split context into multiple chuncks,
    # if the max_length is exceeded.
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # offset_mapping is the result of the split into
    # multiple chunks
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # This step is required to see if an answer is inside
    # the context chunks or not. It labels the multiple chunks
    # generated by the tokenizer into either not having
    # the answer, or where the answer is located
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    
    # It's the same thing as with the preprocess_training_examples
    # tokenizer, but here 
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [5]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Loading cached processed dataset at /home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-92828b7751ed853e.arrow
Loading cached processed dataset at /home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-6e87d240902843c3.arrow


In [6]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

train_dataset.set_format("torch")
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

validation_dataloader = DataLoader(
    validation_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

### Computing the metrics

In [7]:
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

In [8]:
adapter_name = 'squad_adapter'
# model.add_adapter(adapter_name)
model.load_adapter("./adapter_qa_2ep/")
model.train_adapter(adapter_name)

In [9]:
print(model)

BertForQuestionAnswering(
  (shared_parameters): ModuleDict()
  (bert): BertModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (prefix_tuning): PrefixTuningShim(
                (pool): PrefixTu

In [10]:
model.set_active_adapters(adapter_name)

In [11]:
batch_size = 8

In [12]:
from transformers import TrainingArguments

In [13]:
model_name = model_checkpoint.split("/")[-1]
args = TrainingArguments(
    f"{model_name}-adapter-squad",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01
)

In [14]:
from transformers import AdapterTrainer

In [15]:
trainer = AdapterTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=default_data_collator,
    tokenizer=tokenizer
)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 87767
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 21942
  2%|▏         | 500/21942 [04:14<3:01:41,  1.97it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-500/tokenizer_config.json
Special tokens file saved in bert-base

{'loss': 0.8772, 'learning_rate': 0.0, 'epoch': 0.05}


  5%|▍         | 1000/21942 [08:16<2:45:58,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-1000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1000/special_tokens_map.json


{'loss': 0.9081, 'learning_rate': 0.0, 'epoch': 0.09}


  7%|▋         | 1500/21942 [12:14<2:42:01,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-1500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-1500/special_tokens_map.json


{'loss': 0.8919, 'learning_rate': 0.0, 'epoch': 0.14}


  9%|▉         | 2000/21942 [16:13<2:38:24,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-2000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2000/special_tokens_map.json


{'loss': 0.8707, 'learning_rate': 0.0, 'epoch': 0.18}


 11%|█▏        | 2500/21942 [20:11<2:34:11,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-2500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-2500/special_tokens_map.json


{'loss': 0.9196, 'learning_rate': 0.0, 'epoch': 0.23}


 14%|█▎        | 3000/21942 [24:09<2:30:08,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-3000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3000/special_tokens_map.json


{'loss': 0.8977, 'learning_rate': 0.0, 'epoch': 0.27}


 16%|█▌        | 3500/21942 [28:08<2:26:30,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-3500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-3500/special_tokens_map.json


{'loss': 0.9183, 'learning_rate': 0.0, 'epoch': 0.32}


 18%|█▊        | 4000/21942 [32:06<2:22:24,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-4000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4000/special_tokens_map.json


{'loss': 0.9135, 'learning_rate': 0.0, 'epoch': 0.36}


 21%|██        | 4500/21942 [36:04<2:18:30,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-4500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-4500/special_tokens_map.json


{'loss': 0.8948, 'learning_rate': 0.0, 'epoch': 0.41}


 23%|██▎       | 5000/21942 [40:03<2:14:33,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-5000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5000/special_tokens_map.json


{'loss': 0.8836, 'learning_rate': 0.0, 'epoch': 0.46}


 25%|██▌       | 5500/21942 [44:01<2:10:37,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-5500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-5500/special_tokens_map.json


{'loss': 0.8705, 'learning_rate': 0.0, 'epoch': 0.5}


 27%|██▋       | 6000/21942 [48:10<2:14:51,  1.97it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-6000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6000/special_tokens_map.json


{'loss': 0.888, 'learning_rate': 0.0, 'epoch': 0.55}


 30%|██▉       | 6500/21942 [52:19<2:02:32,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-6500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-6500/special_tokens_map.json


{'loss': 0.8851, 'learning_rate': 0.0, 'epoch': 0.59}


 32%|███▏      | 7000/21942 [56:17<1:58:28,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-7000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7000/special_tokens_map.json


{'loss': 0.8967, 'learning_rate': 0.0, 'epoch': 0.64}


 34%|███▍      | 7500/21942 [1:00:15<1:54:33,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-7500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-7500/special_tokens_map.json


{'loss': 0.8921, 'learning_rate': 0.0, 'epoch': 0.68}


 36%|███▋      | 8000/21942 [1:04:13<1:50:38,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-8000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8000/special_tokens_map.json


{'loss': 0.8804, 'learning_rate': 0.0, 'epoch': 0.73}


 39%|███▊      | 8500/21942 [1:08:12<1:46:35,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-8500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-8500/special_tokens_map.json


{'loss': 0.9017, 'learning_rate': 0.0, 'epoch': 0.77}


 41%|████      | 9000/21942 [1:12:10<1:42:39,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-9000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9000/special_tokens_map.json


{'loss': 0.9061, 'learning_rate': 0.0, 'epoch': 0.82}


 43%|████▎     | 9500/21942 [1:16:08<1:38:43,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-9500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-9500/special_tokens_map.json


{'loss': 0.8834, 'learning_rate': 0.0, 'epoch': 0.87}


 46%|████▌     | 10000/21942 [1:20:07<1:34:43,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-10000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10000/special_tokens_map.json


{'loss': 0.9426, 'learning_rate': 0.0, 'epoch': 0.91}


 48%|████▊     | 10500/21942 [1:24:05<1:30:44,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-10500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-10500/special_tokens_map.json


{'loss': 0.8869, 'learning_rate': 0.0, 'epoch': 0.96}


 50%|█████     | 10971/21942 [1:27:49<1:23:37,  2.19it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping.
***** Running Evaluation *****
  Num examples = 10634
  Batch size = 8

 50%|█████     | 10971/21942 [1:32:41<1:23:37,  2.19it/s]

{'eval_runtime': 292.0483, 'eval_samples_per_second': 36.412, 'eval_steps_per_second': 4.554, 'epoch': 1.0}


 50%|█████     | 11000/21942 [1:32:55<1:27:32,  2.08it/s]  Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-11000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11000/special_tokens_map.json


{'loss': 0.9039, 'learning_rate': 0.0, 'epoch': 1.0}


 52%|█████▏    | 11500/21942 [1:36:54<1:22:49,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-11500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-11500/special_tokens_map.json


{'loss': 0.9155, 'learning_rate': 0.0, 'epoch': 1.05}


 55%|█████▍    | 12000/21942 [1:40:52<1:18:51,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-12000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12000/special_tokens_map.json


{'loss': 0.9322, 'learning_rate': 0.0, 'epoch': 1.09}


 57%|█████▋    | 12500/21942 [1:44:50<1:14:58,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-12500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-12500/special_tokens_map.json


{'loss': 0.8965, 'learning_rate': 0.0, 'epoch': 1.14}


 59%|█████▉    | 13000/21942 [1:48:49<1:10:54,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-13000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13000/special_tokens_map.json


{'loss': 0.9007, 'learning_rate': 0.0, 'epoch': 1.18}


 62%|██████▏   | 13500/21942 [1:52:47<1:06:58,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-13500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-13500/special_tokens_map.json


{'loss': 0.898, 'learning_rate': 0.0, 'epoch': 1.23}


 64%|██████▍   | 14000/21942 [1:56:45<1:03:02,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-14000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14000/special_tokens_map.json


{'loss': 0.9053, 'learning_rate': 0.0, 'epoch': 1.28}


 66%|██████▌   | 14500/21942 [2:00:44<59:03,  2.10it/s]  Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-14500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-14500/special_tokens_map.json


{'loss': 0.8998, 'learning_rate': 0.0, 'epoch': 1.32}


 68%|██████▊   | 15000/21942 [2:04:42<55:03,  2.10it/s]  Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-15000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15000/special_tokens_map.json


{'loss': 0.8896, 'learning_rate': 0.0, 'epoch': 1.37}


 71%|███████   | 15500/21942 [2:08:40<51:07,  2.10it/s]  Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-15500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-15500/special_tokens_map.json


{'loss': 0.8678, 'learning_rate': 0.0, 'epoch': 1.41}


 73%|███████▎  | 16000/21942 [2:12:39<47:10,  2.10it/s]  Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-16000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16000/special_tokens_map.json


{'loss': 0.9083, 'learning_rate': 0.0, 'epoch': 1.46}


 75%|███████▌  | 16500/21942 [2:16:37<43:10,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-16500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-16500/special_tokens_map.json


{'loss': 0.8935, 'learning_rate': 0.0, 'epoch': 1.5}


 77%|███████▋  | 17000/21942 [2:20:35<39:15,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-17000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17000/special_tokens_map.json


{'loss': 0.871, 'learning_rate': 0.0, 'epoch': 1.55}


 80%|███████▉  | 17500/21942 [2:24:34<35:14,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-17500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-17500/special_tokens_map.json


{'loss': 0.895, 'learning_rate': 0.0, 'epoch': 1.6}


 82%|████████▏ | 18000/21942 [2:28:32<31:17,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-18000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18000/special_tokens_map.json


{'loss': 0.8825, 'learning_rate': 0.0, 'epoch': 1.64}


 84%|████████▍ | 18500/21942 [2:32:30<27:19,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-18500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-18500/special_tokens_map.json


{'loss': 0.8986, 'learning_rate': 0.0, 'epoch': 1.69}


 87%|████████▋ | 19000/21942 [2:36:28<23:20,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-19000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19000/special_tokens_map.json


{'loss': 0.9006, 'learning_rate': 0.0, 'epoch': 1.73}


 89%|████████▉ | 19500/21942 [2:40:27<19:22,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-19500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-19500/special_tokens_map.json


{'loss': 0.8949, 'learning_rate': 0.0, 'epoch': 1.78}


 91%|█████████ | 20000/21942 [2:44:25<15:24,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-20000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20000/special_tokens_map.json


{'loss': 0.8945, 'learning_rate': 0.0, 'epoch': 1.82}


 93%|█████████▎| 20500/21942 [2:48:23<11:27,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-20500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-20500/special_tokens_map.json


{'loss': 0.8483, 'learning_rate': 0.0, 'epoch': 1.87}


 96%|█████████▌| 21000/21942 [2:52:22<07:28,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-21000
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21000/special_tokens_map.json


{'loss': 0.8763, 'learning_rate': 0.0, 'epoch': 1.91}


 98%|█████████▊| 21500/21942 [2:56:20<03:30,  2.10it/s]Saving model checkpoint to bert-base-multilingual-cased-adapter-squad/checkpoint-21500
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/squad_adapter/adapter_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/squad_adapter/pytorch_adapter.bin
Configuration saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/squad_adapter/head_config.json
Module weights saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/squad_adapter/pytorch_model_head.bin
tokenizer config file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/tokenizer_config.json
Special tokens file saved in bert-base-multilingual-cased-adapter-squad/checkpoint-21500/special_tokens_map.json


{'loss': 0.9005, 'learning_rate': 0.0, 'epoch': 1.96}


100%|██████████| 21942/21942 [2:59:51<00:00,  2.18it/s]The following columns in the evaluation set  don't have a corresponding argument in `BertForQuestionAnswering.forward` and have been ignored: example_id, offset_mapping.
***** Running Evaluation *****
  Num examples = 10634
  Batch size = 8

100%|██████████| 21942/21942 [3:04:43<00:00,  2.18it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 21942/21942 [3:04:43<00:00,  1.98it/s]

{'eval_runtime': 291.9771, 'eval_samples_per_second': 36.421, 'eval_steps_per_second': 4.555, 'epoch': 2.0}
{'train_runtime': 11083.1824, 'train_samples_per_second': 15.838, 'train_steps_per_second': 1.98, 'train_loss': 0.8951242328852771, 'epoch': 2.0}





TrainOutput(global_step=21942, training_loss=0.8951242328852771, metrics={'train_runtime': 11083.1824, 'train_samples_per_second': 15.838, 'train_steps_per_second': 1.98, 'train_loss': 0.8951242328852771, 'epoch': 2.0})

In [19]:
model.save_adapter("./adapter_qa_6ep/", adapter_name)

Configuration saved in ./adapter_qa_6ep/adapter_config.json
Module weights saved in ./adapter_qa_6ep/pytorch_adapter.bin
Configuration saved in ./adapter_qa_6ep/head_config.json
Module weights saved in ./adapter_qa_6ep/pytorch_model_head.bin


In [8]:
adapter_checkpoint = "./adapter_qa_6ep/"
adapter_name = 'squad_adapter'
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model.load_adapter(adapter_checkpoint)
model.set_active_adapters(adapter_name)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-bas

## Testing the fine-tuned model

In [9]:
from transformers import pipeline

# Replace this with your own checkpoint
# model_checkpoint = adapter_checkpoint
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

context = """
? Transformers is backed by the three most popular deep learning libraries ? Jax, PyTorch and TensorFlow ? with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "What is backed by deep learning libraries?"
question_answerer(question=question, context=context)

{'score': 0.7918965220451355, 'start': 3, 'end': 15, 'answer': 'Transformers'}

In [10]:
from tqdm import tqdm

## Validating using XQuAD

In [11]:
def get_predictions(dataset):
    
    predictions = []
    for example in tqdm(dataset):
        question = example['question']
        context = example['context']
        prediction = question_answerer(question=question, context=context)

        predictions.append(prediction)
    
    return predictions

In [12]:
# Need to convert the variables so that they can be used by the evaluation.compute function
def convert_for_evaluation(predictions, examples):
    ref = []
    pred = []
    for i, id in enumerate(examples['id']):
        ref.append({
            'answers': examples['answers'][i],
            'id': examples['id'][i]
        })
        pred.append({
            'prediction_text': predictions[i]['answer'],
            'id': examples['id'][i]
        })
        
    return pred, ref

In [13]:
from datasets import load_dataset
import evaluate

squad_metric = evaluate.load("squad")
results = {}

languages = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]
for lang in languages:
    dataset = load_dataset("xquad", 'xquad.' + lang)['validation']
    
    print('Running predictions for', lang)
    predictions = get_predictions(dataset)

    predictions, references = convert_for_evaluation(predictions, dataset) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results[lang] = res

Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 778.31it/s]


Running predictions for en


  tensor = as_tensor(value)
  p_mask = np.asarray(
100%|██████████| 1190/1190 [04:01<00:00,  4.92it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 623.60it/s]


Running predictions for es


100%|██████████| 1190/1190 [04:17<00:00,  4.63it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.de/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 832.20it/s]


Running predictions for de


100%|██████████| 1190/1190 [03:58<00:00,  4.99it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.el/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 762.60it/s]


Running predictions for el


100%|██████████| 1190/1190 [08:01<00:00,  2.47it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 780.77it/s]


Running predictions for ru


100%|██████████| 1190/1190 [04:48<00:00,  4.13it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 732.12it/s]


Running predictions for tr


100%|██████████| 1190/1190 [04:28<00:00,  4.44it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 580.21it/s]


Running predictions for ar


100%|██████████| 1190/1190 [05:43<00:00,  3.47it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 731.35it/s]


Running predictions for vi


100%|██████████| 1190/1190 [04:09<00:00,  4.77it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 699.28it/s]


Running predictions for zh


100%|██████████| 1190/1190 [04:33<00:00,  4.35it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 741.04it/s]


Running predictions for hi


100%|██████████| 1190/1190 [06:07<00:00,  3.24it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 765.94it/s]


Running predictions for ro


100%|██████████| 1190/1190 [04:54<00:00,  4.04it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 734.55it/s]


Running predictions for th


100%|██████████| 1190/1190 [11:55<00:00,  1.66it/s]


#### Results for Adapter after 6 Epochs of Training

In [14]:
results

{'en': {'exact_match': 70.67226890756302, 'f1': 82.5247970156043},
 'es': {'exact_match': 55.04201680672269, 'f1': 73.57229446436855},
 'de': {'exact_match': 56.30252100840336, 'f1': 71.14567157643546},
 'el': {'exact_match': 43.109243697478995, 'f1': 56.378219636212144},
 'ru': {'exact_match': 52.52100840336134, 'f1': 68.55032134571113},
 'tr': {'exact_match': 36.134453781512605, 'f1': 52.10511404616692},
 'ar': {'exact_match': 43.69747899159664, 'f1': 60.071794671501856},
 'vi': {'exact_match': 47.64705882352941, 'f1': 68.2884260182452},
 'zh': {'exact_match': 47.64705882352941, 'f1': 56.072629051620595},
 'hi': {'exact_match': 40.08403361344538, 'f1': 53.37206132647197},
 'ro': {'exact_match': 57.64705882352941, 'f1': 71.08178076607291},
 'th': {'exact_match': 35.04201680672269, 'f1': 42.62885154061629}}

#### Results for Adapter after 4 Epochs of training

In [13]:
results

{'en': {'exact_match': 70.67226890756302, 'f1': 82.5247970156043},
 'es': {'exact_match': 55.04201680672269, 'f1': 73.57229446436855},
 'de': {'exact_match': 56.30252100840336, 'f1': 71.14567157643546},
 'el': {'exact_match': 43.109243697478995, 'f1': 56.378219636212144},
 'ru': {'exact_match': 52.52100840336134, 'f1': 68.55032134571113},
 'tr': {'exact_match': 36.134453781512605, 'f1': 52.10511404616692},
 'ar': {'exact_match': 43.69747899159664, 'f1': 60.071794671501856},
 'vi': {'exact_match': 47.64705882352941, 'f1': 68.2884260182452},
 'zh': {'exact_match': 47.64705882352941, 'f1': 56.072629051620595},
 'hi': {'exact_match': 40.08403361344538, 'f1': 53.37206132647197},
 'ro': {'exact_match': 57.64705882352941, 'f1': 71.08178076607291},
 'th': {'exact_match': 35.04201680672269, 'f1': 42.62885154061629}}

#### Results for Adapter after 2 Epochs of training

In [13]:
results

{'en': {'exact_match': 69.07563025210084, 'f1': 80.99319834889582},
 'es': {'exact_match': 54.78991596638655, 'f1': 72.82869266842113},
 'de': {'exact_match': 53.78151260504202, 'f1': 69.47479459163338},
 'el': {'exact_match': 43.109243697478995, 'f1': 56.37360689448754},
 'ru': {'exact_match': 52.10084033613445, 'f1': 67.37157635980115},
 'tr': {'exact_match': 34.78991596638655, 'f1': 50.36518862309762},
 'ar': {'exact_match': 43.36134453781513, 'f1': 59.31706160923957},
 'vi': {'exact_match': 47.47899159663866, 'f1': 67.62077693115245},
 'zh': {'exact_match': 47.226890756302524, 'f1': 55.9277711084433},
 'hi': {'exact_match': 38.65546218487395, 'f1': 52.36370796776133},
 'ro': {'exact_match': 56.80672268907563, 'f1': 70.42534043163606},
 'th': {'exact_match': 36.470588235294116, 'f1': 43.673269307723125}}