## Fine-tuning XLM-RoBERTa on SQuAD with task adapters

In [1]:
from datasets import load_dataset

# Load the SQuAD dataset; the result is indexed by split name ('train', 'validation') below.
raw_datasets = load_dataset("squad")
# Uncomment the two lines below to keep only one shard of each split
# (1 of 40 for train, 1 of 10 for validation) for quick experiments.
#raw_datasets['train'] = raw_datasets['train'].shard(num_shards=40, index=0)
#raw_datasets['validation'] = raw_datasets['validation'].shard(num_shards=10, index=0)

  from .autonotebook import tqdm as notebook_tqdm
Reusing dataset squad (/home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 533.86it/s]


### Preprocessing the training dataset

In [2]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; reused later to load the matching model.
model_checkpoint = "xlm-roberta-base"
# Tokenizer belonging to that checkpoint; used by both preprocessing functions and the QA pipeline.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [3]:
# Passed as `max_length` to the tokenizer: upper bound on tokens per (question, context) feature.
max_length = 512
# Passed as `stride` to the tokenizer; with `return_overflowing_tokens=True` this is the
# number of tokens shared between consecutive chunks of a long context.
stride = 128

def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD examples and label every chunk with answer positions.

    Args:
        examples: batch with the columns "question", "context" and "answers";
            each answers entry provides "answer_start" and "text" lists, of
            which only the first element is used.

    Returns:
        The tokenizer output (without "offset_mapping" and
        "overflow_to_sample_mapping") extended with "start_positions" and
        "end_positions", one entry per tokenized chunk. A chunk that does not
        fully contain the answer is labelled (0, 0).
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    contexts = examples["context"]

    # Question and context are encoded as one pair, separated by the
    # tokenizer's special tokens. Only the context (second sequence) is
    # truncated; a context that exceeds max_length is split into several
    # overlapping chunks.
    inputs = tokenizer(
        stripped_questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # offset_mapping: per chunk, the (start, end) character span of each token.
    # sample_map: per chunk, the index of the example it was produced from.
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions, end_positions = [], []

    # Decide for each chunk whether it contains the whole answer and, if so,
    # which tokens delimit it.
    for chunk_idx, offsets in enumerate(offset_mapping):
        answer = answers[sample_map[chunk_idx]]
        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(chunk_idx)

        # First and last token index that belong to the context (sequence id 1).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = context_start
        while sequence_ids[context_end] == 1:
            context_end += 1
        context_end -= 1

        answer_inside_chunk = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_inside_chunk:
            # Answer missing or cut off in this chunk: label is (0, 0).
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last token starting at or before the answer start.
        token_idx = context_start
        while token_idx <= context_end and offsets[token_idx][0] <= start_char:
            token_idx += 1
        start_positions.append(token_idx - 1)

        # Walk backward to the first token ending at or after the answer end.
        token_idx = context_end
        while token_idx >= context_start and offsets[token_idx][1] >= end_char:
            token_idx -= 1
        end_positions.append(token_idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
def preprocess_validation_examples(examples):
    """Tokenize a batch of validation examples, keeping what is needed to map back to text.

    Args:
        examples: batch with the columns "question", "context" and "id".

    Returns:
        The tokenizer output (without "overflow_to_sample_mapping") where
        "offset_mapping" entries of tokens outside the context are replaced
        by None, plus an "example_id" list naming the source example of
        every chunk.
    """
    stripped_questions = [question.strip() for question in examples["question"]]
    contexts = examples["context"]

    # Same tokenizer call as in preprocess_training_examples, but here no
    # answer labels are computed: the offsets are kept and each chunk is
    # tagged with the id of the example it came from.
    inputs = tokenizer(
        stripped_questions,
        contexts,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for chunk_idx in range(len(inputs["input_ids"])):
        source_idx = sample_map[chunk_idx]
        example_ids.append(examples["id"][source_idx])

        sequence_ids = inputs.sequence_ids(chunk_idx)
        chunk_offsets = inputs["offset_mapping"][chunk_idx]
        # Keep character spans only for context tokens (sequence id 1).
        masked_offsets = []
        for token_idx, span in enumerate(chunk_offsets):
            masked_offsets.append(span if sequence_ids[token_idx] == 1 else None)
        inputs["offset_mapping"][chunk_idx] = masked_offsets

    inputs["example_id"] = example_ids
    return inputs

In [5]:
# Tokenize and label the training split in batches. The original columns are
# dropped because one example can yield several chunks, so the row count changes.
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)

# Tokenize the validation split; the function adds "example_id" and keeps
# "offset_mapping" instead of computing answer labels.
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names,
)

Loading cached processed dataset at /home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-958f9bb5656a0ef1.arrow
Loading cached processed dataset at /home/mmm/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453/cache-b65529454bcbf523.arrow


In [6]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

# Have the training features returned as torch tensors (changes the format in place).
train_dataset.set_format("torch")
# "example_id" holds strings and "offset_mapping" holds None entries; neither can be
# collated into tensors, so build a model-ready copy without them.
validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
validation_set.set_format("torch")

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=8,
)

# Iterate over the tensor-only `validation_set` (not `validation_dataset`), and keep the
# original order so predictions stay aligned with `validation_dataset`'s
# "example_id" / "offset_mapping" rows.
validation_dataloader = DataLoader(
    validation_set,
    shuffle=False,
    collate_fn=default_data_collator,
    batch_size=8,
)

### Loading the model and training the adapter

In [7]:
from transformers import AutoModelForQuestionAnswering

# Load the pretrained checkpoint with a question-answering head. The log output of this
# cell reports the `qa_outputs` weights as newly initialized, i.e. the head is untrained.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream tas

In [8]:
# Name under which the task adapter is registered on the model.
adapter_name = 'squad_adapter'
# Add a new adapter with that name.
model.add_adapter(adapter_name)
# Alternative: load a previously saved adapter instead of starting from scratch.
# model.load_adapter("./adapter_qa_2ep/")
# Select this adapter for training.
# NOTE(review): in adapter-transformers this is expected to freeze the base model
# weights so only the adapter (and head) are updated; confirm for the installed version.
model.train_adapter(adapter_name)

In [9]:
# Print the module tree to inspect the model structure after adding the adapter.
print(model)

XLMRobertaForQuestionAnswering(
  (shared_parameters): ModuleDict()
  (roberta): RobertaModel(
    (shared_parameters): ModuleDict()
    (invertible_adapters): ModuleDict()
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
              (prefix_tuning): PrefixTu

In [10]:
# Activate the adapter so it is used in the model's forward pass.
model.set_active_adapters(adapter_name)

In [11]:
# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size = 8

In [12]:
from transformers import TrainingArguments

In [13]:
# Last path component of the checkpoint name, used to build the output directory name.
model_name = model_checkpoint.split("/")[-1]
# Training configuration; the first positional argument is the output directory.
# NOTE(review): the recorded run of trainer.train() ended in "CUDA out of memory" on a
# ~6 GiB GPU with this batch size and 512-token inputs; a smaller batch size may be
# required on similar hardware.
args = TrainingArguments(
    f"{model_name}-adapter-squad",
    evaluation_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01
)

In [14]:
from transformers import AdapterTrainer

In [15]:
# Trainer variant from the adapter library, wired to the tokenized datasets.
# NOTE(review): `validation_dataset` comes from preprocess_validation_examples, which
# does not produce "start_positions"/"end_positions", so per-epoch evaluation
# presumably cannot report a loss; confirm this is intended.
trainer = AdapterTrainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    data_collator=default_data_collator,
    tokenizer=tokenizer
)

In [16]:
# Run the training loop configured above (2 epochs according to `args`).
trainer.train()

***** Running training *****
  Num examples = 87872
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 21968
  0%|          | 0/21968 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 5.93 GiB total capacity; 4.82 GiB already allocated; 70.62 MiB free; 4.88 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [19]:
# Persist the adapter (and, per the log output below, the prediction head) to disk.
# NOTE(review): the directory name says "6ep" while `args` sets num_train_epochs=2 and a
# later cell loads "./adapter_qa_xlm_4ep/"; verify which training run this path refers to.
model.save_adapter("./adapter_qa_6ep/", adapter_name)

Configuration saved in ./adapter_qa_6ep/adapter_config.json
Module weights saved in ./adapter_qa_6ep/pytorch_adapter.bin
Configuration saved in ./adapter_qa_6ep/head_config.json
Module weights saved in ./adapter_qa_6ep/pytorch_model_head.bin


In [15]:
# Directory of a previously saved adapter to evaluate.
adapter_checkpoint = "./adapter_qa_xlm_4ep/"
# Must match the name the adapter was saved under, since it is activated by name below.
adapter_name = 'squad_adapter'
# Start again from the plain pretrained checkpoint, then attach the saved adapter.
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model.load_adapter(adapter_checkpoint)
# Activate the loaded adapter for inference.
model.set_active_adapters(adapter_name)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream tas

## Testing the fine-tuned model

In [16]:
from transformers import pipeline

# Replace this with your own checkpoint
# model_checkpoint = adapter_checkpoint
# Wrap the adapter-equipped model and its tokenizer in a question-answering pipeline.
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Small smoke test: one English context/question pair.
context = """
? Transformers is backed by the three most popular deep learning libraries ? Jax, PyTorch and TensorFlow ? with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "What is backed by deep learning libraries?"
# The result is a dict with 'score', 'start', 'end' and 'answer' (see the cell output).
question_answerer(question=question, context=context)

{'score': 0.5330999493598938, 'start': 3, 'end': 15, 'answer': 'Transformers'}

In [17]:
from tqdm import tqdm

## Validating using XQuAD

In [18]:
def get_predictions(dataset):
    """Run the question-answering pipeline on every example of `dataset`.

    Args:
        dataset: iterable of examples providing 'question' and 'context'.

    Returns:
        List with one pipeline result per example, in dataset order.
    """
    return [
        question_answerer(question=example['question'], context=example['context'])
        for example in tqdm(dataset)
    ]

In [19]:
# Need to convert the variables so that they can be used by the evaluation.compute function
def convert_for_evaluation(predictions, examples):
    """Reshape predictions and gold answers into the SQuAD metric's input format.

    Args:
        predictions: sequence of pipeline results, one per example and in the
            same order as `examples`; each must provide an 'answer' entry.
        examples: dataset (or mapping of columns) with 'id' and 'answers'.

    Returns:
        Tuple `(pred, ref)`: `pred` is a list of
        `{'prediction_text', 'id'}` dicts and `ref` a list of
        `{'answers', 'id'}` dicts, both in example order.

    Raises:
        IndexError: if `predictions` has fewer entries than `examples`.
    """
    # Read each column once: indexing a column by name inside the loop would
    # re-read the whole column on every iteration.
    example_ids = examples['id']
    gold_answers = examples['answers']

    ref = []
    pred = []
    for i, example_id in enumerate(example_ids):
        ref.append({
            'answers': gold_answers[i],
            'id': example_id
        })
        pred.append({
            'prediction_text': predictions[i]['answer'],
            'id': example_id
        })

    return pred, ref

In [20]:
from datasets import load_dataset
import evaluate

# SQuAD metric; the results below show it reporting 'exact_match' and 'f1'.
squad_metric = evaluate.load("squad")
# Maps language code -> metric dict returned by squad_metric.compute.
results = {}

# XQuAD language configurations to evaluate.
languages = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]
for lang in languages:
    # Each language is a separate dataset configuration; its 'validation' split is used.
    dataset = load_dataset("xquad", 'xquad.' + lang)['validation']
    
    print('Running predictions for', lang)
    # One pipeline call per example (about 5-7 minutes per language in the recorded run).
    predictions = get_predictions(dataset)

    # `predictions` is rebound here from raw pipeline outputs to the metric's input format.
    predictions, references = convert_for_evaluation(predictions, dataset) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results[lang] = res

Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.en/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 486.63it/s]


Running predictions for en


100%|██████████| 1190/1190 [05:01<00:00,  3.95it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.es/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 509.51it/s]


Running predictions for es


100%|██████████| 1190/1190 [05:45<00:00,  3.44it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.de/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 738.82it/s]


Running predictions for de


100%|██████████| 1190/1190 [05:31<00:00,  3.59it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.el/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 687.14it/s]


Running predictions for el


100%|██████████| 1190/1190 [07:21<00:00,  2.70it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ru/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 577.97it/s]


Running predictions for ru


100%|██████████| 1190/1190 [05:50<00:00,  3.40it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.tr/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 491.48it/s]


Running predictions for tr


100%|██████████| 1190/1190 [05:05<00:00,  3.89it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ar/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 709.22it/s]


Running predictions for ar


100%|██████████| 1190/1190 [05:40<00:00,  3.50it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.vi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 570.89it/s]


Running predictions for vi


100%|██████████| 1190/1190 [05:10<00:00,  3.84it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.zh/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 493.22it/s]


Running predictions for zh


100%|██████████| 1190/1190 [04:42<00:00,  4.21it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.hi/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 422.81it/s]


Running predictions for hi


100%|██████████| 1190/1190 [06:19<00:00,  3.14it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.ro/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 778.74it/s]


Running predictions for ro


100%|██████████| 1190/1190 [06:04<00:00,  3.26it/s]
Reusing dataset xquad (/home/mmm/.cache/huggingface/datasets/xquad/xquad.th/1.0.0/39e1ff0497cbbfb79bbff61024031c10872bbd7c4fd8bc250207a965c39d3336)
100%|██████████| 1/1 [00:00<00:00, 627.23it/s]


Running predictions for th


100%|██████████| 1190/1190 [05:45<00:00,  3.44it/s]


#### Results for Adapter after 4 Epochs of training

In [21]:
# Display the per-language exact-match / F1 scores collected by the evaluation loop.
results

{'en': {'exact_match': 70.58823529411765, 'f1': 81.92807378059324},
 'es': {'exact_match': 56.38655462184874, 'f1': 74.44198371555919},
 'de': {'exact_match': 55.378151260504204, 'f1': 71.78426388009316},
 'el': {'exact_match': 53.36134453781513, 'f1': 69.88288466249213},
 'ru': {'exact_match': 55.46218487394958, 'f1': 72.2157329253889},
 'tr': {'exact_match': 51.00840336134454, 'f1': 66.92431239875614},
 'ar': {'exact_match': 40.0, 'f1': 57.85652749543065},
 'vi': {'exact_match': 50.924369747899156, 'f1': 71.26272470130272},
 'zh': {'exact_match': 42.94117647058823, 'f1': 51.761668303685106},
 'hi': {'exact_match': 48.23529411764706, 'f1': 65.4816327282602},
 'ro': {'exact_match': 60.252100840336134, 'f1': 74.66969292814782},
 'th': {'exact_match': 55.54621848739496, 'f1': 65.60099797494749}}

#### Results for Adapter after 2 Epochs of training

In [15]:
# NOTE(review): this cell's execution count (15) is out of sequence, so the output below
# appears to be kept from an earlier run with the 2-epoch adapter; re-running the
# notebook top to bottom would show the same values as the previous results cell.
results

{'en': {'exact_match': 68.31932773109244, 'f1': 79.57992577694824},
 'es': {'exact_match': 53.445378151260506, 'f1': 71.89479162287164},
 'de': {'exact_match': 53.109243697478995, 'f1': 69.68981945138748},
 'el': {'exact_match': 51.00840336134454, 'f1': 67.5047974154513},
 'ru': {'exact_match': 53.78151260504202, 'f1': 70.08281167508588},
 'tr': {'exact_match': 47.64705882352941, 'f1': 64.56189143969652},
 'ar': {'exact_match': 40.252100840336134, 'f1': 57.207838767946996},
 'vi': {'exact_match': 50.33613445378151, 'f1': 69.86261372946828},
 'zh': {'exact_match': 41.260504201680675, 'f1': 50.05538579067991},
 'hi': {'exact_match': 47.3109243697479, 'f1': 64.38864468134756},
 'ro': {'exact_match': 58.90756302521008, 'f1': 73.32993999535823},
 'th': {'exact_match': 55.21008403361345, 'f1': 65.07878909139407}}