In [3]:
from transformers import AutoTokenizer
from transformers import TrainingArguments
from transformers import AdapterTrainer
from datasets import load_dataset, load_from_disk
from transformers import AutoModelForQuestionAnswering
from torch.utils.data import DataLoader
from transformers import default_data_collator
import evaluate
from tqdm import tqdm
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
max_length = 512
stride = 128

def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    
    # Tokenize question and context together into one input,
    # they will seperable with a special token between them.
    # Tokenizer will also split context into multiple chuncks,
    # if the max_length is exceeded.
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # offset_mapping is the result of the split into
    # multiple chunks
    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    # This step is required to see if an answer is inside
    # the context chunks or not. It labels the multiple chunks
    # generated by the tokenizer into either not having
    # the answer, or where the answer is located
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
def preprocess_validation_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    
    # It's the same thing as with the preprocess_training_examples
    # tokenizer, but here 
    inputs = tokenizer(
        questions,
        context,
        max_length=max_length,
        truncation="only_second",
        stride=stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")
    example_ids = []

    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i]
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["example_id"] = example_ids
    return inputs

In [7]:
langs = ["en", "es", "de", "el", "ru", "tr", "ar", "vi", "zh", "hi", "ro", "th"]

In [8]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForQuestionAnswering: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream tas

In [7]:
batch_size = 8
epochs = 15

In [8]:
for lang in langs:
    train = load_from_disk(f'../data/xquad_{lang}_train.hf/')
    val = load_from_disk(f'../data/xquad_{lang}_val.hf/')
    train_dataset = train.map(
        preprocess_training_examples,
        batched=True,
        remove_columns=train.column_names,
    )

    validation_dataset = val.map(
        preprocess_validation_examples,
        batched=True,
        remove_columns=val.column_names,
    )
    train_dataset.set_format("torch")
    validation_set = validation_dataset.remove_columns(["example_id", "offset_mapping"])
    validation_set.set_format("torch")

    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=8,
    )

    validation_dataloader = DataLoader(
        validation_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=8,
    )
    adapter_name = 'squad_adapter'
    # model.add_adapter(adapter_name)
    model.load_adapter("../adapter_weights/adapter_qa_en_xlm_4ep/")
    model.train_adapter(adapter_name)
    model.set_active_adapters(adapter_name)
    model_name = model_checkpoint.split("/")[-1]
    args = TrainingArguments(
        f"{model_name}-adapter-squad",
        evaluation_strategy = "epoch",
        learning_rate=1e-4,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs,
        weight_decay=0.01
    )
    trainer = AdapterTrainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=default_data_collator,
        tokenizer=tokenizer
    )
    trainer.train()
    model.save_adapter(f"../adapter_qa_en-{lang}_mbert_{epochs}", adapter_name=adapter_name)
    del train, val

100%|██████████| 1/1 [00:00<00:00,  2.81ba/s]
100%|██████████| 1/1 [00:02<00:00,  2.53s/ba]
***** Running training *****
  Num examples = 727
  Num Epochs = 15
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1365
  0%|          | 0/1365 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 5.93 GiB total capacity; 4.96 GiB already allocated; 9.44 MiB free; 5.00 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from transformers import pipeline

# Replace this with your own checkpoint
# model_checkpoint = adapter_checkpoint
question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)

context = """
? Transformers is backed by the three most popular deep learning libraries ? Jax, PyTorch and TensorFlow ? with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "What is backed by deep learning libraries?"
question_answerer(question=question, context=context)

In [9]:
def get_predictions(dataset):
    
    predictions = []
    for example in tqdm(dataset):
        question = example['question']
        context = example['context']
        prediction = question_answerer(question=question, context=context)

        predictions.append(prediction)
    
    return predictions

In [10]:
# Need to convert the variables so that they can be used by the evaluation.compute function
def convert_for_evaluation(predictions, examples):
    ref = []
    pred = []
    for i, id in enumerate(examples['id']):
        ref.append({
            'answers': examples['answers'][i],
            'id': examples['id'][i]
        })
        pred.append({
            'prediction_text': predictions[i]['answer'],
            'id': examples['id'][i]
        })
        
    return pred, ref

In [11]:
squad_metric = evaluate.load("squad")
results_en2lang = {}
results_en = {}

for lang in langs:
    test = load_from_disk(f'../data/xquad_{lang}_test.hf/')

    adapter_name = 'squad_adapter'
    model.load_adapter("../adapter_weights/adapter_qa_en_xlm_4ep/")
    model.train_adapter(adapter_name)
    model.set_active_adapters(adapter_name)

    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    print('Running predictions for', lang)
    predictions = get_predictions(test)

    predictions, references = convert_for_evaluation(predictions, test) 
    res = squad_metric.compute(predictions=predictions, references=references)
    
    results_en[lang] = res
    del res
    adapter_name = 'squad_adapter'
    model.load_adapter(f"../adapter_weights//adapter_qa_en-{lang}_xlm_15")
    model.train_adapter(adapter_name)
    model.set_active_adapters(adapter_name)

    question_answerer = pipeline("question-answering", model=model, tokenizer=tokenizer)
    print(f'Running predictions for en2{lang}')
    predictions = get_predictions(test)

    predictions, references = convert_for_evaluation(predictions, test) 
    res = squad_metric.compute(predictions=predictions, references=references)
    results_en2lang[lang] = res

Running predictions for en


  tensor = as_tensor(value)
  p_mask = np.asarray(
100%|██████████| 238/238 [01:05<00:00,  3.66it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2en


100%|██████████| 238/238 [01:01<00:00,  3.84it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for es


100%|██████████| 238/238 [01:14<00:00,  3.19it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2es


100%|██████████| 238/238 [01:16<00:00,  3.10it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for de


100%|██████████| 238/238 [01:09<00:00,  3.43it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2de


100%|██████████| 238/238 [01:09<00:00,  3.41it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for el


100%|██████████| 238/238 [01:34<00:00,  2.51it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2el


100%|██████████| 238/238 [01:40<00:00,  2.36it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for ru


100%|██████████| 238/238 [01:14<00:00,  3.18it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2ru


100%|██████████| 238/238 [01:15<00:00,  3.15it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for tr


100%|██████████| 238/238 [01:11<00:00,  3.33it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2tr


100%|██████████| 238/238 [01:09<00:00,  3.44it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for ar


100%|██████████| 238/238 [01:08<00:00,  3.48it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2ar


100%|██████████| 238/238 [01:08<00:00,  3.45it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for vi


100%|██████████| 238/238 [01:12<00:00,  3.28it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2vi


100%|██████████| 238/238 [01:13<00:00,  3.26it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for zh


100%|██████████| 238/238 [01:02<00:00,  3.83it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2zh


100%|██████████| 238/238 [01:00<00:00,  3.94it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for hi


100%|██████████| 238/238 [01:27<00:00,  2.72it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2hi


100%|██████████| 238/238 [01:26<00:00,  2.76it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for ro


100%|██████████| 238/238 [01:22<00:00,  2.87it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2ro


100%|██████████| 238/238 [01:25<00:00,  2.78it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for th


100%|██████████| 238/238 [01:18<00:00,  3.04it/s]
Overwriting existing adapter 'squad_adapter'.


Running predictions for en2th


100%|██████████| 238/238 [01:16<00:00,  3.09it/s]


In [12]:
results_en  # only using english task adapter on subset

{'en': {'exact_match': 71.00840336134453, 'f1': 81.94217476767497},
 'es': {'exact_match': 50.0, 'f1': 69.27222578395927},
 'de': {'exact_match': 47.89915966386555, 'f1': 64.73047797923144},
 'el': {'exact_match': 50.42016806722689, 'f1': 68.45820280669516},
 'ru': {'exact_match': 51.260504201680675, 'f1': 67.82701186228391},
 'tr': {'exact_match': 45.378151260504204, 'f1': 60.22276065866142},
 'ar': {'exact_match': 36.554621848739494, 'f1': 57.80402927370693},
 'vi': {'exact_match': 47.47899159663866, 'f1': 65.72356569989515},
 'zh': {'exact_match': 39.075630252100844, 'f1': 50.63025210084035},
 'hi': {'exact_match': 49.15966386554622, 'f1': 64.62146608743353},
 'ro': {'exact_match': 58.403361344537814, 'f1': 73.20674907931328},
 'th': {'exact_match': 48.739495798319325, 'f1': 57.78511404561824}}

In [13]:
results_en2lang  # using english pretrained and langspecific finetuned adapter on subset

{'en': {'exact_match': 62.60504201680672, 'f1': 76.01487374052877},
 'es': {'exact_match': 50.0, 'f1': 69.69461363736977},
 'de': {'exact_match': 50.0, 'f1': 64.25893108519593},
 'el': {'exact_match': 48.739495798319325, 'f1': 65.57442365400844},
 'ru': {'exact_match': 53.78151260504202, 'f1': 69.6589237281149},
 'tr': {'exact_match': 46.21848739495798, 'f1': 59.798713754543215},
 'ar': {'exact_match': 35.714285714285715, 'f1': 54.88073250401867},
 'vi': {'exact_match': 48.739495798319325, 'f1': 68.21127289080526},
 'zh': {'exact_match': 50.42016806722689, 'f1': 55.34547152194211},
 'hi': {'exact_match': 48.739495798319325, 'f1': 63.35872030711598},
 'ro': {'exact_match': 54.20168067226891, 'f1': 69.34822885205983},
 'th': {'exact_match': 55.88235294117647, 'f1': 62.09883953581432}}