The task is to try to finetune the question answering pipeline in Transformers.

In [1]:
import numpy as np
from datasets import load_dataset, load_metric
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from tqdm.autonotebook import tqdm

In [2]:
# load the Stanford Question Answering Dataset
raw_datasets = load_dataset("squad")
raw_datasets

Found cached dataset squad (/Users/valentine/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
# check a sample
raw_datasets["train"][1]["title"]

'University_of_Notre_Dame'

In [4]:
raw_datasets["train"][1]["context"]

'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.'

In [5]:
raw_datasets["train"][1]["question"]

'What is in front of the Notre Dame Main Building?'

In [6]:
raw_datasets["train"][1]["answers"]

{'text': ['a copper statue of Christ'], 'answer_start': [188]}

In [7]:
# create a checkpoint and a tokenizer
model_checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# create a function to find the index of an answer token
def find_answer_token_idx(ctx_start, ctx_end, ans_start_char, ans_end_char, offset):
    start_idx = 0
    end_idx = 0
    if offset[ctx_start][0] > ans_start_char or offset[ctx_end][1] < ans_end_char:
        # the target is (0, 0), nothing to do
        pass
    else:
        # find the start and end token positions
        i = ctx_start
        for start_end_char in offset[ctx_start:]:
            start, end = start_end_char
            if start == ans_start_char:
                start_idx = i
            if end == ans_end_char:
                end_idx = i
                break
            i += 1
    return start_idx, end_idx

In [9]:
max_length = 384
stride = 128

# create a function to tokenize the train set
def tokenize_fn_train(batch):
    # remove extra whitespaces
    questions = [q.strip() for q in batch['question']]
    # tokenize the data with padding
    inputs = tokenizer(questions,
                       batch['context'],
                       max_length=max_length,
                       truncation='only_second',
                       stride=stride,
                       return_overflowing_tokens=True,
                       return_offsets_mapping=True,
                       padding='max_length')
    
    offset_mapping = inputs.pop('offset_mapping')
    orig_sample_idxs = inputs.pop('overflow_to_sample_mapping')
    answers = batch['answers']
    start_idxs, end_idxs = [], []
    
    for i, offset in enumerate(offset_mapping):
        sample_idx = orig_sample_idxs[i]
        answer = answers[sample_idx]
        answer_start_char = answer['answer_start'][0]
        answer_end_char = answer_start_char + len(answer['text'][0])
        sequence_ids = inputs.sequence_ids(i)
        # find start + end of context (first 1 and last 1)
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1   
        start_idx, end_idx = find_answer_token_idx(context_start,
                                                   context_end,
                                                   answer_start_char,
                                                   answer_end_char,
                                                   offset)
        start_idxs.append(start_idx)
        end_idxs.append(end_idx)
    
    inputs['start_positions'] = start_idxs
    inputs['end_positions'] = end_idxs
    return inputs

In [10]:
train_dataset = raw_datasets["train"].map(tokenize_fn_train,
                                          batched=True,
                                          remove_columns=raw_datasets["train"].column_names)
len(raw_datasets["train"]), len(train_dataset)

  0%|          | 0/88 [00:00<?, ?ba/s]

(87599, 88729)

In [11]:
# create a function to tokenize the validation set
# the targets are not needed since they will be compared with the original answer
def tokenize_fn_validation(batch):
    # remove extra whitespaces
    questions = [q.strip() for q in batch['question']]
    # tokenize the data with padding
    inputs = tokenizer(questions,
                       batch['context'],
                       max_length=max_length,
                       truncation='only_second',
                       stride=stride,
                       return_overflowing_tokens=True,
                       return_offsets_mapping=True,
                       padding='max_length')
    
    orig_sample_idxs = inputs.pop('overflow_to_sample_mapping')
    sample_ids = []
    # rewrite offset mapping by replacing question tuples with None
    for i in range(len(inputs['input_ids'])):
        sample_idx = orig_sample_idxs[i]
        sample_ids.append(batch['id'][sample_idx])
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs['offset_mapping'][i]
        inputs['offset_mapping'][i] = [x if sequence_ids[j] == 1 else None for j, x in enumerate(offset)]
        
    inputs['sample_id'] = sample_ids    
    return inputs

In [12]:
validation_dataset = raw_datasets["validation"].map(tokenize_fn_validation,
                                                    batched=True,
                                                    remove_columns=raw_datasets["validation"].column_names)
len(raw_datasets["validation"]), len(validation_dataset)

  0%|          | 0/11 [00:00<?, ?ba/s]

(10570, 10822)

In [13]:
# load evaluation metrics
metric = load_metric("squad")

  


In [14]:
n_largest = 20
max_answer_length = 30
# create a function to compute evaluation metrics
def compute_metrics(start_logits, end_logits, processed_dataset, orig_dataset):
    sample_id2idxs = {}
    for i, id_ in enumerate(processed_dataset['sample_id']):
        if id_ not in sample_id2idxs:
            sample_id2idxs[id_] = [i]
        else:
            sample_id2idxs[id_].append(i)
            
    predicted_answers = []
    for sample in tqdm(orig_dataset):
        sample_id = sample['id']
        context = sample['context']
        # update the scores when looping through candidate answers
        best_score = float('-inf')
        best_answer = None
        # loop through the expanded input samples (fixed size context windows)
        # pick the highest probability start/end combination
        for idx in sample_id2idxs[sample_id]:
            start_logit = start_logits[idx]
            end_logit = end_logits[idx]
            offsets = processed_dataset[idx]['offset_mapping']
            start_indices = (-start_logit).argsort()
            end_indices = (-end_logit).argsort()
            
            for start_idx in start_indices[:n_largest]:
                for end_idx in end_indices[n_largest:]:
                    # skip answers not contained in context window
                    if offsets[start_idx] is None or offsets[end_idx] is None:
                        continue
                    # skip answers where end < start
                    if end_idx < start_idx:
                        continue
                    # skip answers that are too long
                    if end_idx - start_idx + 1 > max_answer_length:
                        continue
                    score = start_logit[start_idx] + end_logit[end_idx]
                    if score > best_score:
                        best_score = score
                        # find positions of start and end characters
                        first_ch = offsets[start_idx][0]
                        last_ch = offsets[end_idx][1]
                        best_answer = context[first_ch:last_ch]
        # save best answer
        predicted_answers.append({'id': sample_id, 'prediction_text': best_answer})
    # compute the metrics
    true_answers = [{'id': x['id'], 'answers': x['answers']} for x in orig_dataset]
    
    return metric.compute(predictions=predicted_answers, references=true_answers)

In [15]:
# use the model with our checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on

In [16]:
# define training arguments
args = TrainingArguments("finetuned-squad",
                         evaluation_strategy="no",
                         save_strategy="epoch",
                         learning_rate=2e-5,
                         num_train_epochs=3,
                         weight_decay=0.01)

In [17]:
# use the trainer and train the model
trainer = Trainer(model=model,
                  args=args,
                  train_dataset=train_dataset.shuffle().select(range(1000)),
                  eval_dataset=validation_dataset,
                  tokenizer=tokenizer)
trainer.train()

***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


Saving model checkpoint to finetuned-squad/checkpoint-125
Configuration saved in finetuned-squad/checkpoint-125/config.json
Model weights saved in finetuned-squad/checkpoint-125/pytorch_model.bin
tokenizer config file saved in finetuned-squad/checkpoint-125/tokenizer_config.json
Special tokens file saved in finetuned-squad/checkpoint-125/special_tokens_map.json
Saving model checkpoint to finetuned-squad/checkpoint-250
Configuration saved in finetuned-squad/checkpoint-250/config.json
Model weights saved in finetuned-squad/checkpoint-250/pytorch_model.bin
tokenizer config file saved in finetuned-squad/checkpoint-250/tokenizer_config.json
Special tokens file saved in finetuned-squad/checkpoint-250/special_tokens_map.json
Saving model checkpoint to finetuned-squad/checkpoint-375
Configuration saved in finetuned-squad/checkpoint-375/config.json
Model weights saved in finetuned-squad/checkpoint-375/pytorch_model.bin
tokenizer config file saved in finetuned-squad/checkpoint-375/tokenizer_conf

TrainOutput(global_step=375, training_loss=3.6120107421875, metrics={'train_runtime': 1384.6235, 'train_samples_per_second': 2.167, 'train_steps_per_second': 0.271, 'total_flos': 293969475072000.0, 'train_loss': 3.6120107421875, 'epoch': 3.0})

In [18]:
# make predictions and evaluate
trainer_output = trainer.predict(validation_dataset)

The following columns in the test set don't have a corresponding argument in `DistilBertForQuestionAnswering.forward` and have been ignored: offset_mapping, sample_id. If offset_mapping, sample_id are not expected by `DistilBertForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10822
  Batch size = 8


In [19]:
trainer_output

PredictionOutput(predictions=(array([[-0.42320642, -4.2525625 , -3.7858965 , ..., -6.6363916 ,
        -6.6032457 , -6.598208  ],
       [-0.4200905 , -4.2481155 , -3.756441  , ..., -6.6373706 ,
        -6.605947  , -6.5995436 ],
       [-0.42662624, -4.176383  , -5.122679  , ..., -6.5414276 ,
        -6.5389147 , -6.5705695 ],
       ...,
       [-0.5383989 , -4.5240765 , -5.186863  , ..., -6.5090423 ,
        -6.502405  , -6.480877  ],
       [-0.5170208 , -4.560294  , -5.08591   , ..., -6.5650644 ,
        -6.497587  , -6.523964  ],
       [-0.4799257 , -4.4987035 , -5.224111  , ..., -6.5554504 ,
        -6.535227  , -6.511924  ]], dtype=float32), array([[-0.60021526, -4.3629136 , -4.563427  , ..., -5.8111224 ,
        -5.834797  , -5.8093867 ],
       [-0.6049708 , -4.36707   , -4.5640197 , ..., -5.8075447 ,
        -5.830118  , -5.805607  ],
       [-0.5852223 , -4.305908  , -4.573474  , ..., -5.8769684 ,
        -5.8873005 , -5.811081  ],
       ...,
       [-0.6049764 , -4.44682

In [20]:
predictions, _, _ = trainer_output

In [21]:
predictions

(array([[-0.42320642, -4.2525625 , -3.7858965 , ..., -6.6363916 ,
         -6.6032457 , -6.598208  ],
        [-0.4200905 , -4.2481155 , -3.756441  , ..., -6.6373706 ,
         -6.605947  , -6.5995436 ],
        [-0.42662624, -4.176383  , -5.122679  , ..., -6.5414276 ,
         -6.5389147 , -6.5705695 ],
        ...,
        [-0.5383989 , -4.5240765 , -5.186863  , ..., -6.5090423 ,
         -6.502405  , -6.480877  ],
        [-0.5170208 , -4.560294  , -5.08591   , ..., -6.5650644 ,
         -6.497587  , -6.523964  ],
        [-0.4799257 , -4.4987035 , -5.224111  , ..., -6.5554504 ,
         -6.535227  , -6.511924  ]], dtype=float32),
 array([[-0.60021526, -4.3629136 , -4.563427  , ..., -5.8111224 ,
         -5.834797  , -5.8093867 ],
        [-0.6049708 , -4.36707   , -4.5640197 , ..., -5.8075447 ,
         -5.830118  , -5.805607  ],
        [-0.5852223 , -4.305908  , -4.573474  , ..., -5.8769684 ,
         -5.8873005 , -5.811081  ],
        ...,
        [-0.6049764 , -4.4468246 , -4.8

In [22]:
start_logits, end_logits = predictions

In [23]:
compute_metrics(start_logits, end_logits, validation_dataset, raw_datasets["validation"])

  0%|          | 0/10570 [00:00<?, ?it/s]

{'exact_match': 0.6433301797540208, 'f1': 12.698477764102211}

In [24]:
# save the model
trainer.save_model('qa_model')

Saving model checkpoint to qa_model
Configuration saved in qa_model/config.json
Model weights saved in qa_model/pytorch_model.bin
tokenizer config file saved in qa_model/tokenizer_config.json
Special tokens file saved in qa_model/special_tokens_map.json


In [25]:
# use the pipeline with our model
qa = pipeline("question-answering", model='qa_model')

loading configuration file qa_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "qa_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 28996
}

loading configuration file qa_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "qa_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForQuestionAnswering"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_

In [26]:
# test out the pipeline
context = "Today I went to the store to get some cookies."
question = "What did I buy?"

qa(context=context, question=question)

{'score': 0.04340851679444313, 'start': 38, 'end': 45, 'answer': 'cookies'}