## Question Answering LLM Fine-tuning

### Todo: ensure these are in requirements.txt and version compatability

In [12]:
import os
import sys
import pandas
import pickle
import json
import torch
import numpy
import warnings
warnings.filterwarnings('ignore') #Some operations warn inside a loop

## Listing 14.8

In [13]:
def get_processor_type():
    gpu_device = torch.device("cuda:0")
    cpu_device = torch.device("cpu")
    type = gpu_device if gpu_device else cpu_device
    return type

In [14]:
get_processor_type()

device(type='cuda', index=0)

### Grab a pre-generated copy of the golden set in case you skipped training it in Listing 14.7

In [15]:
![ ! -d "question-answering" ] && git clone --depth=1 https://github.com/ai-powered-search/question-answering
![ -d "question-answering" ] && cd question-answering && git pull 
!mkdir -p data

Cloning into 'question-answering'...
remote: Enumerating objects: 16, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 16 (delta 2), reused 14 (delta 2), pack-reused 0[K
Receiving objects: 100% (16/16), 92.27 KiB | 1.28 MiB/s, done.
Resolving deltas: 100% (2/2), done.
Already up to date.


In [16]:
import transformers
tokenizer = transformers.RobertaTokenizerFast.from_pretrained('roberta-base')
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
tokenizer

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

## Listing 14.9

### Hyperparameter alert!

Hyperparameters are serious business.  Memory and Computation resources are very very finite.  We do our best to limit visible scope, both for the model and for the speed.  We also need to do this since the tensors we use during training and evaluation must have a fixed shape.  This shape must be the same for all examples we provide to the trainer and evaluator.

We accomplish this with a window sliding technique and by right-padding.  Windowing and padding will make sure everything is the same shape.

In [25]:
#This method adopted from the following example notebook:
#https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
#Copyright 2021, Huggingface.  Apache 2.0 license.
import datasets, transformers

file = "../data/outdoors/question-answering-training-set"
datadict = datasets.load_from_disk(file)

def tokenize_dataset(examples):

    maximum_tokens = 384 # This will be the number of tokens in BOTH the question and context
    document_overlap = 128 # Sometimes we need to split the context into smaller chunks, so we will overlap with this window
    pad_on_right = tokenizer.padding_side == "right"
    
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=maximum_tokens,
        stride=document_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    print(tokenized_examples[0])

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples
"""
To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the map method of our dataset object we created earlier. 
This will apply the function on all the elements of all the splits in dataset, so our training, validation and testing data will be preprocessed in one single command. 
Since our preprocessing changes the number of samples, we need to remove the old columns when applying it.
 --Huggingface
"""
tokenized_datasets = datadict.map(tokenize_dataset, batched=True, remove_columns=datadict["train"].column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


  0%|          | 0/1 [00:00<?, ?ba/s]

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


  0%|          | 0/1 [00:00<?, ?ba/s]

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


In [26]:
tokenized_datasets.save_to_disk("../data/question-answering/qa-training-set-tokenized")

## Listing 14.10

In [27]:
from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
import torch

model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

training_args = TrainingArguments(
    evaluation_strategy="epoch",                        # evaluate loss per epoch
    num_train_epochs=3,                                   # total # of training epochs
    per_device_train_batch_size=16,                       # batch size per device during training
    per_device_eval_batch_size=64,                        # batch size for evaluation
    warmup_steps=500,                                     # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                    # strength of weight decay
    logging_dir="../data/question-answering/logs",        # directory for storing logs
    output_dir="../data/question-answering/results")     # output directory

trainer = Trainer(
    model=model,                                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                                   # training arguments, defined above
    data_collator=default_data_collator,                  
    tokenizer=tokenizer,                                  
    train_dataset=tokenized_datasets['train'],            # training dataset
    eval_dataset=tokenized_datasets['test'])               # evaluation dataset

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/cbf50ba81465d4d8676b8bab348e31835147541b/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/jovyan/.cache/huggingface

## Listing 14.11

In [28]:
trainer.train()

***** Running training *****
  Num examples = 160
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 30
  Number of trainable parameters = 124056578


Epoch,Training Loss,Validation Loss
1,No log,2.462209
2,No log,2.278262
3,No log,2.195303


***** Running Evaluation *****
  Num examples = 45
  Batch size = 64
***** Running Evaluation *****
  Num examples = 45
  Batch size = 64
***** Running Evaluation *****
  Num examples = 45
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=30, training_loss=2.3616045633951823, metrics={'train_runtime': 337.7788, 'train_samples_per_second': 1.421, 'train_steps_per_second': 0.089, 'total_flos': 94066832424960.0, 'train_loss': 2.3616045633951823, 'epoch': 3.0})

In [36]:
trainer.save_model("data/outdoors/roberta-base-squad2-outdoors")

Saving model checkpoint to data/outdoors/roberta-base-squad2-outdoors
Configuration saved in data/outdoors/roberta-base-squad2-outdoors/config.json
Model weights saved in data/outdoors/roberta-base-squad2-outdoors/pytorch_model.bin
tokenizer config file saved in data/outdoors/roberta-base-squad2-outdoors/tokenizer_config.json
Special tokens file saved in data/outdoors/roberta-base-squad2-outdoors/special_tokens_map.json


## Listing 14.12

In [30]:
trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
#{'eval_loss': 1.7851890325546265,
# 'eval_runtime': 3.1069,
# 'eval_samples_per_second': 4.828,
# 'eval_steps_per_second': 0.322,
# 'epoch': 3.0}

***** Running Evaluation *****
  Num examples = 10
  Batch size = 64


{'eval_loss': 1.8934662342071533,
 'eval_runtime': 1.7569,
 'eval_samples_per_second': 5.692,
 'eval_steps_per_second': 0.569,
 'epoch': 3.0}

## Listing 14.13

In [37]:
import tqdm
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline

device = get_processor_device()
outdoors_model = "data/outdoors/roberta-base-squad2-outdoors"
nlp2 = pipeline("question-answering", model=outdoors_model, tokenizer=outdoors_model,
                device=device)

loading configuration file data/outdoors/roberta-base-squad2-outdoors/config.json
Model config RobertaConfig {
  "_name_or_path": "data/outdoors/roberta-base-squad2-outdoors",
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading configuration file data/outdoors/roberta-base-squad2-outdoors/config.json
Model config Rob

## Listing 14.14

In [38]:
def answer_questions(examples):
    answers = []
    success = 0
    for e in examples:
        question = {"question": e["question"][0], "context": e["context"][0]}
        answer = nlp2(question)
        label = e["answers"][0]["text"][0]
        result = answer["answer"]
        print(question["question"])
        print("Label:", label)
        print("Result:", result)
        print("----------")
        success += (1 if (label == result) else 0)
        answers.append(answer)
    print(f"{success}/{len(examples)} correct")
    return answers

In [39]:
datadict["validation"].set_format(type="pandas",output_all_columns=True)
validation_examples = [example for example in datadict["validation"]]
validation_results = answer_questions(validation_examples)

What constitutes mountain exposure when hiking or scrambling?
Label: your level of risk of sustaining a serious injury or dying
Result: your level of risk of sustaining a serious injury or dying
----------
What are the advantages of a vapour barrier?
Label: everything stays dry at least from the inside
Result: everything stays dry at least from the inside
----------
When does a Rope become a Line?
Label: being involved in a sailing craft
Result: If I am going to the hardware store
----------
How to light a fire with wet firewood?
Label: using lots of extra kindling
Result: using lots of extra kindling
----------
What is a "sling belay?"
Label: A hanging belay with no stance
Result: A hanging belay with no stance
----------
What should I do for a dislocated shoulder?
Label: fix it
Result: You really need to fix it
----------
What is a "fast pack" setup for a tent?
Label: just the fly, tent poles, and usually a groundsheet
Result: just the fly, tent poles, and usually a groundsheet
-----

In [None]:
#This is an illustration of grid search.  For the Transformers builtin, see https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.hyperparameter_search

from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
import torch

def grid_search_finetuning(tokenized_datasets):
    epochs = [4]
    batches = [16, 18]
    warmups = [50, 250, 500]
  
    for epoch in epochs:
        for batch in batches:
            for warmup in warmups:
                model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
                name = "_".join(["epochs", str(epoch), "batchsize", str(batch), "warmup", str(warmup)])

                print("-----------------------------------------------\n")
                print(name)
                training_args = TrainingArguments(
                    evaluation_strategy = "epoch",                         # evaluate loss per epoch
                    num_train_epochs=epoch,                                # total # of training epochs
                    per_device_train_batch_size=batch,                     # batch size per device during training
                    per_device_eval_batch_size=64,                         # batch size for evaluation
                    warmup_steps=warmup,                                   # number of warmup steps for learning rate scheduler
                    weight_decay=0.01,                                     # strength of weight decay
                    logging_dir="../data/questionanswering/logs_" + name,  # directory for storing logs
                    output_dir="../data/questionanswering/results_" + name # output directory
                )

                trainer = Trainer(
                    model=model,                                          # the instantiated 🤗 Transformers model to be trained
                    args=training_args,                                   # training arguments, defined above
                    data_collator=default_data_collator,                  
                    tokenizer=tokenizer,                                  
                    train_dataset=tokenized_datasets["train"],            # training dataset
                    eval_dataset=tokenized_datasets["test"]               # evaluation dataset
                )

                training_outputs = trainer.train()
                print("\nTraining Loss:", training_outputs.training_loss)
                evaluation_outputs = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
                print("Evaluation Loss:", evaluation_outputs["eval_loss"])
                print(training_outputs)
                print(evaluation_outputs)

                del trainer
                del model

grid_search_finetuning(tokenized_datasets)

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--deepset--roberta-base-squad2/snapshots/e84d19c1ab20d7a5c15407f6954cef5c25d7a261/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForQuestionAnswering"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "language": "english",
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "name": "Roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}

loading weights file model.safetensors from cache at /home/jovyan/.cache/huggingface

-----------------------------------------------

epochs_4_batchsize_16_warmup_50


Epoch,Training Loss,Validation Loss
1,No log,2.089826
2,No log,1.5377


***** Running Evaluation *****
  Num examples = 44
  Batch size = 64
***** Running Evaluation *****
  Num examples = 44
  Batch size = 64


KeyboardInterrupt: 

Up next: [Question Answering Demo Application](4.question-answering-CPU-demo-application.ipynb)