In [None]:
import os
import sys
import pandas
import pickle
import json
import torch
import numpy
import warnings
warnings.filterwarnings('ignore') #Some operations warn inside a loop

## Listing 14.8

In [None]:
# Use the first CUDA GPU when one is visible; otherwise fall back to the CPU so this
# cell does not raise on a runtime without an accelerator.
# NOTE(review): the pipeline cell later in this notebook passes device=0, so the
# notebook as a whole still expects a GPU runtime.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices visible to this process (0 on a CPU-only runtime).
n_gpu = torch.cuda.device_count()
# get_device_name() is only queried when a CUDA device was actually selected.
print(torch.cuda.get_device_name() if device.type == "cuda" else "No CUDA device available")
print(device)

Tesla T4
cuda:0


In [None]:
# Grant this Colab runtime access to your Google Drive by mounting it at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')
# Base directory on the mounted drive; later cells build their dataset, model and log
# locations by appending to this prefix.
# NOTE(review): the folder is named 'ch13' while the listings in this notebook are
# numbered 14.x — confirm the path is the intended one.
path = '/content/drive/My Drive/Colab Notebooks/aips/ch13/'

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Listing 14.9

In [71]:
import datasets
# Load the previously saved question-answering dataset from the mounted drive.
datadict = datasets.load_from_disk(path+'data/question-answering-training-set')
# load_from_disk can also return a single Dataset; later cells index splits by name
# ('train', 'test', 'validation'), so fail early if this is not a DatasetDict.
assert isinstance(datadict, datasets.DatasetDict)
# Display the splits, their columns and their row counts.
datadict

DatasetDict({
    train: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'id', 'question', 'title', 'url'],
        num_rows: 125
    })
    test: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'id', 'question', 'title', 'url'],
        num_rows: 32
    })
    validation: Dataset({
        features: ['__index_level_0__', 'answers', 'context', 'id', 'question', 'title', 'url'],
        num_rows: 10
    })
})

## Listing 14.10

In [None]:
import transformers
# Load the fast tokenizer for the 'roberta-base' checkpoint.
tokenizer = transformers.RobertaTokenizerFast.from_pretrained('roberta-base')
# Guard that a fast tokenizer was loaded: the tokenization cell below requests offset
# mappings, which Hugging Face documents as a fast-tokenizer feature.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
# Display the tokenizer configuration (vocabulary size, max length, special tokens).
tokenizer

PreTrainedTokenizerFast(name_or_path='roberta-base', vocab_size=50265, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)})

### Hyperparameter alert!

Hyperparameters are serious business.  Memory and computation resources are very finite, so we do our best to limit the visible scope (how many tokens the model sees at once), both for the model's sake and for speed.  We also need to do this because the tensors we use during training and evaluation must have a fixed shape, and that shape must be the same for all examples we provide to the trainer and evaluator.

We accomplish this with a sliding-window technique and by right-padding.  Windowing and padding together make sure every example has the same shape.

In [None]:
#This method adopted from the following example notebook:
#https://github.com/huggingface/notebooks/blob/master/examples/question_answering.ipynb
#Copyright 2021, Huggingface.  Apache 2.0 license.
def tokenize_dataset(examples, maximum_tokens=384, document_overlap=128):
    """Tokenize a batch of question/context pairs and label each feature with its answer span.

    Long contexts are split into several overlapping, fixed-length features. Every
    feature gets a ``start_positions`` / ``end_positions`` token index pair; when the
    example has no answer, or the answer does not lie inside that feature's slice of
    the context, both positions point at the CLS token instead.

    Relies on the module-level ``tokenizer``.

    Args:
        examples: A batch (column name -> list of values) providing the "question",
            "context" and "answers" columns. Each "answers" entry carries
            "answer_start" and "text" lists; only the first answer is used.
        maximum_tokens: Fixed length, in tokens, of every produced feature. It covers
            BOTH the question and the context slice. Defaults to 384.
        document_overlap: Number of tokens shared by consecutive slices of the same
            context (passed to the tokenizer as ``stride``). Defaults to 128.

    Returns:
        The tokenizer output with "overflow_to_sample_mapping" and "offset_mapping"
        removed and "start_positions" / "end_positions" added (one entry per feature).
    """
    pad_on_right = tokenizer.padding_side == "right"
    # Sequence id carried by context tokens: the context is the second sequence of the
    # pair when padding on the right, the first one otherwise.
    context_sequence_id = 1 if pad_on_right else 0

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=maximum_tokens,
        stride=document_overlap,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )

    # Show the first encoding of the batch (printed once per call).
    print(tokenized_examples[0])

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != context_sequence_id:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != context_sequence_id:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                # The loop stops one token past the answer start, hence the -1.
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                # The loop stops one token before the answer end, hence the +1.
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples
"""
To apply this function on all the sentences (or pairs of sentences) in our dataset, we just use the map method of our dataset object we created earlier. 
This will apply the function on all the elements of all the splits in dataset, so our training, validation and testing data will be preprocessed in one single command. 
Since our preprocessing changes the number of samples, we need to remove the old columns when applying it.
 --Huggingface
"""
# batched=True hands tokenize_dataset a batch of examples per call; the original columns
# (taken from the 'train' split) are removed from every split for the reason quoted above.
tokenized_datasets = datadict.map(tokenize_dataset, batched=True, remove_columns=datadict["train"].column_names)

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

Encoding(num_tokens=384, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])



In [None]:
# Persist the tokenized splits on the mounted drive, next to the raw training set.
tokenized_datasets.save_to_disk(path+'data/question-answering-training-set-tokenized')

## Listing 14.11

In [None]:
from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
import torch

# Start from a RoBERTa checkpoint that is already fine-tuned for question answering
# ('deepset/roberta-base-squad2'); training below continues from those weights.
model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')

# NOTE(review): the recorded training run below reports global_step=30, far fewer than
# warmup_steps=500, so the learning-rate warmup presumably never finishes — confirm
# this is intended (the grid search at the end of the notebook also tries 50 and 250).
training_args = TrainingArguments(
    output_dir=path+'data/questionanswering/results',     # output directory
    evaluation_strategy = "epoch",                        # evaluate loss per epoch
    num_train_epochs=3,                                   # total # of training epochs
    per_device_train_batch_size=16,                       # batch size per device during training
    per_device_eval_batch_size=64,                        # batch size for evaluation
    warmup_steps=500,                                     # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                                    # strength of weight decay
    logging_dir=path+'data/questionanswering/logs'        # directory for storing logs
)

# The per-epoch evaluation runs on the 'test' split; the 'validation' split is not
# passed to the Trainer here.
trainer = Trainer(
    model=model,                                          # the instantiated 🤗 Transformers model to be trained
    args=training_args,                                   # training arguments, defined above
    data_collator=default_data_collator,                  # collator imported from transformers
    tokenizer=tokenizer,                                  # the tokenizer loaded earlier in this notebook
    train_dataset=tokenized_datasets['train'],            # training dataset
    eval_dataset=tokenized_datasets['test']               # evaluation dataset
)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=571.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=496313727.0, style=ProgressStyle(descri…




## Listing 14.12

In [None]:
# Fine-tune the model with the Trainer configured above; the recorded output lists the
# validation loss after each epoch and the final TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,No log,2.177553,1.0082,43.642
2,No log,2.011696,1.0278,42.811
3,No log,1.938573,1.0477,41.996


TrainOutput(global_step=30, training_loss=2.531823984781901, metrics={'train_runtime': 37.1978, 'train_samples_per_second': 0.806, 'total_flos': 133766734473216, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model on the mounted drive; the pipeline cell further down loads
# it back from this same directory.
trainer.save_model(path+"data/roberta-base-squad2-outdoors")

## Listing 14.13

In [None]:
# Evaluate on the 'validation' split, which the Trainer did not use during training
# (it was given the 'train' and 'test' splits).
trainer.evaluate(eval_dataset=tokenized_datasets['validation'])

{'epoch': 3.0,
 'eval_loss': 1.773618459701538,
 'eval_runtime': 0.4442,
 'eval_samples_per_second': 33.767}

## Listing 14.14

In [None]:
import tqdm
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline
# Directory written by trainer.save_model(...) earlier in this notebook; it is used as
# the source for both the model and the tokenizer of the pipeline.
outdoors_model = path+"data/roberta-base-squad2-outdoors"
# Build a question-answering pipeline around the fine-tuned model.
# NOTE(review): device=0 selects CUDA device 0 per the Transformers pipeline docs, so
# this cell needs a GPU runtime — confirm if CPU use is expected.
nlp2 = pipeline('question-answering', model=outdoors_model, tokenizer=outdoors_model,device=0)

Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/My Drive/Colab Notebooks/aips/ch13/data/roberta-base-squad2-outdoors and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Listing 14.15

In [97]:
def answer_questions(examples):
    """Run the question-answering pipeline over ``examples`` and report exact matches.

    For every example the first question and first context are sent to the
    module-level ``nlp2`` pipeline. The predicted answer text is printed next to the
    first labeled answer text, and a prediction counts as correct only when the two
    strings are equal. A final "<correct> / <total> Correct!" line is printed.

    Args:
        examples: Sequence of records where ``record['question'][0]`` and
            ``record['context'][0]`` are strings and
            ``record['answers'][0]['text'][0]`` is the labeled answer.

    Returns:
        list: The raw pipeline outputs, one per example, in input order.
    """
    predictions = []
    correct = 0
    for record in examples:
        qa_input = {
            'question': record['question'][0],
            'context': record['context'][0],
        }
        prediction = nlp2(qa_input)
        expected = record['answers'][0]['text'][0]
        predicted = prediction['answer']

        print(qa_input['question'])
        print('Label:', expected)
        print('Result:', predicted)
        print('----------')

        # Exact string equality is the only success criterion.
        if expected == predicted:
            correct += 1
        predictions.append(prediction)

    print(correct, '/', len(examples), 'Correct!')
    return predictions

In [98]:
# Switch the validation split to pandas output in place; answer_questions indexes each
# field with [0], which matches this format.
datadict['validation'].set_format(type='pandas', output_all_columns=True)
# Materialize the split into a list, then run the fine-tuned pipeline over it.
validation_examples = list(datadict['validation'])
validation_results = answer_questions(validation_examples)

How to get pine sap off my teeth
Label: Take a small amount of margarine and rub on the sap
Result: Take a small amount of margarine and rub on the sap
----------
Why are backpack waist straps so long?
Label: The most backpacks have only one size for everyone
Result: The most backpacks have only one size for everyone
----------
What can I do to prevent altitude sickness?
Label: acclimate
Result: acclimate
----------
What group of people call themselves "Outdoor Influencers", and what do they do regarding natural areas of land?
Label: raise awareness for important causes to protect these lands
Result: raise awareness for important causes to protect these lands
----------
When to sharpen crampons?
Label: when I am expecting icy conditions
Result: when I am expecting icy conditions
----------
What is the benefit to telemark skiing?
Label: allow skiers to skin up back-country slopes with a more natural and efficient stride
Result: more natural and efficient stride
----------
What do you do

In [None]:
#This is an illustration of grid search.  For the Transformers builtin, see https://huggingface.co/transformers/main_classes/trainer.html#transformers.Trainer.hyperparameter_search

from transformers import RobertaForQuestionAnswering, TrainingArguments, Trainer, default_data_collator
import torch

def grid_search_finetuning(tokenized_datasets, epochs=None, batches=None, warmups=None):
    """Fine-tune the QA model once per hyperparameter combination and print the losses.

    Every combination of epoch count, training batch size and warmup steps is trained
    from a freshly loaded 'deepset/roberta-base-squad2' checkpoint on the 'train'
    split (with per-epoch evaluation on 'test'), then evaluated on the 'validation'
    split. Logs and results go to per-combination directories under the module-level
    ``path``. Relies on the module-level ``tokenizer`` and ``path``.

    Args:
        tokenized_datasets: Mapping with 'train', 'test' and 'validation' splits.
        epochs: Iterable of epoch counts to try. Defaults to ``[4]``.
        batches: Iterable of per-device training batch sizes. Defaults to ``[16, 18]``.
        warmups: Iterable of warmup step counts. Defaults to ``[50, 250, 500]``.

    Returns:
        None. Results are reported through ``print`` only.
    """
    import itertools  # local import so this cell does not depend on the top import cell

    epochs = [4] if epochs is None else epochs
    batches = [16, 18] if batches is None else batches
    warmups = [50, 250, 500] if warmups is None else warmups

    # product() iterates with epochs outermost and warmups innermost, matching the
    # order of the equivalent nested loops.
    for epoch, batch, warmup in itertools.product(epochs, batches, warmups):
        # Reload the pretrained checkpoint so each combination starts from the same weights.
        model = RobertaForQuestionAnswering.from_pretrained('deepset/roberta-base-squad2')
        name = '_'.join(['epochs',str(epoch),'batchsize',str(batch),'warmup',str(warmup)])

        print('-----------------------------------------------')
        print()
        print(name)
        print()

        training_args = TrainingArguments(
            evaluation_strategy = "epoch",                         # evaluate loss per epoch
            num_train_epochs=epoch,                                # total # of training epochs
            per_device_train_batch_size=batch,                     # batch size per device during training
            per_device_eval_batch_size=64,                         # batch size for evaluation
            warmup_steps=warmup,                                   # number of warmup steps for learning rate scheduler
            weight_decay=0.01,                                     # strength of weight decay
            logging_dir=path+'data/questionanswering/logs_'+name,  # directory for storing logs
            output_dir=path+'data/questionanswering/results_'+name # output directory
        )

        trainer = Trainer(
            model=model,                                          # the instantiated 🤗 Transformers model to be trained
            args=training_args,                                   # training arguments, defined above
            data_collator=default_data_collator,
            tokenizer=tokenizer,
            train_dataset=tokenized_datasets['train'],            # training dataset
            eval_dataset=tokenized_datasets['test']               # evaluation dataset
        )

        training_outputs = trainer.train()
        # Evaluate BEFORE reporting: the evaluation loss used to be printed ahead of
        # this assignment, which raised UnboundLocalError on the first combination.
        evaluation_outputs = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])
        print('Training Loss:',training_outputs.training_loss)
        print('Evaluation Loss:',evaluation_outputs['eval_loss'])
        print(training_outputs)
        print(evaluation_outputs)

        # Drop the references so the next combination does not keep this model alive.
        del trainer
        del model

# Run the grid search over the tokenized splits; each combination loads the pretrained
# checkpoint again and fine-tunes it, so this cell takes several training runs to finish.
grid_search_finetuning(tokenized_datasets)