In [None]:
import json

## Read Dataset (Archive)

In [None]:
from collections.abc import Iterable

# To return a string with the contexts concatenated

def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

In [None]:
# To convert data into 3 lists: contexts, questions and answers respectively
# NOTE: Some questions would be inadvertenly be removed becuase they contain yes/no answers and will hence, be removed from the corpus

def read_data(path):
    with open(path, 'r') as f:
      data = json.load(f)
    
    contexts = []
    questions = []
    answers = []
    
    for group in data:
        # Removing yes/no questions not found in the context
        if "yes" in group['answer'] or "no" in group['answer']:
            continue
        contexts.append(''.join(flatten(group['context'])))
        questions.append(group['question'])
        answers.append(group['answer'])
        
    return contexts, questions, answers

In [None]:
train_contexts, train_questions, train_answers = read_data('train_set.json')
val_contexts, val_questions, val_answers = read_data('dev_set.json')

FileNotFoundError: ignored

In [None]:
print(len(train_contexts))

🔦 Entire train dataset has length 83159, let's only use 2500 for initial coding 

In [None]:
train_contexts, train_questions, train_answers = train_contexts[:2500], train_questions[:2500], train_answers[:2500]
val_contexts, val_questions, val_answers = val_contexts[:2500], val_questions[:2500], val_answers[:2500]

In [None]:
# The DistilBERT takes in input answer_start (start index of answer in context) and answer_end (end index of answer in context).
# Return a list of dictionaries with {answer_text:answerstring, answer_start: start_idx, answer_end: end_idx}

def generate_answer_index(answers, contexts):
    out = []
    for answer, context in zip(answers,contexts):
        gold_text = answer
        start_idx = context.find(answer)
        # There are some yes/no answers not found in the context
        if start_idx == -1:
            print(answer)
        end_idx = start_idx + len(gold_text)
        if context[start_idx:end_idx] == gold_text:
            out.append({'text':answer, 'answer_start':start_idx, 'answer_end':end_idx})
        else:
            for n in [1,2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                    out.append({'answer_text':answer, 'answer_start':start_idx, 'answer_end':end_idx})
    return out

In [None]:
train_answers_dict = generate_answer_index(train_answers,train_contexts)
val_answers_dict = generate_answer_index(val_answers,val_contexts)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("drive/My Drive/natural-language-questions-dataset")

## Set Constants, Import Packages

In [1]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m123.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [2]:
ARCHITECTURE_NAME = 'distilbert-base-uncased'
max_length = 500 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

In [3]:
from transformers import AutoTokenizer

# Tokenise input using DistilBERT tokeniser from pretrained 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(ARCHITECTURE_NAME)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Detour to understand tokenizer and data processing

Tokenizers are not commutative

In [None]:
tokenizer("What is your name","Hi")

{'input_ids': [101, 2054, 2003, 2115, 2171, 102, 7632, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
tokenizer("Hi", "What is your name")

{'input_ids': [101, 7632, 102, 2054, 2003, 2115, 2171, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
max_length = 512 # The maximum length of a feature (question and context)
doc_stride = 128 # The authorized overlap between two part of the context when splitting it is needed.

Using first dataset as an example of a set of question and contexts that exceed the maximum length of a feature

In [None]:
print(train_contexts[0])

Radio City (Indian radio station)Radio City is India's first private FM radio station and was started on 3 July 2001. It broadcasts on 91.1 (earlier 91.0 in most cities) megahertz from Mumbai (where it was started in 2004), Bengaluru (started first in 2001), Lucknow and New Delhi (since 2003). It plays Hindi, English and regional songs. It was launched in Hyderabad in March 2006, in Chennai on 7 July 2006 and in Visakhapatnam October 2007. Radio City recently forayed into New Media in May 2008 with the launch of a music portal - PlanetRadiocity.com that offers music related news, videos, songs, and other music-related features. The Radio station currently plays a mix of Hindi and Regional music. Abraham Thomas is the CEO of the company.History of Albanian footballFootball in Albania existed before the Albanian Football Federation (FSHF) was created. This was evidenced by the team's registration at the Balkan Cup tournament during 1929-1931, which started in 1929 (although Albania event

In [None]:
print(len(train_contexts[0]))
print(len(train_questions[0]))
print(f'Token of {len(tokenizer(train_contexts[0], train_questions[0])["input_ids"])} is longer than the allowed length of 384')

5000
70
Token of 1075 is longer than the allowed length of 384


In [None]:
# Since max model input is 512
# tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)
# creates padding to max sequence in batch which is also maximum size of the model
# only_second makes sure that only context gets truncated, not the questions input

# only_second context is truncated
# padding to ensure each is 512 length
# return_overflowing_tokens = ensure that overflowing tokens are treated as new features
# stride = allowed overlap for context that is split
tokenized_example = tokenizer(
    train_questions[0],
    train_contexts[0],
    max_length=max_length,
    truncation="only_second",
    padding=True,
    return_overflowing_tokens=True,
    stride=doc_stride,
    return_offsets_mapping=True,
)

In [None]:
[len(x) for x in tokenized_example["input_ids"]]

[512, 512, 512]

In [None]:
for x in tokenized_example["input_ids"]:
    print(tokenizer.decode(x))

[CLS] which magazine was started first arthur's magazine or first for women? [SEP] radio city ( indian radio station ) radio city is india's first private fm radio station and was started on 3 july 2001. it broadcasts on 91. 1 ( earlier 91. 0 in most cities ) megahertz from mumbai ( where it was started in 2004 ), bengaluru ( started first in 2001 ), lucknow and new delhi ( since 2003 ). it plays hindi, english and regional songs. it was launched in hyderabad in march 2006, in chennai on 7 july 2006 and in visakhapatnam october 2007. radio city recently forayed into new media in may 2008 with the launch of a music portal - planetradiocity. com that offers music related news, videos, songs, and other music - related features. the radio station currently plays a mix of hindi and regional music. abraham thomas is the ceo of the company. history of albanian footballfootball in albania existed before the albanian football federation ( fshf ) was created. this was evidenced by the team's reg

In [None]:
print(tokenized_example["offset_mapping"][0][:100])
print(len(tokenized_example["offset_mapping"][0]))
print(tokenized_example["offset_mapping"][1][:100])



[(0, 0), (0, 5), (6, 14), (15, 18), (19, 26), (27, 32), (33, 39), (39, 40), (40, 41), (42, 50), (51, 53), (54, 59), (60, 63), (64, 69), (69, 70), (0, 0), (0, 5), (6, 10), (11, 12), (12, 18), (19, 24), (25, 32), (32, 33), (33, 38), (39, 43), (44, 46), (47, 52), (52, 53), (53, 54), (55, 60), (61, 68), (69, 71), (72, 77), (78, 85), (86, 89), (90, 93), (94, 101), (102, 104), (105, 106), (107, 111), (112, 116), (116, 117), (118, 120), (121, 131), (132, 134), (135, 137), (137, 138), (138, 139), (140, 141), (141, 148), (149, 151), (151, 152), (152, 153), (154, 156), (157, 161), (162, 168), (168, 169), (170, 174), (174, 177), (177, 179), (180, 184), (185, 191), (192, 193), (193, 198), (199, 201), (202, 205), (206, 213), (214, 216), (217, 221), (221, 222), (222, 223), (224, 230), (230, 233), (234, 235), (235, 242), (243, 248), (249, 251), (252, 256), (256, 257), (257, 258), (259, 266), (267, 270), (271, 274), (275, 280), (281, 282), (282, 287), (288, 292), (292, 293), (293, 294), (295, 297), (2

Offset mapping is a list of tuples where each tuple is representative of a word in the original text (start_index, end_index)

In [None]:
#(0,5) refers to radio and (6,10) refers to city
first_token_id = tokenized_example["input_ids"][0][1]
offsets1 = tokenized_example["offset_mapping"][0][1]
print(f'{tokenizer.convert_ids_to_tokens([first_token_id])[0]} should be the same as {train_contexts[0][offsets1[0]:offsets1[1]]}')


which should be the same as Radio


In [None]:
# Note that in the sequence_ids() there is this follow values
# .... , 0, 0, None, 1, 1, 1, ...
# where 0,0... refers to the question and 1, 1, 1 refers to the context which has not been truncated
sequence_ids = tokenized_example.sequence_ids()
print(sequence_ids)

[None, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, None, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
print(train_answers[0])
print(train_contexts[0][3013:3020])

Arthur's Magazine
m.Arthu


In [None]:
answers = train_answers
start_char = train_contexts[0].find(train_answers[0])
end_char = start_char + len(train_answers[0])

# Start token index of the current span in the text.
token_start_index = 0
while sequence_ids[token_start_index] != 1:
    token_start_index += 1

# End token index of the current span in the text.
token_end_index = len(tokenized_example["input_ids"][0]) - 1
while sequence_ids[token_end_index] != 1:
    token_end_index -= 1

# Since offsetmapping is a list of offset mapping of len 512
# iterate over each offset_mapping
# see if the answer is found in any of the tokens
# as shown below, it is found in the 2nd token at index 277
for i in range(len(tokenized_example["offset_mapping"])):
  offsets = tokenized_example["offset_mapping"][i]
  if (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
      # Move the token_start_index and token_end_index to the two ends of the answer.
      # Note: we could go after the last offset if the answer is the last word (edge case).
      while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
          token_start_index += 1
      start_position = token_start_index - 1
      while offsets[token_end_index][1] >= end_char:
          token_end_index -= 1
      end_position = token_end_index + 1
      print(start_position, end_position)
  else:
      print("The answer is not in this feature.")

The answer is not in this feature.
277 280
The answer is not in this feature.


In [None]:
print(tokenizer.decode(tokenized_example["input_ids"][1][start_position: end_position+1]))
print(train_answers[0])

arthur's magazine
Arthur's Magazine


In [None]:
pad_on_right = tokenizer.padding_side == "right"

In [None]:
def update_train_answers(answers, contexts):
    out = []
    for answer, context in zip(answers,contexts):
        gold_text = answer
        start_idx = context.find(answer)
        # There are some yes/no answers not found in the context
        if start_idx == -1:
            out.append({'text':[answer], 'answer_start':[]})
            continue
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            out.append({'text':[answer], 'answer_start':[start_idx]})
        else:
            for n in [1,2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                  out.append({'text':[answer], 'answer_start':[start_idx]})
                else:
                  out.append({'text':[answer], 'answer_start':[]})
    return out


In [None]:
train_answers2 = update_train_answers(train_answers,train_contexts)

In [None]:
print(len(train_answers2))
print(train_answers2[1])

2500
{'text': ['Delhi'], 'answer_start': [3177]}


In [None]:
def prepare_train_features(question, context, answer):
    examples = {}
    examples["question"], examples["context"], examples["answers"] = question, context, answer
    
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [None]:
features = prepare_train_features(train_questions[:1],train_contexts[:1],train_answers2[:1])

In [None]:
print(features)
print(features.keys())

{'input_ids': [[101, 2029, 2932, 2001, 2318, 2034, 4300, 1005, 1055, 2932, 2030, 2034, 2005, 2308, 1029, 102, 2557, 2103, 1006, 2796, 2557, 2276, 1007, 2557, 2103, 2003, 2634, 1005, 1055, 2034, 2797, 4718, 2557, 2276, 1998, 2001, 2318, 2006, 1017, 2251, 2541, 1012, 2009, 8960, 2006, 6205, 1012, 1015, 1006, 3041, 6205, 1012, 1014, 1999, 2087, 3655, 1007, 13164, 5886, 5753, 2013, 8955, 1006, 2073, 2009, 2001, 2318, 1999, 2432, 1007, 1010, 8191, 14129, 1006, 2318, 2034, 1999, 2541, 1007, 1010, 23571, 1998, 2047, 6768, 1006, 2144, 2494, 1007, 1012, 2009, 3248, 9269, 1010, 2394, 1998, 3164, 2774, 1012, 2009, 2001, 3390, 1999, 13624, 1999, 2233, 2294, 1010, 1999, 12249, 2006, 1021, 2251, 2294, 1998, 1999, 9425, 15256, 4502, 2102, 13129, 2255, 2289, 1012, 2557, 2103, 3728, 2005, 4710, 2098, 2046, 2047, 2865, 1999, 2089, 2263, 2007, 1996, 4888, 1997, 1037, 2189, 9445, 1011, 4774, 12173, 3695, 12972, 1012, 4012, 2008, 4107, 2189, 3141, 2739, 1010, 6876, 1010, 2774, 1010, 1998, 2060, 2189, 1011,

Output of features is a list.
This list contains tokens.
<br>
Each token has input_ids -> questions + answers where return_offsets_mapping distinguishes the question and answer tokens
<br>
Start and end positions give the tokens for the start and end positions

In [None]:
features = prepare_train_features(train_questions[:1],train_contexts[:1],train_answers2[:1])

2500


In [None]:
features = prepare_train_features(train_questions,train_contexts,train_answers2)

## Preprocessing

In [4]:
! pip install datasets transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting urllib3<1.27,>=1.21.1
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 KB[0m [31

In [5]:
from datasets import load_dataset, load_metric

In [6]:
BATCH_SIZE = 16

In [7]:
datasets = load_dataset("hotpot_qa","fullwiki")

Downloading builder script:   0%|          | 0.00/6.42k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.93k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.21k [00:00<?, ?B/s]

Downloading and preparing dataset hotpot_qa/fullwiki to /root/.cache/huggingface/datasets/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/566M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/90447 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7405 [00:00<?, ? examples/s]

Dataset hotpot_qa downloaded and prepared to /root/.cache/huggingface/datasets/hotpot_qa/fullwiki/1.0.0/133b9501f892e5193babbad937bee3b4899deb4691ef4d791e6ac0111c875bb5. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [8]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
})

In [9]:
# print(type(datasets["train"]['context'][0]))
# print((datasets["train"]['context'][0]).keys())
# print(datasets["train"]['context'][0])
# print(datasets["train"]['answer'][0])
# print(datasets["train"]['question'][0])

# values = ' '.join(map(str, datasets["train"]['context'][0].values()))
# print(values)
# print(type(values))

In [10]:
pad_on_right = tokenizer.padding_side == "right"

In [11]:
def update_train_answers(answers, contexts):
    out = []
    for answer, context in zip(answers,contexts):
        gold_text = answer
        if answer is None:
            out.append({'text':[""], 'answer_start':[]})
            continue
        start_idx = context.find(gold_text)
        # There are some yes/no answers not found in the context
        if start_idx == -1:
            out.append({'text':[answer], 'answer_start':[]})
            continue
        end_idx = start_idx + len(gold_text)

        if context[start_idx:end_idx] == gold_text:
            out.append({'text':[answer], 'answer_start':[start_idx]})
        else:
            for n in [1,2]:
                if context[start_idx-n:end_idx-n] == gold_text:
                  out.append({'text':[answer], 'answer_start':[start_idx]})
                else:
                  out.append({'text':[answer], 'answer_start':[]})
    return out

In [12]:
def prepare_train_features(examples):
  
    questions, contexts, answers = examples["question"], examples["context"], examples["answer"]
    # TODO: Contexts are in dictionary form and needs to be converted into list
    for i in range(len(contexts)):
        values = ' '.join(map(str, contexts[i].values()))
        contexts[i] = values
    print("contexts",contexts[0])
    print(type(contexts[0]))

    # TODO: Answers are only in text and need start_id
    answers = update_train_answers(answers, contexts)
    # print("answers",answers[0])
    # print(type(answers[0]))

    examples["question"], examples["context"], examples["answer"] = questions, contexts, answers
    # print("example",type(examples["answer"][0]))

    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answer"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [13]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 90447
    })
    validation: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'],
        num_rows: 7405
    })
})

In [14]:
datasets[0]

KeyError: ignored

In [None]:
len(datasets)
type(datasets)
len(datasets["input_ids"])

In [None]:
# tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets = datasets.map(prepare_train_features, batched=True)

# Fine-tuning the model

In [None]:
# Checking cuda version
# ! nvcc -V

In [None]:
! pip install -U transformers

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
import torch 

if torch.cuda.is_available():
  print("cuda available")
  device = torch.device("cuda") 
  model = AutoModelForQuestionAnswering.from_pretrained(ARCHITECTURE_NAME)
  model.to(device)  
else:
  model = AutoModelForQuestionAnswering.from_pretrained(ARCHITECTURE_NAME)



Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

Trainer parameters: https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.TrainingArguments

In [None]:
# ! pip install transformers datasets accelerate nvidia-ml-py3

In [None]:
MODEL_NAME = ARCHITECTURE_NAME.split("/")[-1]
args = TrainingArguments(
    f"{MODEL_NAME}-finetuned-hotpotqa",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
    no_cuda = True,
    gradient_accumulation_steps = 10
    )

In [None]:
# from pynvml import *


# def print_gpu_utilization():
#     nvmlInit()
#     handle = nvmlDeviceGetHandleByIndex(0)
#     info = nvmlDeviceGetMemoryInfo(handle)
#     print(f"GPU memory occupied: {info.used//1024**2} MB.")


# def print_summary(result):
#     print(f"Time: {result.metrics['train_runtime']:.2f}")
#     print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
#     print_gpu_utilization()

# print_gpu_utilization(). # GPU memory occupied: 842 MB.

In [None]:
# device = "cuda:0"
# inputs    = tokenizer(sentence, return_tensors="pt").to(device)
# model     = model.to(device)
# outputs   = model(**inputs)

In [None]:
from transformers import default_data_collator

data_collator = default_data_collator

In [None]:
# print(type(tokenized_datasets["train"])) # <class 'datasets.arrow_dataset.Dataset'>
# print(len(tokenized_datasets["train"])) # 368979

In [None]:
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
    )

NameError: ignored

In [None]:
trainer.train()

NameError: ignored

In [None]:
trainer.save_model("distilbert-bacse-uncased-hotspotqa")

# Detour to understand evaluation

Answers of the model returns us the start and end indices as the answers of the model. We need to map this back to the actual answer

In [None]:
import torch

for batch in trainer.get_eval_dataloader():
    break
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)
output.keys()

In [None]:
output.start_logits.shape, output.end_logits.shape

In [None]:
output.start_logits.argmax(dim=-1), output.end_logits.argmax(dim=-1)

In [None]:
# Choose only n best start and end indices predictions by the model
n_best_size = 5

In [None]:
def prepare_validation_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [None]:
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)

In [None]:
raw_predictions = trainer.predict(validation_features)

In [None]:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))

In [None]:
max_answer_length = 30

In [None]:
start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to be match the example_id to
# an example index
context = datasets["validation"][0]["context"]

# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        if start_index <= end_index: # We need to refine that test to check the answer is inside the context
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char: end_char]
                }
            )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers

In [None]:
datasets["validation"][0]["answers"]

In [None]:
import collections

examples = datasets["validation"]
features = validation_features

example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
features_per_example = collections.defaultdict(list)
for i, feature in enumerate(features):
    features_per_example[example_id_to_index[feature["example_id"]]].append(i)

In [None]:
from tqdm.auto import tqdm

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    # Build a map example to its corresponding features.
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    # The dictionaries we have to fill.
    predictions = collections.OrderedDict()

    # Logging.
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
    for example_index, example in enumerate(tqdm(examples)):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        min_null_score = None # Only used if squad_v2 is True.
        valid_answers = []
        
        context = example["context"]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map some the positions in our logits to span of texts in the original
            # context.
            offset_mapping = features[feature_index]["offset_mapping"]

            # Update minimum null prediction.
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
                    # to part of the input_ids that are not in the context.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}
        
        # Let's pick our final answer: the best one or the null answer (only for squad_v2)
        if not squad_v2:
            predictions[example["id"]] = best_answer["text"]
        else:
            answer = best_answer["text"] if best_answer["score"] > min_null_score else ""
            predictions[example["id"]] = answer

    return predictions