# Setup

In [1]:
import transformers
import datasets
from datasets import load_dataset
import evaluate
import torch
import numpy as np

2026-01-14 21:25:27.613869: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
print("Transformers version:", transformers.__version__)
print("Pytorch version:", torch.__version__)
print("Dataset version:", datasets.__version__)
print("Evaluate version: ", evaluate.__version__)
!python -V

Transformers version: 4.57.3
Pytorch version: 2.2.2
Dataset version: 4.0.0
Evaluate version:  0.4.6
Python 3.11.0


# Loading the model and tokenizer

In [3]:
from transformers import AutoTokenizer
from transformers import DistilBertForQuestionAnswering

trained_checkpoint = "distilbert/distilbert-base-uncased"
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(trained_checkpoint)
model = DistilBertForQuestionAnswering.from_pretrained(trained_checkpoint).to(device)

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load dataset

In [4]:
dataset = load_dataset("squad")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

## Visualize some examples from the SQuAD dataset

In [5]:
# to make text bold
s_bold = '\033[1m'
e_bold = '\033[0;0m'

print(s_bold + 'Train Data Sample.....' + e_bold)
train_data = dataset["train"]
for data in train_data:
    print(' ')
    print(s_bold + 'ID -' + e_bold, data['id'])
    print(s_bold +'TITLE - '+ e_bold, data['title'])
    print(s_bold + 'CONTEXT - '+ e_bold,data['context'])
    print(s_bold + 'QUESTION - '+ e_bold,data['question'])
    print(s_bold + 'ANSWERS - ' + e_bold,data['answers']['text'])
    print(s_bold + 'ANSWERS START INDEX - ' + e_bold,data['answers']['answer_start'])
    print(' ')
    break
    
print('---'*30)   
print(s_bold + 'Validation Data Sample.....' + e_bold)
train_data = dataset["validation"]
for data in train_data:
    print(' ')
    print(s_bold + 'ID -' + e_bold, data['id'])
    print(s_bold +'TITLE - '+ e_bold, data['title'])
    print(s_bold + 'CONTEXT - '+ e_bold,data['context'])
    print(s_bold + 'QUESTION - '+ e_bold,data['question'])
    print(s_bold + 'ANSWERS - ' + e_bold,data['answers']['text'])
    print(s_bold + 'ANSWERS START INDEX - ' + e_bold,data['answers']['answer_start'])
    print(' ')
    break

[1mTrain Data Sample.....[0;0m
 
[1mID -[0;0m 5733be284776f41900661182
[1mTITLE - [0;0m University_of_Notre_Dame
[1mCONTEXT - [0;0m Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
[1mQUESTION - [0;0m To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
[1mANSWERS - [0;0m ['Saint Bernadette Soubirous']
[1mANSWERS START 

# Data Exploration

In [6]:
dataset["train"].filter(lambda x: len(x["answers"]["text"]) != 1)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 0
})

In [7]:
dataset["validation"].filter(lambda x: len(x["answers"]["text"]) != 1)

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10567
})

In [8]:
## Lets sample some dataset so that we can reduce training time.
dataset["train"] = dataset["train"].select([i for i in range(10000)])
dataset["validation"] = dataset["validation"].select([i for i in range(2000)])
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 2000
    })
})

## Data pre-processing

### Handling long contexts

In [9]:
context = dataset["train"][0]["context"]
question = dataset["train"][0]["question"]
answer = dataset["train"][0]["answers"]["text"]

inputs = tokenizer(
    question,
    context,
    max_length=160,
    truncation="only_second",  # only to truncate context
    stride=70,  # no of overlapping tokens  between concecute context pieces
    return_overflowing_tokens=True,  #to let tokenizer know we want overflow tokens
    return_offsets_mapping=True,
)

print(f"This example was split into {len(inputs['input_ids'])} chunks/features.")
print(f"Here is where each comes from: {inputs['overflow_to_sample_mapping']}.")

print('Question: ',question)
print(' ')
print('Context : ',context)
print(' ')
print('Answer: ', answer)
print('--'*25)

for i, ids in enumerate(inputs["input_ids"]):
    print('Context piece', i+1)
    print(f'Context piece {i+1} has length {len(ids)} tokens.')
    print(tokenizer.decode(ids[ids.index(102):]))
    print(' ')

This example was split into 2 chunks/features.
Here is where each comes from: [0, 0].
Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
 
Context :  Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
 
Answer:  ['Saint Bernadette Soubirous']
--------------------------------------------------
Context piece 1
Context piece

### Understanding Tokenizer Attributes

Let's explore the key attributes returned by the tokenizer that are crucial for QA tasks:

In [10]:
# Let's examine one chunk/feature in detail
chunk_idx = 0

print("=" * 80)
print(f"EXAMINING CHUNK {chunk_idx}")
print("=" * 80)

# 1. Input IDs - the tokenized representation
print("\n1. INPUT IDS:")
print(f"   Token IDs: {inputs['input_ids'][chunk_idx][:20]}...")  # showing first 20
print(f"   Total tokens: {len(inputs['input_ids'][chunk_idx])}")

# Decode to see the actual text
decoded_text = tokenizer.decode(inputs['input_ids'][chunk_idx])
print(f"   Decoded text: {decoded_text[:150]}...")

# 2. Offset Mapping - character positions in original text
print("\n2. OFFSET MAPPING (Character positions):")
print(f"   Maps each token to its character position in the COMBINED question+context text")
print(f"   First 10 token offsets: {inputs['offset_mapping'][chunk_idx][:10]}")

print(f"\n   Detailed Token -> Character -> Original Text mapping:")
print(f"   (Showing tokens 0-15)")
print(f"   {'Pos':>4} | {'Token':^15} | {'Char Span':^12} | Original Text Segment")
print(f"   {'-'*4}-+-{'-'*15}-+-{'-'*12}-+-{'-'*40}")

for i in range(min(30, len(inputs['input_ids'][chunk_idx]))):
    token_id = inputs['input_ids'][chunk_idx][i]
    token = tokenizer.decode([token_id])
    char_span = inputs['offset_mapping'][chunk_idx][i]
    seq_id = inputs.sequence_ids(chunk_idx)[i]

    if seq_id == 0 or seq_id is None:
        original_text_segment = question[char_span[0]:char_span[1]]
    elif seq_id == 1:
        original_text_segment = context[char_span[0]:char_span[1]]
    else:
        pass
    
    span_str = f"({char_span[0]},{char_span[1]})"
    print(f"   {i:>4} | {token:^15} | {span_str:^12} | '{original_text_segment}'")

# 3. Overflow to Sample Mapping
print("\n3. OVERFLOW_TO_SAMPLE_MAPPING:")
print(f"   Shows which original example each chunk came from")
print(f"   All chunks: {inputs['overflow_to_sample_mapping']}")
print(f"   Chunk {chunk_idx} came from original example: {inputs['overflow_to_sample_mapping'][chunk_idx]}")

# 4. Sequence IDs - which part is question vs context
print("\n4. SEQUENCE_IDS (Question=0, Context=1, Special=None):")
sequence_ids = inputs.sequence_ids(chunk_idx)
print(f"   Sequence IDs: {sequence_ids[:30]}...")

# Find boundaries
question_tokens = sum(1 for sid in sequence_ids if sid == 0)
context_tokens = sum(1 for sid in sequence_ids if sid == 1)
special_tokens = sum(1 for sid in sequence_ids if sid is None)

print(f"   Question tokens (0): {question_tokens}")
print(f"   Context tokens (1): {context_tokens}")
print(f"   Special tokens (None): {special_tokens}")

# Show the breakdown visually with original text
print("\n   Visual breakdown with original text:")
print(f"   {'Pos':>4} | {'Type':^8} | {'Token':^15} | {'Char Span':^12}")
print(f"   {'-'*4}-+-{'-'*8}-+-{'-'*15}-+-{'-'*12}-+-{'-'*30}")

for i in range(min(25, len(sequence_ids))):
    token = tokenizer.decode([inputs['input_ids'][chunk_idx][i]])
    seq_id = sequence_ids[i]
    char_span = inputs['offset_mapping'][chunk_idx][i]
    
    if seq_id is None:
        label = "SPECIAL"
    elif seq_id == 0:
        label = "QUESTION"
    else:
        label = "CONTEXT"
    
    span_str = f"({char_span[0]},{char_span[1]})"
    print(f"   {i:>4} | {label:^8} | {token:^15} | {span_str:^12}'")

print("\n" + "=" * 80)

EXAMINING CHUNK 0

1. INPUT IDS:
   Token IDs: [101, 2000, 3183, 2106, 1996, 6261, 2984, 9382, 3711, 1999, 8517, 1999, 10223, 26371, 2605, 1029, 102, 6549, 2135, 1010]...
   Total tokens: 160
   Decoded text: [CLS] to whom did the virgin mary allegedly appear in 1858 in lourdes france? [SEP] architecturally, the school has a catholic character. atop the mai...

2. OFFSET MAPPING (Character positions):
   Maps each token to its character position in the COMBINED question+context text
   First 10 token offsets: [(0, 0), (0, 2), (3, 7), (8, 11), (12, 15), (16, 22), (23, 27), (28, 37), (38, 44), (45, 47)]

   Detailed Token -> Character -> Original Text mapping:
   (Showing tokens 0-15)
    Pos |      Token      |  Char Span   | Original Text Segment
   -----+-----------------+--------------+-----------------------------------------
      0 |      [CLS]      |    (0,0)     | ''
      1 |       to        |    (0,2)     | 'To'
      2 |      whom       |    (3,7)     | 'whom'
      3 |      

In [11]:
# Visualizing how stride and max_length create overlapping chunks
print("UNDERSTANDING STRIDE AND OVERLAPPING CHUNKS")
print("=" * 80)
print(f"max_length: {160}")
print(f"stride: {70} tokens")
print(f"Number of chunks created: {len(inputs['input_ids'])}\n")

# Show the overlap between consecutive chunks using offset_mapping
if len(inputs['input_ids']) > 1:
    chunk1_tokens = inputs['input_ids'][0]
    chunk2_tokens = inputs['input_ids'][1]
    
    # Get offset mappings and sequence IDs for both chunks
    chunk1_offsets = inputs['offset_mapping'][0]
    chunk2_offsets = inputs['offset_mapping'][1]
    chunk1_seq_ids = inputs.sequence_ids(0)
    chunk2_seq_ids = inputs.sequence_ids(1)
    
    # Extract context tokens (sequence_id == 1) with their offsets
    chunk1_context = []
    for idx, (token_id, offset, seq_id) in enumerate(zip(chunk1_tokens, chunk1_offsets, chunk1_seq_ids)):
        if seq_id == 1 and token_id != 0:  # Context tokens, not padding
            chunk1_context.append({
                'token_idx': idx,
                'token_id': token_id,
                'char_start': offset[0],
                'char_end': offset[1],
                'token_text': tokenizer.decode([token_id])
            })
    
    chunk2_context = []
    for idx, (token_id, offset, seq_id) in enumerate(zip(chunk2_tokens, chunk2_offsets, chunk2_seq_ids)):
        if seq_id == 1 and token_id != 0:  # Context tokens, not padding
            chunk2_context.append({
                'token_idx': idx,
                'token_id': token_id,
                'char_start': offset[0],
                'char_end': offset[1],
                'token_text': tokenizer.decode([token_id])
            })
    
    print(f"CHUNK 1 - Context has {len(chunk1_context)} tokens")
    print(f"CHUNK 2 - Context has {len(chunk2_context)} tokens\n")
    
    # Find overlapping tokens by comparing character offsets
    # Tokens overlap if they reference the same character positions in the original text
    overlapping_tokens = []
    for token1 in chunk1_context:
        for token2 in chunk2_context:
            # Check if tokens overlap based on character positions
            if (token1['char_start'] == token2['char_start'] and 
                token1['char_end'] == token2['char_end']):
                overlapping_tokens.append({
                    'chunk1_idx': token1['token_idx'],
                    'chunk2_idx': token2['token_idx'],
                    'token_id': token1['token_id'],
                    'token_text': token1['token_text'],
                    'char_pos': f"{token1['char_start']}-{token1['char_end']}"
                })
                break
    
    print(f"ACTUAL OVERLAPPING TOKENS: {len(overlapping_tokens)} tokens\n")
    
    if overlapping_tokens:
        # Display overlapping tokens
        display_count = min(20, len(overlapping_tokens))
        
        print(f"Showing {display_count} of {len(overlapping_tokens)} overlapping tokens:")
        print(f"{'Chunk1 Pos':^12} | {'Chunk2 Pos':^12} | {'Token ID':^10} | {'Char Pos':^12} | Token Text")
        print("-" * 80)
        
        for i, token_info in enumerate(overlapping_tokens[:display_count]):
            print(f"{token_info['chunk1_idx']:^12} | {token_info['chunk2_idx']:^12} | "
                  f"{token_info['token_id']:^10} | {token_info['char_pos']:^12} | '{token_info['token_text']}'")
        
        if len(overlapping_tokens) > display_count:
            print(f"... and {len(overlapping_tokens) - display_count} more overlapping tokens")
        
        # Show the decoded text of overlapping portion
        print("\n" + "="*80)
        print("DECODED OVERLAPPING TEXT:")
        overlap_token_ids = [t['token_id'] for t in overlapping_tokens]
        overlap_text = tokenizer.decode(overlap_token_ids)
        print(f"{overlap_text}")
        
        print("\nðŸ‘† These tokens appear in BOTH chunks at the same character positions!")
        print("   This sliding window approach ensures answers aren't split across chunks.\n")
    else:
        print("No overlapping tokens found. This might happen if context is very short.\n")

# Demonstrate why offset_mapping is crucial for finding answers
print("\nWHY OFFSET_MAPPING MATTERS FOR QA:")
print("-" * 80)
print("Original answer:", answer)
print(f"Answer starts at character: {dataset['train'][0]['answers']['answer_start'][0]}")

# Find which tokens contain the answer using offset_mapping
answer_start_char = dataset['train'][0]['answers']['answer_start'][0]
answer_end_char = answer_start_char + len(answer[0])

print(f"Answer ends at character: {answer_end_char}")
print(f"Answer span: [{answer_start_char}, {answer_end_char})")
print(f"\nUsing offset_mapping, we can find which TOKENS contain the answer:")

for chunk_idx in range(len(inputs['input_ids'])):
    offsets = inputs['offset_mapping'][chunk_idx]
    sequence_ids = inputs.sequence_ids(chunk_idx)
    
    found_answer_tokens = []
    # Check if answer is in this chunk
    for token_idx, (start_char, end_char) in enumerate(offsets):
        # Skip question tokens and special tokens
        if sequence_ids[token_idx] != 1:
            continue
        
        # CORRECT condition: token overlaps with answer if:
        # token_start < answer_end AND token_end > answer_start
        # This captures ALL tokens that have any overlap with the answer span
        if start_char < answer_end_char and end_char > answer_start_char:
            token = tokenizer.decode([inputs['input_ids'][chunk_idx][token_idx]])
            # Also extract the actual text from context using offset
            token_text_from_context = context[start_char:end_char]
            found_answer_tokens.append({
                'idx': token_idx,
                'token': token,
                'span': (start_char, end_char),
                'context_text': token_text_from_context
            })
    print(found_answer_tokens)
    if found_answer_tokens:
        print(f"\n  Chunk {chunk_idx} - Found {len(found_answer_tokens)} answer tokens:")
        for token_info in found_answer_tokens:
            print(f"    Token {token_info['idx']}: '{token_info['token']}' "
                  f"(chars {token_info['span'][0]}-{token_info['span'][1]}) "
                  f"-> Context: '{token_info['context_text']}'")
        
        # Reconstruct the full answer from tokens
        answer_token_ids = [inputs['input_ids'][chunk_idx][t['idx']] for t in found_answer_tokens]
        reconstructed_answer = tokenizer.decode(answer_token_ids)
        print(f"    Reconstructed answer: '{reconstructed_answer}'")
        print(f"    Original answer:      '{answer[0]}'")
        print(f"    Match: {reconstructed_answer.strip() == answer[0].strip().lower()}")
            
print("\n" + "=" * 80)
print("\nKEY INSIGHT:")
print("The condition 'start_char < answer_end_char AND end_char > answer_start_char'")
print("ensures we capture ALL tokens overlapping with the answer, not just the first")
print("and last tokens. This is crucial for multi-token answers!")
print("=" * 80)

UNDERSTANDING STRIDE AND OVERLAPPING CHUNKS
max_length: 160
stride: 70 tokens
Number of chunks created: 2

CHUNK 1 - Context has 142 tokens
CHUNK 2 - Context has 86 tokens

ACTUAL OVERLAPPING TOKENS: 70 tokens

Showing 20 of 70 overlapping tokens:
 Chunk1 Pos  |  Chunk2 Pos  |  Token ID  |   Char Pos   | Token Text
--------------------------------------------------------------------------------
     89      |      17      |    6730    |   320-326    | 'sacred'
     90      |      18      |    2540    |   327-332    | 'heart'
     91      |      19      |    1012    |   332-333    | '.'
     92      |      20      |    3202    |   334-345    | 'immediately'
     93      |      21      |    2369    |   346-352    | 'behind'
     94      |      22      |    1996    |   353-356    | 'the'
     95      |      23      |   13546    |   357-365    | 'basilica'
     96      |      24      |    2003    |   366-368    | 'is'
     97      |      25      |    1996    |   369-372    | 'the'
     98 

## Data Preparation

### Process the training data

In [12]:
def train_data_preprocess(examples):
    """
    generate start and end indexes of answer in context
    """
    
    def find_context_start_end_index(sequence_ids):
        """
        returns the token index in which context starts and ends
        """
        token_idx = 0
        while sequence_ids[token_idx] != 1:  # means its special tokens or tokens of queston
            token_idx += 1                   # loop only breask when context starts in tokens
        context_start_idx = token_idx
    
        while sequence_ids[token_idx] == 1:
            token_idx += 1
        context_end_idx = token_idx - 1
        return context_start_idx, context_end_idx  
    
    
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]
    answers = examples["answers"]
    
    inputs = tokenizer(
        questions,
        context,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,  #returns id of base context
        return_offsets_mapping=True,  # returns (start_index,end_index) of each token
        padding="max_length"
    )

    start_positions = []
    end_positions = []
    
    for i, mapping_idx_pairs in enumerate(inputs['offset_mapping']):
        context_idx = inputs['overflow_to_sample_mapping'][i]
    
        # from main context
        answer = answers[context_idx]
        answer_start_char_idx = answer['answer_start'][0]
        # print("answer start char idx:", answer_start_char_idx)
        answer_end_char_idx = answer_start_char_idx + len(answer['text'][0])
        # print("answer end char idx:", answer_end_char_idx)

        # now we have to find it in sub contexts
        tokens = inputs['input_ids'][i]
        sequence_ids = inputs.sequence_ids(i)
        # print("sequence ids:", sequence_ids)
   
        # finding the context start and end indexes wrt sub context tokens
        context_start_idx, context_end_idx = find_context_start_end_index(sequence_ids)
        # print("context start idx:", context_start_idx)
        # print("context end idx:", context_end_idx)
    
        # if the answer is not fully inside context label it as (0,0)
        # starting and end index of charecter of full context text
        context_start_char_index = mapping_idx_pairs[context_start_idx][0]
        # print("context start char index:", context_start_char_index)
        context_end_char_index = mapping_idx_pairs[context_end_idx][1]
        # print("context end char index:", context_end_char_index)
    
        # If the answer is not fully inside the context, label is (0, 0)
        if (context_start_char_index > answer_start_char_idx) or (
            context_end_char_index < answer_end_char_idx):
            start_positions.append(0)
            end_positions.append(0)
    
        else:
            # else its start and end token positions
            # here idx indicates index of token
            idx = context_start_idx
            # print("idx:", idx)
            while idx <= context_end_idx and mapping_idx_pairs[idx][0] <= answer_start_char_idx:
                idx += 1
            # print("final idx for start position:", idx)
            start_positions.append(idx - 1)  
        
            idx = context_end_idx
            while idx >= context_start_idx and mapping_idx_pairs[idx][1] > answer_end_char_idx:
                idx -= 1
            # print("final idx for end position:", idx)
            end_positions.append(idx + 1)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs
    
train_sample = dataset["train"].select([i for i in range(100)])

train_dataset = train_sample.map(
    train_data_preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)

len(dataset["train"]), len(train_dataset)

(10000, 100)

In [13]:
train_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
    num_rows: 100
})

### Peek into the processed dataset

In [14]:
def print_context_and_answer(idx):
    
    # original data
    print(idx)
    print('----')
    question = dataset["train"][idx]['question']
    context = dataset["train"][idx]['context']
    answer = dataset["train"][idx]['answers']['text']
    print('Theoretical values :')
    print(' ')
    print('Question: ')
    print(question)
    print(' ')
    print('Context: ')
    print(context)
    print(' ')
    print('Answer: ')
    print(answer)
    print(' ')
    answer_start_char_idx = dataset["train"][idx]['answers']['answer_start'][0]
    answer_end_char_idx = answer_start_char_idx + len(dataset["train"][idx]['answers']['text'][0])
    print('Start and end index of text: ',answer_start_char_idx,answer_end_char_idx)
    print('----'*20)

    # Mapped Data
    print('Values after tokenization:')
    sep_tok_index = train_dataset[idx]['input_ids'].index(102) #get index for [SEP]
    question_ = train_dataset[idx]['input_ids'][:sep_tok_index+1]
    question_decoded = tokenizer.decode(question_) 
    context_ = train_dataset[idx]['input_ids'][sep_tok_index+1:]
    context_decoded = tokenizer.decode(context_) 
    start_idx = train_dataset[idx]['start_positions']
    end_idx = train_dataset[idx]['end_positions']
    answer_toks = train_dataset[idx]['input_ids'][start_idx:end_idx]
    answer_decoded = tokenizer.decode(answer_toks)
    print(' ')
    print('Question: ')
    print(question_decoded)
    print(' ')
    print('Context: ')
    print(context_decoded)
    print(' ')
    print('Answer: ')
    print(answer_decoded)
    print(' ')
    print('Start pos and end pos of tokens: ',train_dataset[idx]['start_positions'],train_dataset[idx]['end_positions'])
    print('____'*20)
    
    
print_context_and_answer(0)
print_context_and_answer(1)
print_context_and_answer(2)
print_context_and_answer(3)

0
----
Theoretical values :
 
Question: 
To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
 
Context: 
Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.
 
Answer: 
['Saint Bernadette Soubirous']
 
Start and end index of text:  515 541
--------------------------------------------------------------------------------
Values after tok

### Process the validation data

In [15]:
def preprocess_validation_examples(examples):
    """
    preprocessing validation data
    """
    questions = [q.strip() for q in examples["question"]]
    context = examples["context"]

    inputs = tokenizer(
        questions,
        context,
        max_length=512,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_map = inputs.pop("overflow_to_sample_mapping")

    base_ids = []

    for i in range(len(inputs["input_ids"])):
        
        # take the base id (ie in cases of overflow happens we get base id)
        base_context_idx = sample_map[i]
        base_ids.append(examples["id"][base_context_idx])
        
        # sequence id indicates the input. 0 for first input and 1 for second input
        # and None for special tokens by default
        sequence_ids = inputs.sequence_ids(i)
        offset = inputs["offset_mapping"][i]
        
        # for Question tokens provide offset_mapping as None
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]

    inputs["base_id"] = base_ids
    return inputs


val_sample = dataset["validation"].select([i for i in range(100)])

eval_set = val_sample.map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset["validation"].column_names,
)
len(eval_set)

100

In [16]:
eval_set

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'base_id'],
    num_rows: 100
})

# Model Performance on eval set

In [17]:
eval_set_for_model = eval_set.remove_columns(["base_id", "offset_mapping"])
print("column names of eval set for model: ", eval_set_for_model.column_names)
eval_set_for_model.set_format("torch")

# Method 1: Stack all examples into a single batch (works for small datasets)
# Need to stack the list of tensors into a single batched tensor
batch = {k: torch.stack([eval_set_for_model[i][k] for i in range(len(eval_set_for_model))]) 
         for k in eval_set_for_model.column_names}

# Move batch to device
batch = {k: v.to(device) for k, v in batch.items()}

print("Batch shapes:")
for k, v in batch.items():
    print(f"  {k}: {v.shape}")

with torch.no_grad():
    outputs = model(**batch)
    
start_logits = outputs.start_logits.cpu().numpy()
end_logits = outputs.end_logits.cpu().numpy()

start_logits.shape, end_logits.shape

column names of eval set for model:  ['input_ids', 'attention_mask']
Batch shapes:
  input_ids: torch.Size([100, 512])
  attention_mask: torch.Size([100, 512])


((100, 512), (100, 512))

## Calculate metrics for un-tuned model

In [18]:
import collections

def predict_answers_and_evaluate(start_logits, end_logits, eval_set, examples):
    """
    make predictions 
    Args:
    start_logits : strat_position prediction logits
    end_logits: end_position prediction logits
    eval_set: processed val data
    examples: unprocessed val data with context text
    """
    # appending all id's corresponding to the base context id
    example_to_features = collections.defaultdict(list)

    for idx, feature in enumerate(eval_set):
        example_to_features[feature["base_id"]].append(idx)

    n_best = 20
    max_answer_length = 30
    predicted_answers = []

    for example in examples:
        example_id = example["id"]
        context = example["context"]
        answers = []

        # looping through each sub contexts corresponding to a context and finding
        # answers
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            
            offsets = eval_set["offset_mapping"][feature_index]
        
            # sorting the predictions of all hidden states and taking best n_best prediction
            # means taking the index of top 20 tokens
            start_indexes = np.argsort(start_logit).tolist()[::-1][:n_best]
            end_indexes = np.argsort(end_logit).tolist()[::-1][:n_best]
        
            for start_index in start_indexes:
                for end_index in end_indexes:
                
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length.
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                       ):
                        continue

                    answers.append({
                        "text": context[offsets[start_index][0] : offsets[end_index][1]],
                        "logit_score": start_logit[start_index] + end_logit[end_index],
                        })


    
        # Select the answer with the best score
        if len(answers) > 0:
            best_answer = max(answers, key=lambda x: x["logit_score"])
            predicted_answers.append(
                {"id": example_id, "prediction_text": best_answer["text"]}
            )
        else:
            predicted_answers.append({"id": example_id, "prediction_text": ""})

    metric = evaluate.load("squad")

    theoretical_answers = [
            {"id": ex["id"], "answers": ex["answers"]} for ex in examples
    ]
    
    metric_ = metric.compute(predictions=predicted_answers, references=theoretical_answers)
    return predicted_answers, metric_

pred_answers,metrics_ = predict_answers_and_evaluate(start_logits, end_logits, eval_set, val_sample)
metrics_

{'exact_match': 0.0, 'f1': 4.876809727267394}

# Training a Question Answering System based on BERT

## Preparing data for training

In [19]:
# lets sample a small dataset
dataset['train'] = dataset['train'].select([i for i in range(10000)])
dataset['validation'] = dataset['validation'].select([i for i in range(1000)])

dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [20]:
from torch.utils.data import DataLoader, Dataset

class DataQA(Dataset):
    def __init__(self, dataset, mode="train"):
        self.mode = mode
        
        if self.mode == "train":
            # sampling
            self.dataset = dataset["train"]
            self.data = self.dataset.map(train_data_preprocess,
                                        batched=True,
                                        remove_columns= dataset["train"].column_names
                                    )
        
        else:
            self.dataset = dataset["validation"]
            self.data = self.dataset.map(preprocess_validation_examples,
                                    batched=True,
                                    remove_columns = dataset["validation"].column_names     
                                 )
            
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):

        out = {}
        example = self.data[idx]
        out['input_ids'] = torch.tensor(example['input_ids'])
        out['attention_mask'] = torch.tensor(example['attention_mask'])
        
        if self.mode == "train":
            out['start_positions'] = torch.unsqueeze(torch.tensor(example['start_positions']), dim=0)
            out['end_positions'] = torch.unsqueeze(torch.tensor(example['end_positions']), dim=0)
            
        return out

In [21]:
train_dataset = DataQA(dataset, mode="train")
val_dataset = DataQA(dataset, mode="validation")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [22]:
print("Train Dataset Sample Shapes:\n")
for i, d in enumerate(train_dataset):
    for k in d.keys():
        print(k + ' : ', d[k].shape)
    print('--'*40)

    if i == 2:
        break
        
print('__'*50)

print("Validation Dataset Sample Lengths:\n")
for i,d in enumerate(val_dataset):
    for k in d.keys():
        print(k + ' : ', d[k].shape)
    print('--'*40)
    
    if i == 2:
        break

Train Dataset Sample Shapes:

input_ids :  torch.Size([512])
attention_mask :  torch.Size([512])
start_positions :  torch.Size([1])
end_positions :  torch.Size([1])
--------------------------------------------------------------------------------
input_ids :  torch.Size([512])
attention_mask :  torch.Size([512])
start_positions :  torch.Size([1])
end_positions :  torch.Size([1])
--------------------------------------------------------------------------------
input_ids :  torch.Size([512])
attention_mask :  torch.Size([512])
start_positions :  torch.Size([1])
end_positions :  torch.Size([1])
--------------------------------------------------------------------------------
____________________________________________________________________________________________________
Validation Dataset Sample Lengths:

input_ids :  torch.Size([512])
attention_mask :  torch.Size([512])
--------------------------------------------------------------------------------
input_ids :  torch.Size([512])
attent

In [23]:
from transformers import default_data_collator

train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=default_data_collator,
    batch_size=2,
)
eval_dataloader = DataLoader(
    val_dataset, 
    collate_fn=default_data_collator, 
    batch_size=2
)

for batch in train_dataloader:
   print(batch['input_ids'].shape)
   print(batch['attention_mask'].shape)
   print(batch['start_positions'].shape)
   print(batch['end_positions'].shape)
   break

print('---'*20)

for batch in eval_dataloader:
   print(batch['input_ids'].shape)
   print(batch['attention_mask'].shape)
   break

torch.Size([2, 512])
torch.Size([2, 512])
torch.Size([2, 1])
torch.Size([2, 1])
------------------------------------------------------------
torch.Size([2, 512])
torch.Size([2, 512])


## Define Model

In [24]:
from torch.optim import AdamW
from tqdm.notebook import tqdm
import datetime

optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 4

# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print(total_steps)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

20020


In [25]:
# we need processed validation data to get offsets at the time of evaluation
validation_processed_dataset = dataset["validation"].map(
        preprocess_validation_examples,
        batched=True,
        remove_columns = dataset["validation"].column_names,
    )

In [26]:
import random,time

# to reproduce results
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
# torch.cuda.manual_seed_all(seed_val)

#storing all training and validation stats
stats = []

#to measure total training time
total_train_time_start = time.time()

for epoch in range(epochs):
    print(' ')
    print(f'=====Epoch {epoch + 1}=====')
    print('Training....')
     
    # ===============================
    #    Train
    # ===============================   
    # measure how long training epoch takes
    t0 = time.time()
     
    training_loss = 0
    plot_loss = 0
    # loop through train data
    model.train()
    for step, batch in enumerate(train_dataloader):
         
        # we will print train time in aftere each 1000 batch
        if step%1000 == 0 and not step == 0:
              elapsed_time = format_time(time.time() - t0)
              # Report progress.
              print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed_time))

         
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
            
        #set gradients to zero
        model.zero_grad()

        result = model(input_ids = input_ids, 
                        attention_mask = attention_mask,
                        start_positions = start_positions,
                        end_positions = end_positions,
                        return_dict=True)
         
        loss = result.loss
    
        # accumulate the loss over batches so that we can calculate avg loss at the end
        training_loss += loss.item()
        plot_loss += loss.item()
        
        # We will print train loss after 1000 batches
        if step%1000 == 0 and not step == 0:
            avg_loss = plot_loss/1000
            print(f'  Batch {step}/{len(train_dataloader)} - Loss: {avg_loss:.4f}')
            plot_loss = 0

        # perform backward propagation
        loss.backward()

        # update the gradients
        optimizer.step()

    # calculate avg loss
    avg_train_loss = training_loss/len(train_dataloader) 
 
    # calculates training time
    training_time = format_time(time.time() - t0)
     
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(training_time))
    
    # ===============================
    #    Validation
    # ===============================
     
    print("")
    print("Running Validation...")

    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()

    start_logits, end_logits = [],[]
    for step, batch in enumerate(eval_dataloader):
    
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():  
             result = model(
                    input_ids = input_ids, 
                    attention_mask = attention_mask,
                    return_dict=True
                    )
        
        start_logits.append(result.start_logits.cpu().numpy())
        end_logits.append(result.end_logits.cpu().numpy())
   
    start_logits = np.concatenate(start_logits)
    end_logits = np.concatenate(end_logits)

    # calculating metrics
    answers,metrics_ = predict_answers_and_evaluate(start_logits, end_logits, validation_processed_dataset, dataset["validation"])
    print(f'Exact match: {metrics_["exact_match"]}, F1 score: {metrics_["f1"]}')

    print('')
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)

    print("  Validation took: {:}".format(validation_time))

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_train_time_start)))

 
=====Epoch 1=====
Training....
  Batch 1,000  of  5,005.    Elapsed: 0:31:22.
  Batch 1000/5005 - Loss: 3.0481
  Batch 2,000  of  5,005.    Elapsed: 1:00:30.
  Batch 2000/5005 - Loss: 2.1065
  Batch 3,000  of  5,005.    Elapsed: 1:33:02.
  Batch 3000/5005 - Loss: 1.9082
  Batch 4,000  of  5,005.    Elapsed: 2:02:39.
  Batch 4000/5005 - Loss: 1.6742
  Batch 5,000  of  5,005.    Elapsed: 2:33:53.
  Batch 5000/5005 - Loss: 1.5530

  Average training loss: 2.06
  Training epoch took: 2:34:01

Running Validation...
Exact match: 26.7, F1 score: 62.92895154304274

  Validation took: 0:03:14
 
=====Epoch 2=====
Training....
  Batch 1,000  of  5,005.    Elapsed: 0:29:22.
  Batch 1000/5005 - Loss: 1.0126
  Batch 2,000  of  5,005.    Elapsed: 0:58:31.
  Batch 2000/5005 - Loss: 1.0061
  Batch 3,000  of  5,005.    Elapsed: 1:43:54.
  Batch 3000/5005 - Loss: 0.9847
  Batch 4,000  of  5,005.    Elapsed: 2:24:42.
  Batch 4000/5005 - Loss: 0.9739
  Batch 5,000  of  5,005.    Elapsed: 2:54:49.
  Batch

# Inference

## Method 1: Produce multilple answers and select best based on logits, handle large contexts too

In [27]:
def answer_question_long_sequence(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    
    Handles both short and long contexts by processing multiple chunks if needed.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text with overflow handling
    encoded_dict = tokenizer(
                    question, 
                    answer_text,
                    max_length=512,
                    padding="max_length",
                    truncation="only_second",
                    stride=128,
                    return_overflowing_tokens=True,
                    return_offsets_mapping=True, 
                )
    
    # Get all chunks
    input_ids_list = encoded_dict['input_ids']
    attention_mask_list = encoded_dict['attention_mask']
    offset_mapping_list = encoded_dict['offset_mapping']
    
    print(f'Context split into {len(input_ids_list)} chunk(s)\n')
    
    # ======== Process Each Chunk ========
    best_answer = None
    best_score = float('-inf')
    
    for chunk_idx in range(len(input_ids_list)):
        input_ids = input_ids_list[chunk_idx]
        attention_mask = attention_mask_list[chunk_idx]
        offsets = offset_mapping_list[chunk_idx]
        
        # Get sequence IDs to identify context tokens
        sequence_ids = encoded_dict.sequence_ids(chunk_idx)
        
        # Run model on this chunk
        with torch.no_grad():
            output = model(
                torch.tensor([input_ids]).to(device),
                attention_mask=torch.tensor([attention_mask]).to(device)
            )
        
        start_logits = output.start_logits[0].cpu().numpy()
        end_logits = output.end_logits[0].cpu().numpy()
        
        # Find top N predictions for this chunk
        n_best = 5
        max_answer_length = 30
        
        start_indexes = np.argsort(start_logits).tolist()[::-1][:n_best]
        end_indexes = np.argsort(end_logits).tolist()[::-1][:n_best]
        
        # Try all combinations of start and end positions
        for start_idx in start_indexes:
            for end_idx in end_indexes:
                # Skip if not in context (sequence_id should be 1 for context)
                if sequence_ids[start_idx] != 1 or sequence_ids[end_idx] != 1:
                    continue
                    
                # Skip invalid spans
                if end_idx < start_idx or end_idx - start_idx + 1 > max_answer_length:
                    continue
                
                # Calculate score
                score = start_logits[start_idx] + end_logits[end_idx]
                
                # Extract answer text using offsets
                start_char = offsets[start_idx][0]
                end_char = offsets[end_idx][1]
                answer_candidate = answer_text[start_char:end_char]
                
                # Update best answer if this is better
                if score > best_score:
                    best_score = score
                    best_answer = {
                        'text': answer_candidate,
                        'score': score,
                        'chunk': chunk_idx,
                        'start_idx': start_idx,
                        'end_idx': end_idx
                    }
    
    # ======== Display Result ========
    if best_answer:
        print(f'Best Answer: "{best_answer["text"]}"')
        print(f'Confidence Score: {best_answer["score"]:.4f}')
        print(f'Found in chunk: {best_answer["chunk"] + 1}/{len(input_ids_list)}')
    else:
        print('No valid answer found.')

## Method 2: Simple answer generation

In [28]:
def answer_question(question, answer_text):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out.
    '''
    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    encoded_dict = tokenizer(
                    question, 
                    answer_text,      # Sentence to encode.
                    add_special_tokens = True,  # Add '[CLS]' and '[SEP]'
                    max_length=512,
                    padding="max_length",
                )
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    assert len(attention_mask) == len(input_ids)

    # Report how long the input sequence is.
    print('Query has {:,} tokens.\n'.format(len(input_ids)))

    # ======== Evaluate ========
    # Run our example question through the model.
    output = model(torch.tensor([input_ids]), # The tokens representing our input text.
                                    attention_mask=torch.tensor([attention_mask])) # The attention mask to differentiate question from answer_text
    
    start_scores = output.start_logits
    end_scores = output.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores.
    answer_start = torch.argmax(start_scores)
    answer_end = torch.argmax(end_scores)

    # Get the string versions of the input tokens.
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token.
    answer = tokens[answer_start]

    # Select the remaining answer tokens and join them with whitespace.
    for i in range(answer_start + 1, answer_end + 1):
        
        # If it's a subword token, then recombine it with the previous token.
        if tokens[i][0:2] == '##':
            answer += tokens[i][2:]
        
        # Otherwise, add a space then the token.
        else:
            answer += ' ' + tokens[i]

    print('Simple answer generation : "' + answer + '"')

In [29]:
question = "How many parameters does BERT-large have?"
bert_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB,so expect it to take a couple minutes to download to your Colab instance."

print("Question: ", question)
answer_question(question, bert_text)
answer_question_long_sequence(question, bert_text)


Question:  How many parameters does BERT-large have?
Query has 512 tokens.

Simple answer generation : "340m parameters"
Context split into 1 chunk(s)

Best Answer: "340M parameters"
Confidence Score: 18.5874
Found in chunk: 1/1


In [30]:
question = "What are some example applications of BERT?"

print("Question: ", question)
answer_question(question, bert_text)
answer_question_long_sequence(question, bert_text)

Question:  What are some example applications of BERT?
Query has 512 tokens.

Simple answer generation : "cola"
Context split into 1 chunk(s)

Best Answer: "it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether"
Confidence Score: -6.6550
Found in chunk: 1/1


In [31]:
question = "What does the 'B' in BERT stand for?"

print("Question: ", question)
answer_question(question, bert_text)
answer_question_long_sequence(question, bert_text)

Question:  What does the 'B' in BERT stand for?
Query has 512 tokens.

Simple answer generation : "it"
Context split into 1 chunk(s)

Best Answer: "large is"
Confidence Score: -3.7095
Found in chunk: 1/1


In [32]:
question = "What is my name?"
context = "My name is Tushar. My job is of data scientist"

print("Question: ", question)
answer_question(question, context)
answer_question_long_sequence(question, context)

Question:  What is my name?
Query has 512 tokens.

Simple answer generation : "tushar ."
Context split into 1 chunk(s)

Best Answer: "Tushar."
Confidence Score: 13.8101
Found in chunk: 1/1


In [33]:
question = "What is my job?"

print("Question: ", question)
answer_question(question, context)
answer_question_long_sequence(question, context)

Question:  What is my job?
Query has 512 tokens.

Simple answer generation : "data"
Context split into 1 chunk(s)

Best Answer: "Tushar."
Confidence Score: 6.2145
Found in chunk: 1/1


In [34]:
question = "Which NFL team represented the AFC at Super Bowl 50?"
answer_text = """Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) \
            champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24â€“10 to earn their third Super Bowl title. The game was played on February 7, \
            2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various \
            gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as \
            "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50."""

print("Question: ", question)
answer_question(question, answer_text)
answer_question_long_sequence(question, answer_text)

Question:  Which NFL team represented the AFC at Super Bowl 50?
Query has 512 tokens.

Simple answer generation : "denver broncos defeated"
Context split into 1 chunk(s)

Best Answer: "Denver Broncos defeated"
Confidence Score: 18.4528
Found in chunk: 1/1


In [35]:
question = "To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?"
 
context = """Architecturally, the school has a Catholic character. Atop the Main Building's gold dome is a golden statue of the Virgin Mary. \
    Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". \
        Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. \
            It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. \
                At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary."""

print("Question: ", question)
answer_question(question, context)
answer_question_long_sequence(question, context)


Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Query has 512 tokens.

Simple answer generation : "saint bernadette soubirous in"
Context split into 1 chunk(s)

Best Answer: "Saint Bernadette Soubirous in"
Confidence Score: 20.3800
Found in chunk: 1/1


In [36]:
questions = 'What is in front of the Notre Dame Main Building?'

print("Question: ", question)
answer_question(question, context)
answer_question_long_sequence(question, context)

Question:  To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?
Query has 512 tokens.

Simple answer generation : "saint bernadette soubirous in"
Context split into 1 chunk(s)

Best Answer: "Saint Bernadette Soubirous in"
Confidence Score: 20.3800
Found in chunk: 1/1
