In [16]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

In [17]:
# Tokenizer for the question-answering checkpoint named below.
QA_CHECKPOINT = "deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(QA_CHECKPOINT)

In [18]:
def create_noise_text(repeats: int = 20) -> str:
    """Build a long block of irrelevant filler text for long-context experiments.

    Eight unrelated office-announcement sentences are concatenated in a fixed
    order and the whole sequence is repeated ``repeats`` times, with a single
    space between consecutive sentences.

    Args:
        repeats: How many times the eight-sentence sequence is repeated.
            The default (20) reproduces the original fixed-length text.
            Zero or a negative value yields an empty string.

    Returns:
        The filler text as one string (no leading or trailing whitespace).
    """
    noise_sentences = [
        "The company policy states that employees must arrive on time.",
        "Weather conditions today are partly cloudy with mild temperatures.",
        "The quarterly meeting will be held next Tuesday in conference room B.",
        "Please remember to submit your timesheets before Friday.",
        "The cafeteria menu has been updated with new vegetarian options.",
        "IT maintenance is scheduled for this weekend from 2-4 AM.",
        "New parking regulations will be enforced starting next month.",
        "The annual company picnic has been moved to August 15th.",
    ]
    # List repetition keeps the sentence order; join inserts the separating spaces.
    return " ".join(noise_sentences * repeats)

# Hide the one answer-bearing sentence between two long runs of filler text,
# so the answer sits in the middle of a context that spans several chunks.
question = "what is the fine?"
context_actual = "the fine is 100 euros."
context_part1 = create_noise_text()
context_part2 = create_noise_text()

# Single spaces separate the three pieces.
context_final = " ".join((context_part1, context_actual, context_part2))
print(context_final)

The company policy states that employees must arrive on time. Weather conditions today are partly cloudy with mild temperatures. The quarterly meeting will be held next Tuesday in conference room B. Please remember to submit your timesheets before Friday. The cafeteria menu has been updated with new vegetarian options. IT maintenance is scheduled for this weekend from 2-4 AM. New parking regulations will be enforced starting next month. The annual company picnic has been moved to August 15th. The company policy states that employees must arrive on time. Weather conditions today are partly cloudy with mild temperatures. The quarterly meeting will be held next Tuesday in conference room B. Please remember to submit your timesheets before Friday. The cafeteria menu has been updated with new vegetarian options. IT maintenance is scheduled for this weekend from 2-4 AM. New parking regulations will be enforced starting next month. The annual company picnic has been moved to August 15th. The 

In [19]:
# Tokenize the (question, context) pair into overlapping, equally sized chunks
# that can be fed to the model as a single batch.
chunking_options = dict(
    max_length=512,                  # token budget per chunk
    truncation="only_second",        # only the context (second sequence) is cut
    stride=100,                      # tokens shared by consecutive chunks
    return_overflowing_tokens=True,  # keep the overflow as extra chunks
    return_offsets_mapping=True,     # character span of every token
    padding=True,                    # pad shorter chunks so all rows have equal length
    return_tensors="pt",             # stack rows into PyTorch tensors (needs padding)
)
inputs = tokenizer(question, context_final, **chunking_options)

# Quick look at what the tokenizer produced.
chunk_ids = inputs["input_ids"]
print(type(inputs))
print(f"Number of chunks created: {len(chunk_ids)}")
print(f"Chunk lengths: {[len(row) for row in chunk_ids]}")
print(chunk_ids)

<class 'transformers.tokenization_utils_base.BatchEncoding'>
Number of chunks created: 9
Chunk lengths: [512, 512, 512, 512, 512, 512, 512, 512, 512]
tensor([[    0, 12196,    16,  ..., 28538,  5765,     2],
        [    0, 12196,    16,  ...,  1410,     7,     2],
        [    0, 12196,    16,  ...,     4,  3401,     2],
        ...,
        [    0, 12196,    16,  ...,   138,   714,     2],
        [    0, 12196,    16,  ...,   700,  2580,     2],
        [    0, 12196,    16,  ...,     1,     1,     1]])


# Understanding the Core Concepts

## The Problem We're Solving

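When you have a long document and a question, the QA model has a **token limit** (512 tokens for RoBERTa, counting the question, the context, and the special tokens together). Your document is longer than this limit, so with `return_overflowing_tokens=True` the tokenizer splits the context into **multiple chunks**: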

```python
inputs = tokenizer(question, context_final, truncation="only_second", return_overflowing_tokens=True, max_length=512, stride=100)
```

**What happens here conceptually:**
- The tokenizer takes your question + long context
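- It creates the first chunk: the question + as many context tokens as fit in the 512-token budget (the question and the special tokens count toward `max_length`, so this is slightly fewer than 512 context tokens)
- It creates the second chunk: the question again + the last 100 context tokens of chunk 1 (the overlap) + new context tokens until the 512-token budget is full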
- This continues until the entire document is covered

## The Stride Concept

The `stride=100` argument makes consecutive chunks overlap by 100 tokens. Without overlap, an answer might be **split across two chunks**:

```
Chunk 1: "...the fine is 1"  
Chunk 2: "00 euros..."
```

With `stride=100`, an answer that falls inside the 100-token overlap appears in full in both chunks:
```
Chunk 1: "...the fine is 100 euros..."  
Chunk 2: "...the fine is 100 euros..." (overlapped)
```

In [20]:
# Batch processing: score every valid answer span in every chunk, keep the best.
#
# Candidate spans are restricted to context tokens. Without that restriction the
# argmax can land on <s>, on question tokens or on padding, and a chunk whose
# best start/end is position 0 would pass the "end >= start" check and could win
# with an empty answer.
# NOTE(review): squad2-style checkpoints conventionally use the first (<s>)
# position to signal "no answer" - confirm. Masking it means this cell always
# returns a span; compare against the <s> score if unanswerable questions matter.

# Load the QA model
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

print(f"Input tensor shape: {inputs['input_ids'].shape}")

# Batch inference - all chunks at once!
# Only input_ids and attention_mask are passed; `inputs` also holds
# offset_mapping, which was requested from the tokenizer but is not fed to the model.
with torch.no_grad():
    outputs = model(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask']
    )

num_chunks, seq_len = inputs['input_ids'].shape
MAX_ANSWER_TOKENS = 30  # longest span (in tokens) accepted as an answer
neg_inf = float('-inf')

# 1. Keep only context positions as candidates.
# sequence_ids(i) labels each position of chunk i: None for special/padding
# tokens, 0 for the first sequence (question), 1 for the second (context).
context_mask = torch.tensor(
    [[seq_id == 1 for seq_id in inputs.sequence_ids(chunk)] for chunk in range(num_chunks)],
    dtype=torch.bool,
    device=outputs.start_logits.device,
)
start_logits = outputs.start_logits.masked_fill(~context_mask, neg_inf)
end_logits = outputs.end_logits.masked_fill(~context_mask, neg_inf)

# 2. Score every (start, end) pair jointly: shape [num_chunks, seq_len, seq_len].
# Picking start and end independently can give end < start and would discard
# a chunk that still contains a valid, slightly lower-scoring span.
span_scores = start_logits.unsqueeze(2) + end_logits.unsqueeze(1)

# 3. Validations (vectorized): start <= end and at most MAX_ANSWER_TOKENS tokens.
all_pairs = torch.ones(seq_len, seq_len, dtype=torch.bool, device=span_scores.device)
valid_mask = torch.triu(all_pairs) & ~torch.triu(all_pairs, diagonal=MAX_ANSWER_TOKENS)
# Invalid pairs get -inf so they can never win.
span_scores = span_scores.masked_fill(~valid_mask, neg_inf)

# 4. Best span per chunk -> vectors of size [num_chunks]
chunk_confidences, flat_best = span_scores.flatten(start_dim=1).max(dim=1)
chunk_start_indices = torch.div(flat_best, seq_len, rounding_mode='floor')
chunk_end_indices = flat_best % seq_len

# 5. Find the best chunk among all chunks
best_chunk_idx = torch.argmax(chunk_confidences).item()
best_confidence = chunk_confidences[best_chunk_idx].item()
if best_confidence == neg_inf:
    # Every candidate was masked out, i.e. no chunk contained a context token.
    raise ValueError("No context tokens found in any chunk; cannot extract an answer.")

# 6. Extract details
start_idx = chunk_start_indices[best_chunk_idx].item()
end_idx = chunk_end_indices[best_chunk_idx].item()

print(f"Winner Chunk: {best_chunk_idx}")
print(f"Start position: {start_idx}, End position: {end_idx}")
print(f"Combined confidence: {best_confidence:.2f}")

# Extract answer by decoding the winning token ids (end index is inclusive).
# Decoding can keep tokenizer artefacts such as a leading space; slicing
# context_final with inputs['offset_mapping'] would return the exact source text.
answer_tokens = inputs['input_ids'][best_chunk_idx][start_idx:end_idx+1]
answer = tokenizer.decode(answer_tokens, skip_special_tokens=True)
print(f"\nAnswer: '{answer}'")

Input tensor shape: torch.Size([9, 512])
Winner Chunk: 4
Start position: 219, End position: 220
Combined confidence: 9.87

Answer: ' 100 euros'
