# Training Reader Model

In [1]:
import torch
import platform
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline, get_linear_schedule_with_warmup
from datasets import load_dataset
from tqdm import tqdm
from torch.optim import AdamW  
from evaluate import load as load_metric
from torch.cuda.amp import autocast, GradScaler
torch.backends.cudnn.benchmark = True
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
print(f"Python version: {platform.python_version()}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU model: {torch.cuda.get_device_name(0)}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")
    print(f"Available GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Python version: 3.9.9
PyTorch version: 2.3.1+cu121
CUDA available: True
CUDA version: 12.1
GPU model: Tesla T4
Number of GPUs: 1
Available GPU memory: 15.64 GB
Using device: cuda


In [3]:
print("Loading SQuAD dataset…")
squad = load_dataset("squad")

Loading SQuAD dataset…


In [4]:
print(squad)
print("Example entry:", squad["train"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})
Example entry: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues an

In [5]:
# Loading model & tokenizer
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to(device)
#model = torch.compile(model)  # requires PyTorch 2+

In [6]:
# Preprocessing SQuAD examples
max_length = 384   
doc_stride = 128   

def prepare_train_features(examples):
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sample_idx = sample_mapping[i]
        answers = examples["answers"][sample_idx]
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index, token_end_index = None, None
            sequence_ids = tokenized.sequence_ids(i)
            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)
            for idx in range(context_start, context_end + 1):
                if offsets[idx][0] <= start_char < offsets[idx][1]:
                    token_start_index = idx
                    break
            for idx in range(context_end, context_start - 1, -1):
                if offsets[idx][0] < end_char <= offsets[idx][1]:
                    token_end_index = idx
                    break
            if token_start_index is None or token_end_index is None:
                token_start_index = cls_index
                token_end_index = cls_index

            start_positions.append(token_start_index)
            end_positions.append(token_end_index)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

# Using datasets.map to preprocess train and validation
train_dataset = squad["train"].map(
    prepare_train_features,
    batched=True,
    remove_columns=squad["train"].column_names,
)
eval_dataset = squad["validation"].map(
    prepare_train_features,
    batched=True,
    remove_columns=squad["validation"].column_names,
)

# Converting to PyTorch DataLoader
class QADataset(Dataset):
    def __init__(self, hf_dataset):
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        return {k: torch.tensor(v) for k, v in example.items()}

In [7]:
# Fine Tuning

# DataLoader 
train_loader = DataLoader(
    QADataset(train_dataset),
    batch_size=32,          
    shuffle=True,
    pin_memory=True,
    num_workers=8,
    persistent_workers=True
)
eval_loader = DataLoader(
    QADataset(eval_dataset),
    batch_size=32,
    pin_memory=True,
    num_workers=8,
    drop_last=True
)

# Optimizer
epochs = 2
learning_rate = 3e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = int(0.1 * total_steps),
    num_training_steps = total_steps
)
scaler = GradScaler()

# Training loop 
model.train()
for epoch in range(epochs):
    total_loss = 0.0

    for batch in tqdm(train_loader, desc=f"Train Epoch {epoch+1}"):
        optimizer.zero_grad()

        input_ids        = batch["input_ids"].to(device)
        attention_mask   = batch["attention_mask"].to(device)
        start_positions  = batch["start_positions"].to(device)
        end_positions    = batch["end_positions"].to(device)

        with autocast():  
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                end_positions=end_positions
            )
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} avg train loss: {avg_train_loss:.4f}")

    # Quick validation on first 20 batches
    model.eval()
    val_loss = 0.0
    val_steps = 0
    with torch.no_grad():
        for batch in list(eval_loader)[:20]:
            input_ids       = batch["input_ids"].to(device)
            attention_mask  = batch["attention_mask"].to(device)
            start_positions = batch["start_positions"].to(device)
            end_positions   = batch["end_positions"].to(device)

            with autocast():
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    end_positions=end_positions
                )
            val_loss += outputs.loss.item()
            val_steps += 1

    avg_val_loss = val_loss / val_steps
    print(f"Epoch {epoch+1} quick val loss: {avg_val_loss:.4f}\n")

    model.train()


Train Epoch 1: 100%|████████████████████████████████████████████████████████| 2767/2767 [12:27<00:00,  3.70it/s]


Epoch 1 avg train loss: 0.7485





Epoch 1 quick val loss: 1.1257



Train Epoch 2: 100%|████████████████████████████████████████████████████████| 2767/2767 [12:25<00:00,  3.71it/s]


Epoch 2 avg train loss: 0.5576





Epoch 2 quick val loss: 1.1752



In [None]:
'''
# 1. Save the model & tokenizer
model.save_pretrained("checkpoint/reader_model")
tokenizer.save_pretrained("checkpoint/reader_tokenizer")

# 2. Save optimizer & scheduler state, plus current epoch
import torch, os
os.makedirs("checkpoint", exist_ok=True)

torch.save({
    "epoch": epoch,                         
    "model_state_dict": model.state_dict(),
    "optimizer_state_dict": optimizer.state_dict(),
    "scheduler_state_dict": scheduler.state_dict(),
}, "checkpoint/reader_training.pt")

print("Checkpoint saved!")

# 1) Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 2) Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained("checkpoint/reader_tokenizer")
model     = AutoModelForQuestionAnswering.from_pretrained("checkpoint/reader_model")
model     = model.to(device)

# 3) Create optimizer & scheduler with same settings
learning_rate = 3e-5
epochs        = 2        
train_loader = DataLoader(
    QADataset(train_dataset),
    batch_size=16,            
    shuffle=True,
    pin_memory=True,          
    num_workers=4             
)  
total_steps   = len(train_loader) * epochs

optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = int(0.1 * total_steps),
    num_training_steps = total_steps
)

# 4) Load the training state
ckpt = torch.load("checkpoint/reader_training.pt", map_location=device)
model.load_state_dict(ckpt["model_state_dict"])
optimizer.load_state_dict(ckpt["optimizer_state_dict"])
scheduler.load_state_dict(ckpt["scheduler_state_dict"])
start_epoch = ckpt["epoch"] + 1
'''

In [None]:
'''eval_loader = DataLoader(
    QADataset(eval_dataset),
    batch_size=8,
    drop_last=True,
    pin_memory=True,
    num_workers=4
)'''

In [8]:
# ──────────── Full Batched QA Evaluation────────────

# QA pipeline 
qa_pipe = pipeline(
    "question-answering",
    model=model,        
    tokenizer=tokenizer,
    device=device,
    handle_impossible_answer=False,
    batch_size=64
)

# Load Data
raw_eval = load_dataset("squad", split="validation")
questions = raw_eval["question"]
contexts  = raw_eval["context"]

# Runnig
preds = qa_pipe(question=questions, context=contexts)
predictions = [
    {"id": ex["id"], "prediction_text": out["answer"]}
    for ex, out in zip(raw_eval, preds)
]
references = [
    {"id": ex["id"], "answers": ex["answers"]}
    for ex in raw_eval
]

metric  = load_metric("squad")
results = metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score    : {results['f1']:.2f}\n")

# Example Prediction
i = 0  # index of the example to display
print("Example #1:")
print("Question:        ", questions[i])
print("Context excerpt: ", contexts[i])
print("Gold Answers:    ", raw_eval[i]["answers"]["text"])
print("Model Prediction:", preds[i]["answer"])

i+=1
print("Example #2:")
print("Question:        ", questions[i])
print("Context excerpt: ", contexts[i])
print("Gold Answers:    ", raw_eval[i]["answers"]["text"])
print("Model Prediction:", preds[i]["answer"])

i+=1
print("Example #3:")
print("Question:        ", questions[i])
print("Context excerpt: ", contexts[i])
print("Gold Answers:    ", raw_eval[i]["answers"]["text"])
print("Model Prediction:", preds[i]["answer"])

Device set to use cuda


Exact Match: 75.43
F1 Score    : 84.12

Example #1:
Question:         Which NFL team represented the AFC at Super Bowl 50?
Context excerpt:  Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.
Gold Answers:     ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
Model Pred

In [9]:
# Saving Model
'''
save_dir = "final_qa_model"
os.makedirs(save_dir, exist_ok=True)
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Model & tokenizer saved to `{save_dir}/`")
'''

Model & tokenizer saved to `final_qa_model/`


# Loading the model

In [3]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("final_qa_model")
model     = AutoModelForQuestionAnswering.from_pretrained("final_qa_model")
model     = model.to(device)
print("Model reloaded and ready!")


Model reloaded and ready!


In [3]:
# Ensuring that model working properly
from datasets import load_dataset
from evaluate import load as load_metric
from transformers import pipeline
model.eval()

qa_pipe = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
    handle_impossible_answer=False,
    batch_size=64
)

raw_eval = load_dataset("squad", split="validation")
questions = raw_eval["question"]
contexts  = raw_eval["context"]

preds = qa_pipe(question=questions, context=contexts)

predictions = [
    {"id": ex["id"], "prediction_text": out["answer"]}
    for ex, out in zip(raw_eval, preds)
]
references = [
    {"id": ex["id"], "answers": ex["answers"]}
    for ex in raw_eval
]

metric  = load_metric("squad")
results = metric.compute(predictions=predictions, references=references)
print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score    : {results['f1']:.2f}\n")

for i in range(3):
    print(f"Example #{i+1}:")
    print(" Question:       ", questions[i])
    print(" Context excerpt:", contexts[i][:200].replace("\n", " "), "…")
    print(" Gold Answers:   ", raw_eval[i]["answers"]["text"])
    print(" Model Prediction:", preds[i]["answer"])
    print("-" * 80)

Device set to use cuda


Exact Match: 78.67
F1 Score    : 86.55

Example #1:
 Question:        Which NFL team represented the AFC at Super Bowl 50?
 Context excerpt: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated …
 Gold Answers:    ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
 Model Prediction: Denver Broncos
--------------------------------------------------------------------------------
Example #2:
 Question:        Which NFL team represented the NFC at Super Bowl 50?
 Context excerpt: Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated …
 Gold Answers:    ['Carolina Panthers', 'Carolina Panthers', 'Carolina Panthers']
 Model Prediction: Carolina Panthers
---------------------------------------------

# Utilizing entire pipeline

In [4]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from transformers import AutoTokenizer, AutoModel
import re
import nltk
from tqdm import tqdm
from datasets import load_dataset
import numpy as np
from transformers import pipeline

In [5]:
# Loading retriever model
encoder_path   = "model2/encoder"
tokenizer_path = "model2/tokenizer"
tokenizer_retriever = AutoTokenizer.from_pretrained(tokenizer_path)
retrieve_model      = AutoModel.from_pretrained(encoder_path).to(device)
retrieve_model.eval()
print("Retriever model and tokenizer loaded.")

Retriever model and tokenizer loaded.


In [10]:
# Rebuilding cleaned_query_passage_pairs.parquet (same code in used in retriever model)
nltk.download('punkt_tab')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def clean_text(text):
    if text is None:
        return ""
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def split_passages(text, min_tokens=80, max_tokens=300):
    from nltk.tokenize import sent_tokenize
    sentences = sent_tokenize(text)
    passages = []
    current = ""
    for sent in sentences:
        if len(current.split()) + len(sent.split()) <= max_tokens:
            current += " " + sent
        else:
            if len(current.split()) >= min_tokens:
                passages.append(current.strip())
            current = sent
    if len(current.split()) >= min_tokens:
        passages.append(current.strip())
    return passages

print("Loading MS MARCO train set…")
dataset = load_dataset("ms_marco", "v2.1", split="train")

queries, passages = [], []
print("Extracting and splitting passages…")
for item in tqdm(dataset, desc="MS MARCO items"):
    query = item.get("query", "")
    pinfo = item.get("passages", {})
    for is_sel, ptext in zip(pinfo.get("is_selected", []), pinfo.get("passage_text", [])):
        if is_sel == 1:
            txt = clean_text(ptext)
            for chunk in split_passages(txt):
                queries.append(query)
                passages.append(chunk)
'''
df = pd.DataFrame({"query": queries, "passage": passages})
df.to_parquet("cleaned_query_passage_pairs.parquet", index=False)
print(f"Saved {len(passages)} passages to cleaned_query_passage_pairs.parquet")'''

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/tgs2126/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /home/tgs2126/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/tgs2126/nltk_data...
[nltk_data] Downloading package omw-1.4 to /home/tgs2126/nltk_data...


Loading MS MARCO train set…
Extracting and splitting passages…


MS MARCO items: 100%|█████████████████████████████████████████████████| 808731/808731 [03:13<00:00, 4184.41it/s]


Saved 99394 passages to cleaned_query_passage_pairs.parquet


In [6]:
# 1) Loading data
df_passages = pd.read_parquet("cleaned_query_passage_pairs.parquet")
passages = df_passages["passage"].tolist()

# 2) embedding function using our retriever model
def embed_texts(texts, batch_size=64):
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Embedding passages"):
            batch = texts[i : i + batch_size]
            inputs = tokenizer_retriever(
                batch,
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors="pt"
            ).to(device)
            outputs = retrieve_model(**inputs)
            emb = outputs.last_hidden_state.mean(dim=1).cpu()
            embeddings.append(emb)
    return torch.cat(embeddings, dim=0).numpy()

# 3) We embed all passages
passage_embeddings = embed_texts(passages)

# 4) We build the FAISS index
top_k = 5
index = NearestNeighbors(n_neighbors=top_k, metric="cosine")
index.fit(passage_embeddings)
print("Passages embedded and index built.")

Embedding passages: 100%|███████████████████████████████████████████████████| 1554/1554 [12:18<00:00,  2.10it/s]


Passages embedded and index built.


In [7]:
# We define the same embed_texts helper for queries
def embed_queries(queries, batch_size=64):
    embeddings = []
    with torch.no_grad():
        for i in tqdm(range(0, len(queries), batch_size), desc="Embedding queries"):
            batch = queries[i : i + batch_size]
            inputs = tokenizer_retriever(
                batch,
                padding="max_length",
                truncation=True,
                max_length=256,
                return_tensors="pt"
            ).to(device)
            outputs = retrieve_model(**inputs)
            emb = outputs.last_hidden_state.mean(dim=1).cpu()
            embeddings.append(emb)
    return torch.cat(embeddings, dim=0).numpy()

In [8]:
# Queries of interest
example_queries = [
    "What causes rainbows to form?",
    "Who wrote the novel 1984?",
    "How do neural networks learn?"
]

# We embed them
query_embeddings = embed_queries(example_queries)

# We retrieve top-k passages per query
distances, indices = index.kneighbors(query_embeddings, return_distance=True)
for qi, query in enumerate(example_queries):
    print(f"\nQuery: {query}")
    for rank, idx in enumerate(indices[qi]):
        print(f"  Top {rank+1} (score={1-distances[qi][rank]:.3f}): {passages[idx][:200]}…")


Embedding queries: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.24it/s]



Query: What causes rainbows to form?
  Top 1 (score=0.643): A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky. It takes the form of …
  Top 2 (score=0.629): A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.It takes the form of a…
  Top 3 (score=0.620): A rainbow is caused by the refraction and internal reflection of light inside rain drops, which results in the white sunlight being separated out into the colours of the rainbow. See the detailed page…
  Top 4 (score=0.578): The longer wavelength coloured light, such as red, has a large rainbow angle, then the short wavelength colours, such as blue. The index of refraction of light in water is a measure of the speed of li…
  Top 5 (score=0.575): Split and merge into it. Answer by Sedu

# Testing the entire pipeline

In [36]:
my_queries = [
    "What is the capital of France?",
    "What is the chemical formula for water?",
    "How far is the Moon from Earth?",
    "When did the first human walk on the Moon?"
]

In [39]:
# 1) Embeding
query_embeddings = embed_queries(my_queries)

# 2) Retrieving top-k passages
distances, indices = index.kneighbors(query_embeddings, return_distance=True)

# 3) Runing the reader on each query’s passages
for qi, query in enumerate(my_queries):
    candidate_passages = [passages[idx] for idx in indices[qi]]
    qa_inputs = {
        "question": [query] * top_k,
        "context" : candidate_passages
    }
    outputs = qa_pipe(**qa_inputs)

    best = max(outputs, key=lambda x: x["score"])
    print(f"\nQuery: {query}")
    print(f"Answer: {best['answer']} (score={best['score']:.3f})")
    print("From passage:", candidate_passages[outputs.index(best)], "\n")


Embedding queries: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 23.49it/s]



Query: What is the capital of France?
Answer: Paris (score=0.993)
From passage: France spans 643,801 square kilometres (248,573 sq mi) and has a total population of 66.7 million. It is a unitary semi-presidential republic with the capital in Paris, the country's largest city and main cultural and commercial centre. During the Iron Age, what is now metropolitan France was inhabited by the Gauls, a Celtic people. The area was annexed in 51 BC by Rome, which held Gaul until 486, when the Germanic Franks conquered the region and formed the Kingdom of France. 


Query: What is the chemical formula for water?
Answer: H 2 O (score=0.974)
From passage: Answer: Yes, water is a compound. A compound forms whenever two or more atoms form chemical bonds with each other. The chemical formula for water is H 2 O, which means each molecule of water consists of one oxygen atom chemically bonded to two hydrogen atoms.Thus, water is a compound.It's also a molecule, which is any chemical species formed by

# Stress-test for showing model vulnerability

In [40]:
# These are only intended to demonstrate the model's vulnerability to more 
# difficult sequences and will help us understand how we can improve it in the future.
my_queries = [
    "Who was the second person to walk on the Moon, and what city was he born in?",
    "Which Nobel Prize in Literature laureate was born the same year the RMS Titanic sank?",
    "When did she win her first Olympic gold?",
    "How many countries share a land border with Germany?",
    "Which chemical element has an atomic number equal to the number of letters in its English name?"
]

In [41]:
# 1) Embeding
query_embeddings = embed_queries(my_queries)

# 2) Retrieving top-k passages
distances, indices = index.kneighbors(query_embeddings, return_distance=True)

# 3) Runing the reader on each query’s passages
for qi, query in enumerate(my_queries):
    candidate_passages = [passages[idx] for idx in indices[qi]]
    qa_inputs = {
        "question": [query] * top_k,
        "context" : candidate_passages
    }
    outputs = qa_pipe(**qa_inputs)

    best = max(outputs, key=lambda x: x["score"])
    print(f"\nQuery: {query}")
    print(f"Answer: {best['answer']} (score={best['score']:.3f})")
    print("From passage:", candidate_passages[outputs.index(best)], "\n")

Embedding queries: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 20.03it/s]



Query: Who was the second person to walk on the Moon, and what city was he born in?
Answer: Roald Amundsen (score=0.598)
From passage: The Norwegian explorer Roald Amundsen was one of the most important people in the history of polar exploration. He traveled to both the Arctic and the Antarctic, and he was the first person to reach the South Pole. (1872-1928). One of the most important figures in the history of polar exploration was Roald Amundsen. He was the first person to reach the South Pole, the first to sail through the Northwest Passage, and the first to fly over the North Pole. 


Query: Which Nobel Prize in Literature laureate was born the same year the RMS Titanic sank?
Answer: Jose Echegaray (score=0.849)
From passage: The news that she had been awarded the Nobel Prize for Literature came when the Chilean poet was serving as a consul in the city of Petropolis, Brazil. In 1945, Gabriela Mistral became the first Latin American to ever win the Nobel Prize for Literature.owever

# Final Pipeline Performance Evaluation

In [9]:
# 1) Device
device = 0 if torch.cuda.is_available() else -1

# 2) Retrieval settings
recall_k1 = 5
recall_k2 = 20

# 3) Loading SQuAD validation
squad    = load_dataset("squad", split="validation")
questions = squad["question"]
contexts   = squad["context"]
answers    = squad["answers"]
n          = len(questions)

# 4) Building retrieval index 
context_embeddings = embed_texts(contexts)  
index = NearestNeighbors(n_neighbors=recall_k2, metric="cosine")
index.fit(context_embeddings)

# 5) Retrieve top 20
query_embeddings = embed_queries(questions)
distances, indices = index.kneighbors(query_embeddings, return_distance=True)

# 6) Recall@5 and Recall@20
recall5  = np.mean([1 if i in idxs[:recall_k1] else 0 for i, idxs in enumerate(indices)])
recall20 = np.mean([1 if i in idxs else 0 for i, idxs in enumerate(indices)])
rr20 = [
    1.0/(idxs.tolist().index(i)+1)  
    if i in idxs else 0.0           
    for i, idxs in enumerate(indices)
]
mrr20 = np.mean(rr20)

print(f"Retrieval Recall@{recall_k1}: {recall5:.3f}")
print(f"Retrieval Recall@{recall_k2}: {recall20:.3f}")
print(f"Retrieval MRR@{recall_k2}:    {mrr20:.3f}")

# 7) QA pipeline
qa_pipe = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=device,
    handle_impossible_answer=False,
    batch_size=recall_k2  
)

# 8) EM/F1
predictions = []
for i in tqdm(range(n), desc="End-to-end QA"):
    q     = questions[i]
    cands = [contexts[j] for j in indices[i][:recall_k1]]
    outs  = qa_pipe(question=[q]*recall_k1, context=cands)
    best  = max(outs, key=lambda x: x["score"])
    predictions.append({"id": squad[i]["id"], "prediction_text": best["answer"]})

references = [{"id": squad[i]["id"], "answers": answers[i]} for i in range(n)]
metric     = load_metric("squad")
results    = metric.compute(predictions=predictions, references=references)

print(f"Exact Match: {results['exact_match']:.2f}")
print(f"F1 Score    : {results['f1']:.2f}")

Embedding passages: 100%|█████████████████████████████████████████████████████| 166/166 [01:21<00:00,  2.05it/s]
Embedding queries: 100%|██████████████████████████████████████████████████████| 166/166 [01:17<00:00,  2.15it/s]
Device set to use cuda:0


Retrieval Recall@5: 0.503
Retrieval Recall@20: 0.742
Retrieval MRR@20:    0.266


End-to-end QA:   0%|                                                          | 6/10570 [00:00<06:26, 27.35it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
End-to-end QA: 100%|██████████████████████████████████████████████████████| 10570/10570 [05:45<00:00, 30.62it/s]


Exact Match: 44.50
F1 Score    : 50.70
