# QA Comparison: BERT-style Extractive vs RAG-style Generative

In [None]:
# !pip install transformers torch scikit-learn

In [6]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering, AutoModel, AutoModelForSeq2SeqLM
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Example text and question

In [59]:
context = '''ProductX is the latest widget released in 2024. It features improved battery life.
To reset ProductX, hold the power button for 10 seconds until the LED blinks.
Our support plans include Basic, Plus, and Enterprise tiers. ''' 
# context = [
#     "ProductX is the latest widget released in 2024. It features improved battery life.",
#     "To reset ProductX, hold the power button for 10 seconds until the LED blinks.",
#     "Our support plans include Basic, Plus, and Enterprise tiers, offering 24/7 support in higher tiers."
# ]
question = 'How many plans are there   ?'

## 🔶 Part 1: BERT-style extractive QA

In [60]:
bert_qa = pipeline('question-answering', model='distilbert-base-cased-distilled-squad') 
bert_result = bert_qa(question=question, context=context) 
print(f"Extractive Answer: {bert_result['answer']}")

Extractive Answer: Basic, Plus, and Enterprise tiers


## 🟢 Part 2: RAG-style semantic retrieval + generative QA

In [61]:
# Embedding model\n
embed_model_name = 'sentence-transformers/all-MiniLM-L6-v2' 
embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_name) 
embed_model = AutoModel.from_pretrained(embed_model_name)

In [62]:
def mean_pooling(model_output, attention_mask): 
    token_embeddings = model_output[0]  
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 
    return (token_embeddings * input_mask_expanded).sum(1) / input_mask_expanded.sum(1)

In [63]:
def embed(text): 
    tokens = embed_tokenizer([text], return_tensors='pt', padding=True, truncation=True) 
    with torch.no_grad(): 
        output = embed_model(**tokens) 
    return mean_pooling(output, tokens['attention_mask']).numpy()

In [64]:
# Split context into sentences\n
sentences = context.strip().split(".") 
sentence_embeddings = np.vstack([embed(s) for s in sentences]) 
query_embedding = embed(question)[0]
print("query shape:", query_embedding.shape)
print("sentences shape:", sentence_embeddings.shape)

similarities = cosine_similarity([query_embedding], sentence_embeddings)[0]  
best_idx = int(np.argmax(similarities)) 
retrieved = sentences[best_idx].strip() 
print(f"Retrieved for RAG: {retrieved}")

query shape: (384,)
sentences shape: (5, 384)
Retrieved for RAG: Our support plans include Basic, Plus, and Enterprise tiers


In [65]:
query_embedding.shape
sentence_embeddings.shape

(5, 384)

In [66]:
# Generation model\n
gen_pipeline = pipeline('text2text-generation', model='google/flan-t5-base') 
prompt = f"Context: {retrieved} Question: {question} Answer:" 
gen_result = gen_pipeline(prompt, max_length=100)[0]['generated_text'] 
print(f"Generative Answer: {gen_result}")



Generative Answer: three
