# 🔍 FAISS Similarity Search with Sentence Transformers + RAG

In [None]:
!pip install sentence-transformers pymupdf faiss-cpu openai


In [None]:
import requests
import fitz  # PyMuPDF
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import getpass
import os
import openai


In [None]:
# --- Step 1: Download PDFs from arXiv ---
pdf_urls = [
    "https://arxiv.org/pdf/2401.15884",
    "https://arxiv.org/pdf/2005.11401",
]

# Save each paper into the working directory as paper_<i>.pdf and remember the
# path so the extraction step can open it.
local_paths = []
for i, url in enumerate(pdf_urls):
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=60)
    # Fail loudly on an HTTP error instead of writing an error page to a
    # ".pdf" file that would only break later during text extraction.
    response.raise_for_status()
    filename = f"paper_{i}.pdf"
    with open(filename, "wb") as f:
        f.write(response.content)
    local_paths.append(filename)

print(f"Downloaded {len(local_paths)} PDFs.")


In [None]:
# --- Step 2: Extract text using PyMuPDF ---
def extract_text_from_pdf(path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        A single string made of each page's ``get_text()`` output with no
        separator inserted between pages.
    """
    # Open the document as a context manager so it is closed (and its file
    # handle released) even if extraction raises part-way through.
    with fitz.open(path) as doc:
        # join() builds the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

# Pull the text out of each downloaded paper, then merge everything into one
# corpus string with a newline between papers.
texts = list(map(extract_text_from_pdf, local_paths))
full_text = "\n".join(texts)
print(f"Total extracted characters: {len(full_text)}")


In [None]:
# --- Step 3: Split text into overlapping chunks ---
def split_text(text, chunk_size=500, overlap=100):
    """Split *text* into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.
    The final windows may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must be
            smaller than ``chunk_size``.

    Returns:
        A list of chunk strings in document order (empty for empty *text*).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``overlap`` is
            not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    step = chunk_size - overlap
    # A non-positive step would never advance the window, which previously
    # made the loop run forever.
    if step <= 0:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )
    # Slicing past the end of a string is safe, so no explicit clamp is needed.
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Chunk the combined corpus using split_text's default chunk size and overlap.
# The position of each chunk in this list is what the later search steps use
# to map index results back to text.
chunks = split_text(full_text)
print(f"Generated {len(chunks)} chunks.")


In [None]:
# --- Step 4: Generate embeddings ---
# Load the "all-MiniLM-L6-v2" sentence-transformer. The same model object is
# reused later to embed queries, so queries and chunks share one vector space.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Encode every chunk in a single call and get the result back as a NumPy array
# (presumably one row per chunk; the index step reads the width from shape[1]).
embeddings = model.encode(chunks, convert_to_numpy=True, show_progress_bar=True)


In [None]:
# --- Step 5: Build FAISS index ---
# The index must be created with the embedding width, read from the array.
dimension = embeddings.shape[1]
# Flat (exhaustive) L2 index: search results are distances, so smaller values
# mean closer matches.
index = faiss.IndexFlatL2(dimension)
# Vectors are added in chunk order, so an index result i refers to chunks[i].
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")


In [None]:
# --- Step 6: Similarity search ---
def search(query, k=3):
    """Print the chunks closest to *query* according to the FAISS index.

    The query is embedded with the module-level ``model``, looked up in the
    module-level ``index``, and each hit is printed with its rank, the
    distance reported by the index, and up to 500 characters of the chunk.

    Args:
        query: Natural-language search string.
        k: Number of nearest neighbours to request from the index.

    Returns:
        None. Results are written to stdout.
    """
    query_embedding = model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    print(f"\nTop {k} results for: '{query}'\n")
    for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
        # FAISS pads the result with -1 labels when fewer than k vectors are
        # available. Indexing chunks[-1] would silently show the last chunk
        # as a bogus hit, so skip the padding instead.
        if idx < 0:
            continue
        print(f"Result {i+1} (score={distance:.2f}):\n{chunks[idx][:500]}\n{'-'*80}")


In [None]:
# --- Step 7: Try some sample questions ---
# Example queries about the downloaded papers, used to exercise the retriever.
sample_questions = [
    "What is RAG and how does it work?",
    "What is the difference between RAG-Sequence and RAG-Token?",
    "How does RAG use non-parametric memory?",
    "What tasks were used to evaluate RAG?",
    "How is Dense Passage Retrieval (DPR) used in RAG?",
    "What is the advantage of hybrid models over purely parametric models?",
    "What decoding strategies are used in RAG?",
    "How does RAG compare to T5 and BART?",
    "What datasets were used to benchmark RAG models?",
    "Can RAG models be updated without retraining?"
]

# Print the two nearest chunks for every sample question.
for q in sample_questions:
    search(q, k=2)


In [None]:

# Securely prompt for your OpenAI API key
# Only prompt when the key is not already in the environment; getpass reads
# the key without echoing it back to the screen.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

# Hand the key to the openai module so the chat completion call can use it.
openai.api_key = os.getenv("OPENAI_API_KEY")

def rag_answer(question, k=2, model_name="gpt-4o"):
    """Answer *question* with an OpenAI chat model grounded in retrieved chunks.

    The question is embedded with the module-level ``model``, the nearest
    chunks are fetched from the module-level ``index``, and those chunks are
    placed in the prompt as the only context the chat model may use. The
    question and the model's answer are printed.

    Args:
        question: The question to answer.
        k: Number of chunks to retrieve as context.
        model_name: Name of the OpenAI chat model to call.

    Returns:
        None. The question and answer are written to stdout.
    """
    query_embedding = model.encode([question], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, k)
    # FAISS pads the result with -1 labels when fewer than k vectors are
    # available. chunks[-1] would inject the last chunk as if it had been
    # retrieved, so drop the padding before building the context.
    retrieved_context = "\n\n".join([chunks[i] for i in indices[0] if i >= 0])

    prompt = f"Context:\n{retrieved_context}\n\nAnswer this Question based only on the provided context: {question}\nAnswer:"

    response = openai.chat.completions.create(
        model=model_name,
        messages=[
            {"role": "system", "content": "You are a helpful assistant that answers questions based only on the provided context."},
            {"role": "user", "content": prompt}
        ],
        # temperature=0 requests the least random sampling from the model.
        temperature=0
    )

    answer = response.choices[0].message.content
    print(f"\nQuestion: {question}\nAnswer: {answer}")

# 🔍 Try it out with a real question
# Runs the full pipeline once: retrieves context from the FAISS index and
# sends a live request to the OpenAI chat completions API.
rag_answer("What is retrieval augmented generation?")
