Minimal open-source RAG retriever:
- chunk text
- embed with sentence-transformers (bi-encoder)
- index with FAISS
- retrieve top-N, re-rank with cross-encoder

In [4]:
!pip install -U \
    torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

!pip install -U \
    sentence-transformers \
    transformers \
    langchain \
    tiktoken

Looking in indexes: https://download.pytorch.org/whl/cu121
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-linux_x86_64.whl (780.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.4/780.4 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m87.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading https://download.pytorch.org/whl/cu121/nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [5]:
!pip install faiss-gpu-cu12[fix-cuda]



In [1]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np
import math
import json
import os

In [2]:
# ---- Helpers: simple chunker ----
def chunk_text(text, max_tokens=500, overlap_tokens=100, tokenizer=None):
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    This is a simple word-based chunker: "tokens" here are the items produced by
    ``text.split()``, not model tokens. Replace with tiktoken for exact token counts.

    Args:
        text: Input string to chunk.
        max_tokens: Maximum number of words per chunk. Must be >= 1.
        overlap_tokens: Number of words shared between consecutive chunks.
            Must satisfy ``0 <= overlap_tokens < max_tokens``.
        tokenizer: Accepted for interface compatibility; currently unused.

    Returns:
        A list of chunk strings (words joined by single spaces). Empty or
        whitespace-only input yields an empty list.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap_tokens`` is out of range.
            Without this check a non-positive step would never terminate, and a
            negative overlap would silently skip words between chunks.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be >= 1")
    if not 0 <= overlap_tokens < max_tokens:
        raise ValueError("overlap_tokens must satisfy 0 <= overlap_tokens < max_tokens")

    words = text.split()
    step = max_tokens - overlap_tokens
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        # Stop once a chunk reaches the end of the text; stepping further would
        # only emit a chunk made entirely of words already in this one.
        if start + max_tokens >= len(words):
            break
    return chunks

In [3]:
# ---- Example documents ----
# Toy corpus: each record carries a document id and its raw text.
docs = [
    dict(id=doc_id, text=doc_text)
    for doc_id, doc_text in (
        ("doc1", "Artificial intelligence is the simulation of human intelligence processes by machines."),
        ("doc2", "FAISS is an open source library for efficient similarity search and clustering of dense vectors."),
        ("doc3", "Milvus is a vector database that helps store and retrieve embeddings at scale."),
    )
]

In [4]:
# 1) Chunk docs
# `chunks` holds the chunk strings; `meta` holds one parallel record per chunk
# (source doc id, a per-document chunk id, and the chunk text itself).
chunks = []
meta = []
for doc in docs:
    pieces = chunk_text(doc["text"], max_tokens=50, overlap_tokens=10)
    chunks.extend(pieces)
    meta.extend(
        {"doc_id": doc["id"], "chunk_id": f"{doc['id']}_c{pos}", "text": piece}
        for pos, piece in enumerate(pieces)
    )

In [10]:

# 2) Bi-encoder embedding model.
#    all-MiniLM-L6-v2 is the fast option; swap to all-mpnet-base-v2 for quality.
bi_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(bi_model_name)
embeddings = embedder.encode(
    [entry["text"] for entry in meta],
    show_progress_bar=True,
    convert_to_numpy=True,
)
# Embedding width (second axis), used later to size the FAISS index.
dim = embeddings.shape[1]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Sanity check: one embedding row per entry in `meta`; the second axis is `dim`.
embeddings.shape

(3, 384)

In [13]:
# 3) Build FAISS index. First L2-normalize the embeddings so that the
#    inner-product search used by the index is equivalent to cosine similarity.
#    NOTE: the return value is not used, so this relies on normalize_L2
#    updating `embeddings` in place.
faiss.normalize_L2(embeddings)

In [15]:
index = faiss.IndexFlatIP(dim)  # inner product on normalized vectors is cosine sim
# Add every chunk embedding; retrieve() maps the indices returned by the search
# back to chunks by position through `meta_map`.
index.add(embeddings)

In [16]:
# Positional lookup table: row position in `meta` -> that chunk's metadata record.
meta_map = dict(enumerate(meta))

In [20]:
def retrieve(query, top_k=5):
    """Return the indexed chunks most similar to ``query``.

    Relies on the module-level ``embedder``, ``faiss``, ``index`` and ``meta_map``.

    Args:
        query: Query string to embed and search for.
        top_k: Number of neighbours requested from the index.

    Returns:
        A ``(results, ids)`` tuple. ``results`` is a list of
        ``{"score": float, "meta": <meta_map entry>}`` dicts in the order the
        index returned them, with ``-1`` placeholder hits skipped. ``ids`` is the
        first row of the raw index array from the search (placeholders included).
    """
    # Embed the query and normalize it in place, matching the indexed vectors.
    query_vec = embedder.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(query_vec)

    scores, ids = index.search(query_vec, top_k)

    # Debug output of the raw search result.
    print("Top-k distances:", scores)
    print("Top-k indices:", ids)

    # An index of -1 marks an empty slot; those are dropped.
    hits = [
        {"score": float(hit_score), "meta": meta_map[hit_id]}
        for hit_score, hit_id in zip(scores[0], ids[0])
        if hit_id != -1
    ]
    return hits, ids[0]

In [21]:
# 5) Re-rank with cross-encoder
# The model is loaded once here; rerank() below reads this module-level instance.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
def rerank(query, candidates):
    """Re-score candidates against ``query`` with the cross-encoder, best first.

    Relies on the module-level ``reranker``.

    Args:
        query: Query string.
        candidates: Result dicts in the shape produced by ``retrieve``; each
            must provide ``candidate["meta"]["text"]``.

    Returns:
        A new list of shallow copies of the candidates, each with an added
        ``"rerank_score"`` float (higher is better), sorted by that score in
        descending order. The input dicts are left unmodified, so a candidate
        list displayed earlier in the notebook does not silently change.
        Returns ``[]`` when there are no candidates.
    """
    candidates = list(candidates)
    if not candidates:
        # Nothing to score; avoid calling the model with an empty batch.
        return []

    pairs = [[query, c["meta"]["text"]] for c in candidates]
    scores = reranker.predict(pairs)  # higher => better

    rescored = [
        {**c, "rerank_score": float(s)}
        for c, s in zip(candidates, scores)
    ]
    # list.sort is stable, so ties keep their original retrieval order.
    rescored.sort(key=lambda c: c["rerank_score"], reverse=True)
    return rescored

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [22]:
# Example usage
# top_k=10 is larger than the number of indexed chunks, so the search pads its
# result with -1 indices; retrieve() skips those. The second return value (the
# raw index row) is not needed here and is discarded.
query = "What is FAISS?"
cands, _ = retrieve(query, top_k=10)

Top-k distances: [[ 2.7626631e-01  9.0487942e-02  6.0679823e-02 -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38 -3.4028235e+38 -3.4028235e+38
  -3.4028235e+38 -3.4028235e+38]]
Top-k indices: [[ 1  2  0 -1 -1 -1 -1 -1 -1 -1]]


In [23]:
# Inspect the bi-encoder candidates, in the order the index returned them.
cands

[{'score': 0.27626630663871765,
  'meta': {'doc_id': 'doc2',
   'chunk_id': 'doc2_c0',
   'text': 'FAISS is an open source library for efficient similarity search and clustering of dense vectors.'}},
 {'score': 0.09048794209957123,
  'meta': {'doc_id': 'doc3',
   'chunk_id': 'doc3_c0',
   'text': 'Milvus is a vector database that helps store and retrieve embeddings at scale.'}},
 {'score': 0.06067982316017151,
  'meta': {'doc_id': 'doc1',
   'chunk_id': 'doc1_c0',
   'text': 'Artificial intelligence is the simulation of human intelligence processes by machines.'}}]

In [24]:
# Re-score the retrieved candidates with the cross-encoder and sort them best-first.
reranked = rerank(query, cands)

In [25]:
# Inspect the re-ranked candidates; each entry now carries a "rerank_score".
reranked

[{'score': 0.27626630663871765,
  'meta': {'doc_id': 'doc2',
   'chunk_id': 'doc2_c0',
   'text': 'FAISS is an open source library for efficient similarity search and clustering of dense vectors.'},
  'rerank_score': 10.190886497497559},
 {'score': 0.06067982316017151,
  'meta': {'doc_id': 'doc1',
   'chunk_id': 'doc1_c0',
   'text': 'Artificial intelligence is the simulation of human intelligence processes by machines.'},
  'rerank_score': -8.95848274230957},
 {'score': 0.09048794209957123,
  'meta': {'doc_id': 'doc3',
   'chunk_id': 'doc3_c0',
   'text': 'Milvus is a vector database that helps store and retrieve embeddings at scale.'},
  'rerank_score': -9.310111045837402}]

In [26]:
# Print a short report for (at most) the five best re-ranked hits.
for hit in reranked[:5]:
    hit_meta = hit["meta"]
    print("---")
    print("doc:", hit_meta["doc_id"], hit_meta["chunk_id"])
    print("text:", hit_meta["text"])
    print("bi_score:", hit["score"], "rerank_score:", hit["rerank_score"])

---
doc: doc2 doc2_c0
text: FAISS is an open source library for efficient similarity search and clustering of dense vectors.
bi_score: 0.27626630663871765 rerank_score: 10.190886497497559
---
doc: doc1 doc1_c0
text: Artificial intelligence is the simulation of human intelligence processes by machines.
bi_score: 0.06067982316017151 rerank_score: -8.95848274230957
---
doc: doc3 doc3_c0
text: Milvus is a vector database that helps store and retrieve embeddings at scale.
bi_score: 0.09048794209957123 rerank_score: -9.310111045837402
