
# Minimal RAG: chunk → embed → index (FAISS) → retrieve → answer

**What you'll do**
1. Load sample text files from `sample_data/`
2. Chunk the text
3. Compute embeddings with `sentence-transformers`
4. Index vectors in FAISS and retrieve top-k chunks for a query
5. Compose an answer (with or without an LLM)

> Retrieval runs locally with no API key (the embedding model is downloaded on first use); an LLM-written answer is added if `OPENAI_API_KEY` is set.


In [None]:

# If running in fresh envs, you can install these here:
# %pip install -q sentence-transformers faiss-cpu numpy pandas python-dotenv openai

import os, glob, re
import numpy as np
from dotenv import load_dotenv

load_dotenv()

# Embeddings
from sentence_transformers import SentenceTransformer
import faiss

# Optional LLM
# The LLM step is opt-in: it is enabled only when an API key is present in the
# environment (load_dotenv() above may have populated it from a .env file).
USE_OPENAI = bool(os.getenv("OPENAI_API_KEY"))

# Default to "no client"; later cells test `client is None` to choose a path.
client = None
if USE_OPENAI:
    # Imported lazily so the notebook still runs without the openai package
    # when no key is configured.
    from openai import OpenAI
    client = OpenAI()

# 1) Load docs
def load_docs(path="sample_data/*.txt"):
    """Read every file matching a glob pattern and return their contents.

    Args:
        path: Glob pattern selecting the files to load. Relative patterns
            are resolved against the current working directory.

    Returns:
        A list with one string per matched file, ordered by file path.
        The list is empty when nothing matches the pattern.
    """
    docs = []
    # glob.glob() makes no promise about ordering; sorting keeps document
    # order (and everything derived from it downstream) reproducible.
    for p in sorted(glob.glob(path)):
        with open(p, "r", encoding="utf-8") as f:
            docs.append(f.read())
    return docs

docs = load_docs()
print(f"Loaded {len(docs)} docs.")
if not docs:
    # With no documents the later chunking/embedding cells have nothing to
    # work on, so point at the likely cause here instead of failing later.
    print("Warning: no documents found; check that sample_data/ exists "
          "relative to the current working directory and contains .txt files.")


In [None]:

# 2) Chunking (simple paragraph/sentence splits)
def simple_chunk(text, max_chars=500):
    """Split text into sentence-aligned chunks of at most ``max_chars`` characters.

    The text is first divided into paragraphs on blank lines (two consecutive
    newlines). Each paragraph is then split into sentences after ``.``, ``!``
    or ``?`` and consecutive sentences are packed greedily, joined by single
    spaces, into a chunk. Chunks never span a paragraph boundary.

    Args:
        text: Raw document text.
        max_chars: Upper bound on the length of a chunk, counting the spaces
            used to join sentences.

    Returns:
        A list of non-empty chunk strings in document order. A single
        sentence longer than ``max_chars`` is kept whole as its own chunk
        rather than being cut mid-sentence. Empty or whitespace-only input
        yields an empty list.
    """
    chunks = []
    for para in text.split("\n\n"):
        para = para.strip()
        if not para:
            continue
        buf = ""
        for sent in re.split(r'(?<=[.!?])\s+', para):
            if not buf:
                # First sentence of a chunk is always accepted, even if it
                # is longer than max_chars on its own.
                buf = sent
            elif len(buf) + 1 + len(sent) <= max_chars:
                # The "+ 1" accounts for the joining space, so the packed
                # chunk can never exceed max_chars.
                buf = f"{buf} {sent}"
            else:
                chunks.append(buf)
                buf = sent
        # para is non-empty, so buf holds at least one sentence here.
        chunks.append(buf)
    return chunks

# Chunk every document with a 400-character budget and pool the pieces into
# one flat list, keeping document order.
all_chunks = [
    chunk
    for d in docs
    for chunk in simple_chunk(d, max_chars=400)
]

print(f"Total chunks: {len(all_chunks)}")


In [None]:

# 3) Embeddings
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One vector per chunk, in all_chunks order. Normalisation is requested at
# encode time, and the result is cast to float32 before it is handed to the
# FAISS index.
embeddings = model.encode(
    all_chunks,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype("float32")
print(embeddings.shape)


In [None]:

# 4) FAISS index (Inner Product works with normalized vectors for cosine similarity)
# Vector width, read from the second axis of the embedding matrix.
dim = embeddings.shape[1]
# Inner-product index: the embeddings were normalised when they were encoded,
# so the scores it returns can be read as cosine similarities.
index = faiss.IndexFlatIP(dim)
# Vectors are added in all_chunks order; retrieve() relies on this by using
# the ids returned from a search as positions in all_chunks.
index.add(embeddings)
print("Indexed vectors:", index.ntotal)


In [None]:

# 5) Retrieval
def retrieve(query, k=5):
    """Return up to ``k`` indexed chunks most similar to ``query``.

    The query is embedded with the module-level ``model`` (normalised, cast
    to float32, the same way the chunks were) and searched against the
    module-level FAISS ``index``; hit ids are mapped back to text through
    ``all_chunks``.

    Args:
        query: Natural-language query string.
        k: Maximum number of chunks to return.

    Returns:
        A list of ``(score, chunk_text)`` tuples in the order returned by
        ``index.search``. The list holds fewer than ``k`` entries when the
        index contains fewer than ``k`` vectors, and is empty when the index
        is empty or ``k`` is not positive.
    """
    # Asking FAISS for more neighbours than it has vectors makes it pad the
    # result with id -1; used as a list index that would silently return the
    # last chunk. Clamp k up front and drop any negative id defensively.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    q_emb = model.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")
    scores, idxs = index.search(q_emb, k)
    return [
        (float(score), all_chunks[int(idx)])
        for score, idx in zip(scores[0], idxs[0])
        if idx >= 0
    ]

# Example question used for both the retrieval demo and the answer cell.
query = "What is RAG and why is it useful?"
# `hits` is a list of (score, chunk_text) pairs; the answer cell reuses it.
hits = retrieve(query, k=4)
# Show each retrieved chunk with its similarity score to three decimals.
for s, ch in hits:
    print(f"\n[score={s:.3f}] {ch}")


In [None]:

# 6) Compose final answer (with/without LLM)
# Retrieved chunk texts, separated by blank lines, form the grounding context.
context = "\n\n".join(ch for _, ch in hits)

if client is not None:
    # LLM path: hand the question and the retrieved context to the chat model.
    prompt = f"""Use the context to answer the question accurately.
If something isn't in the context, say so briefly.

Question: {query}

Context:
{context}
"""
    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You answer strictly based on the given context."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.2,
    )
    print("\n=== LLM Answer ===\n")
    print(resp.choices[0].message.content)
else:
    # No-LLM path: list the retrieved chunks as bullet points, cutting each
    # one to 180 characters and marking the cut with an ellipsis.
    print("\n=== Template Answer (no LLM) ===")
    print("Question:", query)
    print("Key points from retrieved context:")
    print("-", "\n- ".join([ch if len(ch) <= 180 else ch[:180] + "..." for _, ch in hits]))



### Experiments to try next
- Change `max_chars` in chunking and compare retrieval quality.
- Try different k values in `retrieve()`.
- Swap embedding model (e.g., `all-mpnet-base-v2`) and compare.
- Add a `Streamlit` UI: a textbox for the query that shows the retrieved chunks and the answer.
