In [29]:
print("s")

s


In [30]:
# Installs (safe to re-run). Jupyter's %pip keeps the same interpreter.
%pip install -qU pinecone python-dotenv langchain langchain-core \
               langchain-text-splitters langchain-pinecone tqdm pandas numpy


Note: you may need to restart the kernel to use updated packages.


In [49]:
# --- Cell 1: Setup & Readiness ------------------------------------------------

# Imports
import os, sys, json, pathlib, importlib
from dotenv import load_dotenv

# Repo roots & data dir
ROOT = pathlib.Path.cwd()
ENV_PATH = ROOT / ".env"

DATA_DIR = pathlib.Path(r"e:/capitalone/CapitalOne-hack/fasal-setu-ai/py/ai_engine/tools/ragdata")

DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load .env
if ENV_PATH.exists():
    load_dotenv(ENV_PATH)
else:
    print(f"WARNING: .env not found at {ENV_PATH}")

# Env vars (supports both legacy ENV and serverless region/cloud)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV     = os.getenv("PINECONE_ENV")                 # legacy (optional)
PINECONE_CLOUD   = os.getenv("PINECONE_CLOUD", "aws")        # serverless (optional)
PINECONE_REGION  = os.getenv("PINECONE_REGION", PINECONE_ENV or "")
PINECONE_INDEX   = os.getenv("PINECONE_INDEX", "rag-llm1")
PINECONE_NS      = os.getenv("PINECONE_NAMESPACE", "default")
EMBED_MODEL      = os.getenv("EMBED_MODEL", "llama-text-embed-v2")

# Helper to get versions
def _v(pkg):
    try:
        return importlib.import_module(pkg).__version__
    except Exception:
        return "not installed"

# Readiness checks
issues, checks = [], []

# Package versions
checks += [
    f"pinecone: {_v('pinecone')}",
    f"langchain: {_v('langchain')}",
    f"langchain_text_splitters: {_v('langchain_text_splitters')}",
    f"langchain_pinecone: {_v('langchain_pinecone')}",
    f"python-dotenv: {_v('dotenv')}",
]

# Secrets
if PINECONE_API_KEY:
    checks.append("✅ PINECONE_API_KEY set")
else:
    issues.append("❌ Set PINECONE_API_KEY in .env")

# Location (either legacy ENV or serverless region)
if PINECONE_REGION:
    checks.append(f"✅ Pinecone location OK → region={PINECONE_REGION}, cloud={PINECONE_CLOUD}")
else:
    issues.append("❌ Set PINECONE_REGION (and optional PINECONE_CLOUD) or legacy PINECONE_ENV")

checks += [
    f"✅ PINECONE_INDEX = {PINECONE_INDEX}",
    f"✅ NAMESPACE = {PINECONE_NS}",
    f"✅ EMBED_MODEL = {EMBED_MODEL}",
]

# Data dir & file counts
txts  = list(DATA_DIR.rglob("*.txt"))
jsons = list(DATA_DIR.rglob("*.json"))
checks.append(f"✅ Data dir exists: {DATA_DIR}")
checks.append(f"Found {len(txts)} .txt and {len(jsons)} .json files.")

# Pinecone SDK major version hint (recommend 7+)
try:
    import pinecone
    pc_ver = pinecone.__version__
    major = int(pc_ver.split(".")[0])
    if major < 7:
        issues.append("⚠️ pinecone SDK < 7.x detected — consider: %pip install -U pinecone")
except Exception:
    issues.append("⚠️ Could not read pinecone version — ensure package installed")

print("\n=== RAG Tool Readiness ===")
for c in checks: print("•", c)
if issues:
    print("\nIssues:")
    for i in issues: print(" -", i)
else:
    print("\nAll good. Proceed to index & embedding setup.")
# ----------------------------------------------------------------------------- 



=== RAG Tool Readiness ===
• pinecone: 7.3.0
• langchain: 0.3.27
• langchain_text_splitters: not installed
• langchain_pinecone: not installed
• python-dotenv: not installed
• ✅ PINECONE_API_KEY set
• ✅ Pinecone location OK → region=us-east-1-aws, cloud=aws
• ✅ PINECONE_INDEX = capitalone
• ✅ NAMESPACE = default
• ✅ EMBED_MODEL = llama-text-embed-v2
• ✅ Data dir exists: e:\capitalone\CapitalOne-hack\fasal-setu-ai\py\ai_engine\tools\ragdata
• Found 2 .txt and 0 .json files.

All good. Proceed to index & embedding setup.


In [50]:
# --- Cell 2 (fixed): Pinecone init + hosted embeddings + auto index creation --
from typing import List, Dict, Any
from pinecone import Pinecone, ServerlessSpec

# 1) Init Pinecone client
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY missing. Add it to your .env and re-run Cell 1.")

pc = Pinecone(api_key=PINECONE_API_KEY)

# 2) Robust wrapper around Pinecone Hosted Embeddings (v7+ friendly)
def _as_vectors(embed_out):
    """
    Accepts either:
      - pinecone.inference.EmbeddingsList (v7+)
      - dict with 'data' (older shapes)
      - list-like of rows
    Returns: List[List[float]]
    """
    # Prefer attribute `.data` if present (EmbeddingsList)
    data = getattr(embed_out, "data", None)
    if data is None:
        # Maybe it's a dict with 'data'
        if isinstance(embed_out, dict) and "data" in embed_out:
            data = embed_out["data"]
        else:
            # Otherwise assume it's already a list-like
            data = embed_out

    vectors = []
    for row in data:
        # v7 row objects expose `.values`; dicts expose ['values']
        if hasattr(row, "values"):
            vectors.append(row.values)
        elif isinstance(row, dict) and "values" in row:
            vectors.append(row["values"])
        else:
            raise TypeError(f"Unexpected embedding row type: {type(row)}")
    return vectors

def _embed_with_pinecone(texts: List[str],
                         model: str = EMBED_MODEL,
                         input_type: str = "passage",
                         truncate: str = "END") -> List[List[float]]:
    if not texts:
        return []
    out = pc.inference.embed(
        model=model,
        inputs=texts,
        parameters={"input_type": input_type, "truncate": truncate},
    )
    return _as_vectors(out)

# 3) Infer embedding dimension (before creating the index)
try:
    _probe_vec = _embed_with_pinecone(["dimension-probe"])[0]
    EMBED_DIM = len(_probe_vec)
except Exception as e:
    # Helpful debug if it ever breaks again
    print("Embedding probe failed with type:", type(e).__name__, "→", e)
    raise RuntimeError(
        f"Failed to run hosted embeddings for model={EMBED_MODEL}. "
        f"Check API key, model name, and account access."
    )

print(f"✅ Hosted embedding ready: model={EMBED_MODEL}, dim={EMBED_DIM}")

# 4) Create index if missing (serverless)
existing = {ix["name"] for ix in pc.list_indexes()}
if PINECONE_INDEX not in existing:
    print(f"ℹ️ Creating Pinecone index '{PINECONE_INDEX}' (cosine, dim={EMBED_DIM}) "
          f"on {PINECONE_CLOUD}/{PINECONE_REGION} ...")
    pc.create_index(
        name=PINECONE_INDEX,
        dimension=EMBED_DIM,
        metric="cosine",
        spec=ServerlessSpec(cloud=PINECONE_CLOUD, region=PINECONE_REGION),
    )
else:
    print(f"✅ Pinecone index '{PINECONE_INDEX}' already exists.")

# 5) Get an index handle
index = pc.Index(PINECONE_INDEX)

# 6) Small health check
try:
    stats = index.describe_index_stats()
    print("✅ Index stats:", {k: v for k, v in stats.items() if k in ("dimension", "namespaces", "total_vector_count")})
except Exception as e:
    print("⚠️ Could not fetch index stats:", e)

# Public helpers
def embed_texts(texts: List[str]) -> List[List[float]]:
    return _embed_with_pinecone(texts, model=EMBED_MODEL)

def embedding_info() -> Dict[str, Any]:
    return {"model": EMBED_MODEL, "dimension": EMBED_DIM, "metric": "cosine"}

print("Ready: Pinecone client + embeddings + index handle.")
# -----------------------------------------------------------------------------


✅ Hosted embedding ready: model=llama-text-embed-v2, dim=1024
✅ Pinecone index 'capitalone' already exists.
⚠️ Could not fetch index stats: 'NoneType' object is not callable
Ready: Pinecone client + embeddings + index handle.


In [51]:
# --- Cell 3: Loaders (.txt/.json) + robust chunking --------------------------
from typing import Iterable, Dict, Any, List, Tuple
from langchain_text_splitters import RecursiveCharacterTextSplitter
import hashlib
import pandas as pd

# ---------- helpers ----------
def _hash(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()[:12]

def _norm_source(path: pathlib.Path, extra: str = "") -> str:
    rel = path.relative_to(ROOT)
    return f"{rel.as_posix()}{('::' + extra) if extra else ''}"

# ---------- JSON flattening ----------
def _flatten_json(obj: Any, prefix: str = "") -> Iterable[Tuple[str, str]]:
    """
    Yields (path, text_value) pairs.
    - Dicts → keys appended with '/'.
    - Lists → indices appended with '[i]'.
    - Scalars → returned as strings.
    Non-text scalars are stringified.
    """
    if isinstance(obj, dict):
        for k, v in obj.items():
            newp = f"{prefix}/{k}" if prefix else k
            yield from _flatten_json(v, newp)
    elif isinstance(obj, list):
        for i, v in enumerate(obj):
            newp = f"{prefix}[{i}]" if prefix else f"[{i}]"
            yield from _flatten_json(v, newp)
    else:
        # scalar
        text = "" if obj is None else str(obj)
        if text.strip():
            yield prefix, text

# ---------- corpus loading ----------
def load_corpus(data_dir: pathlib.Path = DATA_DIR) -> List[Dict[str, Any]]:
    """
    Returns a list of raw docs:
    {
      'doc_id': str,                  # stable id seed (before chunking)
      'text': str,                    # full text (or json slice)
      'source_stamp': str,            # e.g., py/ai_engine/tools/data/foo.json::crops[2]/name
      'meta': {'path': str, 'kind': 'txt'|'json', 'json_path': str|None}
    }
    """
    docs: List[Dict[str, Any]] = []

    # .txt files
    for path in data_dir.rglob("*.txt"):
        try:
            text = path.read_text(encoding="utf-8", errors="ignore")
        except Exception:
            text = path.read_text(errors="ignore")
        src = _norm_source(path)
        if text.strip():
            docs.append({
                "doc_id": _hash(src),
                "text": text,
                "source_stamp": src,
                "meta": {"path": src, "kind": "txt", "json_path": None}
            })

    # .json files
    for path in data_dir.rglob("*.json"):
        try:
            obj = json.loads(path.read_text(encoding="utf-8", errors="ignore"))
        except Exception as e:
            print(f"⚠️ Failed to parse JSON: {path}: {e}")
            continue
        base = _norm_source(path)
        for jpath, text in _flatten_json(obj):
            src = _norm_source(path, jpath)
            docs.append({
                "doc_id": _hash(src),
                "text": text,
                "source_stamp": src,
                "meta": {"path": base, "kind": "json", "json_path": jpath}
            })

    print(f"Loaded {len(docs)} raw doc items "
          f"({sum(1 for d in docs if d['meta']['kind']=='txt')} txt, "
          f"{sum(1 for d in docs if d['meta']['kind']=='json')} json-slices).")
    return docs

# ---------- chunking ----------
DEFAULT_CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1000"))
DEFAULT_CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "120"))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=DEFAULT_CHUNK_SIZE,
    chunk_overlap=DEFAULT_CHUNK_OVERLAP,
    length_function=len,
    separators=["\n\n", "\n", " ", ""],
)

def chunk_documents(raw_docs: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Splits raw docs into chunk records:
    {
      'id': str,               # unique per chunk
      'text': str,
      'source_stamp': str,     # preserved
      'metadata': {...}        # ready for Pinecone
    }
    """
    chunks: List[Dict[str, Any]] = []
    for rd in raw_docs:
        parts = splitter.split_text(rd["text"])
        for idx, part in enumerate(parts):
            if not part.strip():
                continue
            cid_seed = f"{rd['doc_id']}::{idx}"
            cid = _hash(cid_seed)
            meta = {
                "source_stamp": rd["source_stamp"],
                "path": rd["meta"]["path"],
                "kind": rd["meta"]["kind"],
                "json_path": rd["meta"]["json_path"],
                "chunk_index": idx,
                "chunk_size": DEFAULT_CHUNK_SIZE,
                "chunk_overlap": DEFAULT_CHUNK_OVERLAP,
                "embed_model": EMBED_MODEL,
            }
            chunks.append({
                "id": cid,
                "text": part,
                "source_stamp": rd["source_stamp"],
                "metadata": meta,
            })
    print(f"Chunked into {len(chunks)} chunks "
          f"(size≈{DEFAULT_CHUNK_SIZE}, overlap={DEFAULT_CHUNK_OVERLAP}).")
    return chunks

# Quick preview utility (optional)
def preview_chunks(chunks: List[Dict[str, Any]], n: int = 5):
    rows = [{
        "id": c["id"],
        "len": len(c["text"]),
        "source_stamp": c["source_stamp"],
        "chunk_index": c["metadata"]["chunk_index"],
        "kind": c["metadata"]["kind"],
    } for c in chunks[:n]]
    try:
        display(pd.DataFrame(rows))
    except Exception:
        print(rows)

print("Loaders & chunker ready. Next cell will embed + upsert in batches.")
# -----------------------------------------------------------------------------


Loaders & chunker ready. Next cell will embed + upsert in batches.


In [52]:
# --- Cell 4: Embed & upsert to Pinecone (batched) + rebuild helpers ----------
from typing import List, Dict, Any, Iterable, Optional
from tqdm import tqdm
import time
import math
import traceback

BATCH_SIZE = int(os.getenv("BATCH_SIZE", "64"))
UPSERT_NAMESPACE = PINECONE_NS

def _batched(iterable: Iterable[Any], n: int) -> Iterable[List[Any]]:
    batch = []
    for x in iterable:
        batch.append(x)
        if len(batch) >= n:
            yield batch
            batch = []
    if batch:
        yield batch

def _prepare_vectors(chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Turn chunk records into Pinecone vectors with embeddings."""
    texts = [c["text"] for c in chunks]
    embs = embed_texts(texts)
    vectors = []
    for c, v in zip(chunks, embs):
        # store text + source + small meta for downstream retrieval
        meta = dict(c["metadata"])
        meta.update({
            "text": c["text"],
            "source_stamp": c["source_stamp"],
        })
        vectors.append({
            "id": c["id"],
            "values": v,
            "metadata": meta,
        })
    return vectors

def upsert_chunks(chunks: List[Dict[str, Any]],
                  index_handle,
                  namespace: Optional[str] = None,
                  batch_size: int = BATCH_SIZE,
                  max_retries: int = 5) -> int:
    """Embed and upsert chunks with retry/backoff. Returns total upserted."""
    total = 0
    namespace = namespace or UPSERT_NAMESPACE
    for batch in tqdm(list(_batched(chunks, batch_size)), desc="Upserting"):
        # 1) embed
        vectors = _prepare_vectors(batch)

        # 2) upsert with retries
        attempt, backoff = 0, 1.0
        while True:
            try:
                index_handle.upsert(vectors=vectors, namespace=namespace)
                total += len(vectors)
                break
            except Exception as e:
                attempt += 1
                if attempt > max_retries:
                    print("❌ Upsert failed after retries. Last error:\n", e)
                    traceback.print_exc(limit=1)
                    break
                time.sleep(backoff)
                backoff *= 2
    return total

def build_index(data_dir: pathlib.Path = DATA_DIR,
                namespace: Optional[str] = None,
                batch_size: int = BATCH_SIZE) -> Dict[str, Any]:
    """Full pipeline: load → chunk → embed+upsert. Returns summary."""
    ns = namespace or UPSERT_NAMESPACE
    print(f"Building index '{PINECONE_INDEX}' namespace='{ns}' from: {data_dir}")

    raw = load_corpus(data_dir)
    chunks = chunk_documents(raw)
    count = upsert_chunks(chunks, index, namespace=ns, batch_size=batch_size)

    summary = {
        "index": PINECONE_INDEX,
        "namespace": ns,
        "files_seen": len({d['meta']['path'] for d in raw}),
        "raw_items": len(raw),
        "chunks_upserted": count,
        "embed_model": EMBED_MODEL,
        "dim": embedding_info()["dimension"],
    }
    print("✅ Build complete:", summary)
    return summary

def wipe_namespace(namespace: Optional[str] = None):
    """Deletes all vectors in a namespace (careful!)."""
    ns = namespace or UPSERT_NAMESPACE
    confirm = True  # flip to a prompt in interactive use if you prefer
    if confirm:
        print(f"⚠️ Deleting all vectors in index='{PINECONE_INDEX}', namespace='{ns}' ...")
        index.delete(delete_all=True, namespace=ns)
        print("✅ Namespace wiped.")

print("Embed & upsert utilities ready. Next cell will add the search API + LangChain tool wrapper.")
# -----------------------------------------------------------------------------


Embed & upsert utilities ready. Next cell will add the search API + LangChain tool wrapper.


In [53]:
# --- Cell 4a: Pinecone metadata sanitizer (fix nulls like json_path=None) ---
from typing import Dict, Any, List

def _pc_clean_meta(meta: Dict[str, Any]) -> Dict[str, Any]:
    clean: Dict[str, Any] = {}
    for k, v in meta.items():
        if v is None:
            # Drop nulls entirely (or set "" if you prefer)
            continue
        if isinstance(v, (str, int, float, bool)):
            clean[k] = v
        elif isinstance(v, list):
            # Pinecone allows list of strings — coerce items to str, skip None
            clean[k] = [str(x) for x in v if x is not None]
        else:
            # Fallback: stringify complex types
            clean[k] = str(v)
    return clean

# Override _prepare_vectors to use the sanitizer
def _prepare_vectors(chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    texts = [c["text"] for c in chunks]
    embs = embed_texts(texts)
    vectors = []
    for c, v in zip(chunks, embs):
        meta = dict(c["metadata"])
        meta.update({
            "text": c["text"],
            "source_stamp": c["source_stamp"],
        })
        meta = _pc_clean_meta(meta)
        vectors.append({
            "id": c["id"],
            "values": v,
            "metadata": meta,
        })
    return vectors

print("✅ Patched _prepare_vectors with Pinecone-safe metadata (nulls removed).")


✅ Patched _prepare_vectors with Pinecone-safe metadata (nulls removed).


In [54]:
# --- Cell 5: Search (top-k) + LangChain tool wrapper -------------------------
from typing import Dict, Any, List, Optional, Union
from langchain.tools import Tool

DEFAULT_TOP_K = int(os.getenv("TOP_K", "5"))

def _normalize_match(m: Dict[str, Any]) -> Dict[str, Any]:
    """
    Pinecone v7 .query() returns matches with fields:
      id, score, metadata (and maybe values if requested).
    We map to the contract: {text, source_stamp, score, id}
    """
    meta = m.get("metadata", {}) or {}
    return {
        "id": m.get("id"),
        "score": m.get("score"),
        "text": meta.get("text", ""),
        "source_stamp": meta.get("source_stamp", meta.get("path", "")),
    }

def semantic_search(query: str,
                    top_k: int = DEFAULT_TOP_K,
                    namespace: Optional[str] = None,
                    metadata_filter: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
    """
    Runs an embedding search in Pinecone and returns normalized passages.
    """
    if not query or not query.strip():
        return []

    ns = namespace or UPSERT_NAMESPACE
    vec = embed_texts([query])[0]

    res = index.query(
        namespace=ns,
        vector=vec,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
        filter=metadata_filter or None,
    )

    matches = res.get("matches", []) if isinstance(res, dict) else getattr(res, "matches", [])
    return [_normalize_match(m) for m in matches]

def rag_search(args: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
    """
    LangChain-tool-compatible callable.
    Accepts either:
      - a plain query string, or
      - a dict with keys:
          query (str, required),
          top_k (int, optional),
          namespace (str, optional),
          filter (dict, optional metadata filter)
    Returns:
      {"data": [{"text":..., "source_stamp":..., "score":..., "id":...}, ...]}
    """
    if isinstance(args, str):
        query, top_k, ns, filt = args, DEFAULT_TOP_K, UPSERT_NAMESPACE, None
    else:
        query = args.get("query", "")
        top_k = int(args.get("top_k", DEFAULT_TOP_K))
        ns    = args.get("namespace", UPSERT_NAMESPACE)
        filt  = args.get("filter")

    results = semantic_search(query=query, top_k=top_k, namespace=ns, metadata_filter=filt)
    return {"data": results}

# Register as a LangChain Tool (ready for pipeline use)
rag_search_tool = Tool(
    name="rag_search",
    description=(
        "Semantic search over the Pinecone index. "
        "Args can be a string query or a JSON dict with keys "
        "{query, top_k?, namespace?, filter?}. Returns passages "
        "as {data: [{text, source_stamp, score, id}, ...]}."
    ),
    func=rag_search,
)

print("Search API ready. Next cell will show an end-to-end demo: build → search.")
# -----------------------------------------------------------------------------


Search API ready. Next cell will show an end-to-end demo: build → search.


In [37]:
# --- Cell 5a: MMR reranker + rag_search(rerank=...) --------------------------
from typing import List, Dict, Any, Optional, Union
import numpy as np

def _cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    denom = (np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0:
        return 0.0
    return float(np.dot(a, b) / denom)

def _mmr_rerank(query_vec: np.ndarray,
                cand_vecs: np.ndarray,
                lambda_mult: float = 0.7,
                top_k: int = 5) -> List[int]:
    """
    Classic MMR (Carbonell & Goldstein, 1998):
    Select items that maximize: λ * sim(candidate, query) - (1-λ) * max_sim(candidate, selected)
    Returns indices of selected candidates in order.
    """
    n = cand_vecs.shape[0]
    if n == 0:
        return []
    top_k = min(top_k, n)

    # Precompute similarities to query
    qnorm = np.linalg.norm(query_vec)
    cnorms = np.linalg.norm(cand_vecs, axis=1)
    # Avoid division by zero
    qnorm = 1e-9 if qnorm == 0 else qnorm
    cnorms = np.where(cnorms == 0, 1e-9, cnorms)
    sim_to_query = (cand_vecs @ query_vec) / (cnorms * qnorm)

    selected: List[int] = []
    remaining = set(range(n))

    while len(selected) < top_k and remaining:
        if not selected:
            # pick the best by query similarity
            i = int(np.argmax(sim_to_query))
            selected.append(i)
            remaining.remove(i)
            continue

        # compute diversity term: for each candidate, its max similarity to any already selected
        sel_vecs = cand_vecs[selected]
        # cosine sim with each selected, take max along selected axis
        sims_matrix = (cand_vecs @ sel_vecs.T) / (cnorms[:, None] * np.linalg.norm(sel_vecs, axis=1)[None, :])
        max_sim_to_selected = np.max(sims_matrix, axis=1)

        # score = λ * sim(q) - (1-λ) * max_sim(selected)
        mmr_scores = lambda_mult * sim_to_query - (1.0 - lambda_mult) * max_sim_to_selected

        # ignore already selected by setting to -inf
        mmr_scores[list(selected)] = -np.inf
        # pick the best remaining
        i = int(np.argmax(mmr_scores))
        if i in remaining:
            selected.append(i)
            remaining.remove(i)
        else:
            # fallback: pick any remaining with highest sim_to_query
            i = max(list(remaining), key=lambda j: sim_to_query[j])
            selected.append(i)
            remaining.remove(i)

    return selected

def semantic_search_reranked(query: str,
                             top_k: int = DEFAULT_TOP_K,
                             namespace: Optional[str] = None,
                             metadata_filter: Optional[Dict[str, Any]] = None,
                             fetch_k: Optional[int] = None,
                             lambda_mult: float = 0.7) -> List[Dict[str, Any]]:
    """
    1) Query Pinecone for a larger candidate set (fetch_k).
    2) Embed the query + candidate texts with the hosted model.
    3) Apply MMR to promote relevance and diversity.
    4) Return top_k passages in the new order.
    """
    if not query or not query.strip():
        return []

    ns = namespace or UPSERT_NAMESPACE
    fetch_k = fetch_k or max(top_k * 3, top_k)

    # (a) Initial ANN retrieve
    q_vec = embed_texts([query])[0]
    res = index.query(
        namespace=ns,
        vector=q_vec,
        top_k=fetch_k,
        include_values=False,
        include_metadata=True,
        filter=metadata_filter or None,
    )
    matches = res.get("matches", []) if isinstance(res, dict) else getattr(res, "matches", [])
    if not matches:
        return []

    # (b) Embed candidate texts (use the text stored in metadata)
    cand_texts = [ (m.get("metadata", {}) or {}).get("text", "") for m in matches ]
    # guard empty strings
    pairs = [(i, t) for i, t in enumerate(cand_texts) if t.strip()]
    if not pairs:
        return []

    idxs, texts = zip(*pairs)
    cand_vecs = np.array(embed_texts(list(texts)), dtype=np.float32)

    # (c) Run MMR on the subset with text; map back to original match indices
    selected_local = _mmr_rerank(np.array(q_vec, dtype=np.float32), cand_vecs,
                                 lambda_mult=lambda_mult, top_k=min(top_k, len(idxs)))
    selected_global = [idxs[i] for i in selected_local]

    # (d) Normalize to the desired output schema
    def _norm(m):
        meta = m.get("metadata", {}) or {}
        return {
            "id": m.get("id"),
            "score": m.get("score"),
            "text": meta.get("text", ""),
            "source_stamp": meta.get("source_stamp", meta.get("path", "")),
        }

    return [_norm(matches[i]) for i in selected_global]

# Upgrade rag_search to accept rerank
def rag_search(args: Union[str, Dict[str, Any]]) -> Dict[str, Any]:
    """
    Accepts string or dict.
    Dict keys:
      query (str, required)
      top_k (int, optional)
      namespace (str, optional)
      filter (dict, optional)
      rerank (bool, optional)           # <--- new
      fetch_k (int, optional)           # <--- candidate pool for rerank
      lambda_mult (float, optional)     # <--- MMR lambda (0..1), higher = more relevance
    """
    if isinstance(args, str):
        query = args
        opts = {}
    else:
        query = args.get("query", "")
        opts = args

    top_k       = int(opts.get("top_k", DEFAULT_TOP_K))
    ns          = opts.get("namespace", UPSERT_NAMESPACE)
    filt        = opts.get("filter")
    rerank      = bool(opts.get("rerank", False))
    fetch_k     = opts.get("fetch_k")
    lambda_mult = float(opts.get("lambda_mult", 0.7))

    if rerank:
        data = semantic_search_reranked(query=query, top_k=top_k, namespace=ns,
                                        metadata_filter=filt, fetch_k=fetch_k,
                                        lambda_mult=lambda_mult)
    else:
        data = semantic_search(query=query, top_k=top_k, namespace=ns, metadata_filter=filt)

    return {"data": data}

print("✅ MMR reranker ready. Use rag_search({... 'rerank': True, 'fetch_k': 20, 'lambda_mult': 0.7}).")
# -----------------------------------------------------------------------------


✅ MMR reranker ready. Use rag_search({... 'rerank': True, 'fetch_k': 20, 'lambda_mult': 0.7}).


In [55]:
# --- Cell 6: Demo — build index then run a few searches ----------------------
# If you haven’t added files yet, drop .txt/.json into DATA_DIR first, then re-run.

print("DATA_DIR:", DATA_DIR)
print("Index:", PINECONE_INDEX, "| Namespace:", UPSERT_NAMESPACE)
print("Embedding:", embedding_info())

# 1) Build (load → chunk → embed → upsert)
summary = build_index(DATA_DIR, namespace=UPSERT_NAMESPACE)
summary


DATA_DIR: e:\capitalone\CapitalOne-hack\fasal-setu-ai\py\ai_engine\tools\ragdata
Index: capitalone | Namespace: default
Embedding: {'model': 'llama-text-embed-v2', 'dimension': 1024, 'metric': 'cosine'}
Building index 'capitalone' namespace='default' from: e:\capitalone\CapitalOne-hack\fasal-setu-ai\py\ai_engine\tools\ragdata
Loaded 2 raw doc items (2 txt, 0 json-slices).
Chunked into 12 chunks (size≈1000, overlap=120).


Upserting: 100%|██████████| 1/1 [00:03<00:00,  3.66s/it]

✅ Build complete: {'index': 'capitalone', 'namespace': 'default', 'files_seen': 2, 'raw_items': 2, 'chunks_upserted': 12, 'embed_model': 'llama-text-embed-v2', 'dim': 1024}





{'index': 'capitalone',
 'namespace': 'default',
 'files_seen': 2,
 'raw_items': 2,
 'chunks_upserted': 12,
 'embed_model': 'llama-text-embed-v2',
 'dim': 1024}

In [57]:
# --- Pretty print for rag_search results (fixes NameError: textwrap) ---------
import textwrap

def show_results(results, n=5, width=300):
    data = results.get("data", []) if isinstance(results, dict) else results
    if not data:
        print("No results.")
        return
    for i, r in enumerate(data[:n], 1):
        score = r.get("score")
        score_str = f"{score:.4f}" if isinstance(score, (int, float)) else str(score)
        txt = r.get("text", "") or ""
        oneline = " ".join(txt.split())
        snippet = textwrap.shorten(oneline, width=width, placeholder="...")
        print(f"\n#{i}  score={score_str}\nsource={r.get('source_stamp')}\ntext  : {snippet}")


In [59]:
res = rag_search({
    "query": "maize in gondhiya",
    "top_k": 5,
    "rerank": True,
    "fetch_k": 24,
    "lambda_mult": 0.7
})
show_results(res, n=5, width=300)



#1  score=0.3564
source=ragdata/cropinfogondhiyaICAR.txt
text  : Livestock and fodder measures: Reserve fodder from crop residue; grow sorghum, bajra, maize for green fodder; fodder banks at village level; harvest top fodder from trees (Leucaena, Glyricidia); urea molasses treatment of straw.

#2  score=0.3396
source=ragdata/ICAR_CRIDA_DOC_Bihar.txt
text  : 1.1 Agro-Climatic/Ecological Zone Satpura range and Wainganga Valley, hot moist subhumid ESR with shallow to deep loamy to clayey mixed Red and Black soils, low to medium AWC and LGP 180-210 days. 1.7 Area under major field crops Kharif/Rabi/Summer: Paddy 190.9ha, Soybean 9.2ha, Pigeonpea 4.2ha,...

#3  score=0.3138
source=ragdata/ICAR_CRIDA_DOC_Bihar.txt
text  : 1.1 Agro-climatic zone Eastern Vidarbha zone, hot sub-humid, dry; soils deep black (~79%), medium black (~8%), shallow black (~13%). Main Kharif crops: Paddy (~205,000ha), pigeon pea, sesame. Sowing windows: Kharif paddy (rainfed) early July, irrigated early June, Gram/Whe

In [47]:
index.delete(delete_all=True, namespace="default")

{}