In [1]:
# Jupyter-ready demo: Compare Similarity vs MMR vs Compression vs MMR+Compression
#
# This notebook builds a tiny, self-contained retrieval sandbox
# (no internet, no external models). It uses a simple hashed bag-of-words
# embedding to compute cosine similarity. MMR is implemented from scratch,
# and "contextual compression" is simulated by keeping only query-relevant
# sentences from each retrieved chunk.
#
# Edit the QUERY, DOCS, and parameters and re-run.

import math
import re
import zlib
from itertools import islice
from textwrap import shorten
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd

In [2]:
# Optional table UI: inside ChatGPT the helper below renders an interactive
# table. Anywhere else the import fails and the notebook falls back to plain
# display(df) / print(df), signalled through DISPLAY_TO_USER.
try:
    from caas_jupyter_tools import display_dataframe_to_user
except Exception:
    DISPLAY_TO_USER = False
else:
    DISPLAY_TO_USER = True

# Seed NumPy's global RNG so any random draws are repeatable.
np.random.seed(0)

In [3]:
# --------------------------
# 1) Toy corpus & query
# --------------------------

# Free-text question. It is embedded for retrieval and also supplies the
# keywords used by the compression step.
QUERY = "best budget laptop with long battery for students under $600"
# Toy corpus of ten short laptop blurbs. A document's 0-based position in this
# list is the doc_id reported in the results table.
DOCS = [
    "The XNote 13 offers premium aluminum build, OLED display, and excellent keyboard. Price: $1199. Battery life averages 7 hours.",
    "If you want a budget laptop for college, the StudyBook 14 has Intel i5, 8GB RAM, 256GB SSD, and up to 12 hours of battery. Street price hovers near $549.",
    "The GamePro 16 is a gaming powerhouse (RTX 4070) with 144Hz screen. Heavy at 2.6kg, battery lasts about 4 hours. Price around $1499.",
    "For students under $600, the LearnMate 15 features Ryzen 5, solid build, and strong 10-hour battery life if you keep brightness at 60%.",
    "Ultrabook Air is ultra-light at 1.0kg, silent fanless design, but costs $999. Battery life rated at 9 hours for web browsing.",
    "Budget choice: EduLite 13 with 8GB RAM, 128GB storage, and 15-hour battery in light workloads. Often discounted to $499 during back-to-school.",
    "Convertible 2-in-1 FlexStudy 14: touch screen, pen support, and 11-hour battery. Occasionally dips to $599 on sale.",
    "WorkMate Pro 15 targets professionals; great ports and 8-hour battery. The price is $899, so may exceed a strict student budget.",
    "Chromebook Class 11 is very affordable at $219; simple tasks only. Battery: 10 hours, but limited offline apps.",
    "Creator 15 OLED is perfect for media editing with color-accurate panel. Battery is 6 hours. Price around $1399."
]

In [4]:
# --------------------------
# 2) Hashed bag-of-words embeddings
# --------------------------

def tokenize(text: str) -> List[str]:
    """Split ``text`` into lowercase tokens.

    A token is a maximal run of ASCII letters, digits, or ``$`` (so prices
    such as ``$600`` stay in one piece); every other character is a separator.

    Args:
        text: Raw text to split.

    Returns:
        The tokens in order of appearance, duplicates included.
    """
    token_pattern = r"[a-zA-Z0-9$]+"
    lowered = text.lower()
    return re.findall(token_pattern, lowered)

def hashed_bow_embedding(text: str, dim: int = 512) -> np.ndarray:
    """Embed ``text`` as an L2-normalized hashed bag-of-words vector.

    Each token from ``tokenize(text)`` is mapped to one of ``dim`` buckets and
    that bucket's count is incremented; distinct tokens may share a bucket.

    The bucket comes from CRC-32 of the token's UTF-8 bytes, not the built-in
    ``hash()``. String hashes are salted per interpreter process (see
    ``PYTHONHASHSEED``), so ``hash()`` would yield different vectors, and
    therefore different rankings, after every kernel restart.

    Args:
        text: Text to embed.
        dim: Number of hash buckets, i.e. the length of the returned vector.

    Returns:
        A ``float32`` array of shape ``(dim,)`` with (approximately) unit L2
        norm, or all zeros when ``text`` yields no tokens.
    """
    bucket_counts = np.zeros(dim, dtype=np.float32)
    for tok in tokenize(text):
        # crc32 is non-negative, so a single modulo already lands in [0, dim).
        bucket_counts[zlib.crc32(tok.encode("utf-8")) % dim] += 1.0
    # The epsilon keeps the division finite for token-free input.
    norm = np.linalg.norm(bucket_counts) + 1e-9
    return bucket_counts / norm

# Vector width shared by the query and every corpus entry.
EMB_DIM = 512
# Embed the query, then each document, all at the same width.
query_embedding = hashed_bow_embedding(QUERY, EMB_DIM)
doc_embeddings = [hashed_bow_embedding(doc_text, EMB_DIM) for doc_text in DOCS]

In [5]:
# --------------------------
# 3) Similarity search (cosine)
# --------------------------

def cosine(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors ``a`` and ``b``.

    A small epsilon is added to the product of the norms, so a zero vector
    gives 0.0 instead of a division by zero.

    Args:
        a: First vector.
        b: Second vector, same length as ``a``.

    Returns:
        The similarity as a plain Python ``float``.
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return float(dot_product / (norm_product + 1e-9))

def similarity_search(
    query_vec: np.ndarray,
    doc_vecs: List[np.ndarray],
    k: int = 5,
) -> List[Tuple[int, float]]:
    """Rank documents by cosine similarity to the query and keep the top ``k``.

    Args:
        query_vec: Embedded query.
        doc_vecs: Embedded documents; a document's id is its list position.
        k: Number of hits to return.

    Returns:
        ``(doc_index, similarity)`` pairs, most similar first. Documents with
        equal similarity keep their original relative order.
    """
    scored = []
    for doc_idx, doc_vec in enumerate(doc_vecs):
        scored.append((doc_idx, cosine(query_vec, doc_vec)))
    # list.sort is stable, including with reverse=True, so ties keep corpus order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[:k]

In [6]:
# --------------------------
# 4) MMR (Maximal Marginal Relevance)
# --------------------------
# MMR balances relevance to the query and diversity among selected docs.
# lambda_mult close to 1.0 emphasizes relevance; close to 0.0 emphasizes diversity.

def mmr(
    query_vec: np.ndarray,
    doc_vecs: List[np.ndarray],
    k: int = 5,
    lambda_mult: float = 0.7,
    fetch_k: int = 20,
) -> List[Tuple[int, float]]:
    """Select up to ``k`` documents by Maximal Marginal Relevance.

    The ``fetch_k`` documents most similar to the query form the candidate
    pool. Documents are then picked one at a time, each time taking the
    candidate that maximizes

        lambda_mult * sim(query, doc) - (1 - lambda_mult) * max_sim(doc, picked)

    where the second term is 0 while nothing has been picked yet. Ties go to
    the candidate that is more similar to the query.

    Args:
        query_vec: Embedded query.
        doc_vecs: Embedded documents; a document's id is its list position.
        k: Maximum number of documents to return.
        lambda_mult: Weight of relevance versus diversity; values near 1.0
            favor relevance, values near 0.0 favor diversity.
        fetch_k: Size of the candidate pool taken from plain similarity search.

    Returns:
        ``(doc_index, lambda_mult * sim(query, doc))`` pairs in selection
        order. The second item is only the relevance component of the MMR
        score, so it is not necessarily decreasing down the list.
    """
    # Candidate pool: top fetch_k by cosine, already in descending relevance.
    pool = similarity_search(query_vec, doc_vecs, k=min(fetch_k, len(doc_vecs)))
    # The pool already carries each candidate's query similarity; reuse it.
    q_sims = dict(pool)
    remaining = [idx for idx, _ in pool]
    selected: List[int] = []
    # Highest similarity of each remaining candidate to any document picked so
    # far; an absent key means nothing has been picked yet.
    max_sim_to_selected = {}
    target = min(k, len(remaining))
    while len(selected) < target:
        # Start from -inf rather than -1.0: a score can reach -1 or slightly
        # below (e.g. lambda_mult=0 with duplicate documents), and a -1.0
        # sentinel would then reject every candidate and end selection early.
        best_score = float("-inf")
        best_idx = None
        for i in remaining:
            redundancy = max_sim_to_selected.get(i, 0.0)
            score = lambda_mult * q_sims[i] - (1 - lambda_mult) * redundancy
            if score > best_score:
                best_score = score
                best_idx = i
        if best_idx is None:
            # Only reachable when no score compares greater than -inf (NaN).
            break
        selected.append(best_idx)
        remaining.remove(best_idx)
        # Fold the new pick into each remaining candidate's redundancy.
        for i in remaining:
            sim = cosine(doc_vecs[i], doc_vecs[best_idx])
            previous = max_sim_to_selected.get(i)
            max_sim_to_selected[i] = sim if previous is None else max(previous, sim)
    # Return with an interpretable score component
    return [(i, lambda_mult * q_sims[i]) for i in selected]

In [7]:
# --------------------------
# 5) Contextual compression (simulated)
# --------------------------
# A lightweight stand-in for LLM-driven compression:
# Keep only sentences that contain query keywords (minus stopwords).

# Function words ignored when matching query terms against sentences.
STOP = set(
    """a an the and or if with to for of on in at is are was were be been it this that those these""".split()
)

def keywords(text: str) -> set:
    """Return the distinct tokens of ``text`` that are not in ``STOP``.

    Args:
        text: Text to extract keywords from.

    Returns:
        A set of lowercase tokens (as produced by ``tokenize``) with the
        stopwords removed.
    """
    return {tok for tok in tokenize(text) if tok not in STOP}

def compress_text(text: str, query: str) -> str:
    """Keep only the sentences of ``text`` that share a keyword with ``query``.

    ``text`` is split after ``.``, ``!`` or ``?`` followed by whitespace. A
    sentence survives when its keyword set intersects the query's keyword
    set. If none survives, the first sentence is returned so the result is
    never emptier than necessary.

    Args:
        text: Retrieved chunk to compress.
        query: Query whose keywords decide which sentences are kept.

    Returns:
        The surviving sentences, in original order, joined by single spaces.
    """
    query_terms = keywords(query)
    # Split into pseudo-sentences by punctuation
    sentences = re.split(r"(?<=[\.\!\?])\s+", text.strip())
    kept = [sentence for sentence in sentences if query_terms & keywords(sentence)]
    if not kept:
        # Fall back to the leading sentence (if there is one).
        kept = sentences[:1]
    return " ".join(kept)

In [8]:
# --------------------------
# 6) Pipelines to compare
# --------------------------

def run_pipeline(method: str, k: int = 5, lambda_mult: float = 0.7, fetch_k: int = 20):
    """Run one retrieval mode over the notebook's corpus and return table rows.

    Supported ``method`` values:

    * ``"similarity"``: cosine top-``k``.
    * ``"mmr"``: MMR selection.
    * ``"compression_only"``: cosine top-``k``, then ``compress_text`` per hit.
    * ``"mmr_plus_compression"``: MMR selection, then ``compress_text`` per hit.

    Reads the module-level ``query_embedding``, ``doc_embeddings``, ``DOCS``
    and ``QUERY``.

    Args:
        method: Which of the four modes to run.
        k: Number of hits to return.
        lambda_mult: MMR relevance/diversity weight (used by the MMR modes).
        fetch_k: MMR candidate-pool size (used by the MMR modes).

    Returns:
        One dict per hit, in rank order, with keys ``method`` (display label),
        ``rank`` (1-based), ``doc_id``, ``score`` (rounded to 4 decimals),
        ``text`` and ``compressed_text`` (``None`` for non-compressing modes).

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    # method -> (retrieve with MMR?, compress hits?, label shown in the table)
    modes = {
        "similarity": (False, False, "similarity"),
        "mmr": (True, False, f"mmr(λ={lambda_mult})"),
        "compression_only": (False, True, "compression(similarity)"),
        "mmr_plus_compression": (True, True, f"compression(mmr λ={lambda_mult})"),
    }
    if method not in modes:
        raise ValueError(f"Unknown method {method!r}; expected one of {sorted(modes)}")
    use_mmr, compress, label = modes[method]

    if use_mmr:
        hits = mmr(query_embedding, doc_embeddings, k=k, lambda_mult=lambda_mult, fetch_k=fetch_k)
    else:
        hits = similarity_search(query_embedding, doc_embeddings, k=k)

    return [
        {
            "method": label,
            "rank": rank,
            "doc_id": idx,
            "score": round(score, 4),
            "text": DOCS[idx],
            "compressed_text": compress_text(DOCS[idx], QUERY) if compress else None,
        }
        for rank, (idx, score) in enumerate(hits, start=1)
    ]


In [9]:
# --------------------------
# 7) Run all modes & assemble a comparison table
# --------------------------

# Retrieval settings shared by every mode below.
K = 5          # hits kept per method
LAMBDA = 0.7   # MMR relevance/diversity weight
FETCH_K = 20   # size of MMR's candidate pool

# Gather the row dicts of all four modes into one list, then tabulate it.
all_rows = []
for m in ("similarity", "mmr", "compression_only", "mmr_plus_compression"):
    all_rows += run_pipeline(m, k=K, lambda_mult=LAMBDA, fetch_k=FETCH_K)

df = pd.DataFrame(all_rows)

# Create short previews to make the table readable
def preview(s: str) -> str:
    """Return ``s`` on one line, truncated to at most 140 characters.

    Whitespace runs are collapsed to single spaces; when the text has to be
    cut, it is cut at a word boundary and ends with "…".
    """
    max_width = 140
    ellipsis = "…"
    return shorten(s, placeholder=ellipsis, width=max_width)

# Truncated copies of the long text columns, for display only.
df["preview_text"] = df["text"].apply(preview)
# Rows from the non-compressing modes carry no compressed text. Depending on
# the pandas version/dtype that gap can come back as None or as NaN, so test
# with pd.isna (which covers both) before handing the value to preview().
df["preview_compressed"] = df["compressed_text"].apply(
    lambda s: "" if pd.isna(s) else preview(s)
)

# Order columns for clarity
df = df[["method", "rank", "doc_id", "score", "preview_text", "preview_compressed"]]

if DISPLAY_TO_USER:
    display_dataframe_to_user("Retrieval Methods Comparison", df)
else:
    # Fallback display
    from IPython.display import display
    display(df)

print("Tips:")
print(" - Edit QUERY and DOCS at the top and re-run to see how rankings change.")
print(" - Try different λ (LAMBDA) for MMR (e.g., 0.9 vs 0.3) to feel relevance vs diversity trade-offs.")
print(" - Compression keeps only sentences overlapping with query keywords (stand-in for LLM-based filters).")
print(" - Compare 'similarity' vs 'mmr' ranks, and how 'compressed' previews get tighter.")

Unnamed: 0,method,rank,doc_id,score,preview_text,preview_compressed
0,similarity,1,3,0.3227,"For students under $600, the LearnMate 15 feat...",
1,similarity,2,1,0.2309,"If you want a budget laptop for college, the S...",
2,similarity,3,9,0.207,Creator 15 OLED is perfect for media editing w...,
3,similarity,4,7,0.1936,WorkMate Pro 15 targets professionals; great p...,
4,similarity,5,5,0.1732,"Budget choice: EduLite 13 with 8GB RAM, 128GB ...",
5,mmr(λ=0.7),1,3,0.2259,"For students under $600, the LearnMate 15 feat...",
6,mmr(λ=0.7),2,9,0.1449,Creator 15 OLED is perfect for media editing w...,
7,mmr(λ=0.7),3,1,0.1617,"If you want a budget laptop for college, the S...",
8,mmr(λ=0.7),4,8,0.1043,Chromebook Class 11 is very affordable at $219...,
9,mmr(λ=0.7),5,5,0.1212,"Budget choice: EduLite 13 with 8GB RAM, 128GB ...",


Tips:
 - Edit QUERY and DOCS at the top and re-run to see how rankings change.
 - Try different λ (LAMBDA) for MMR (e.g., 0.9 vs 0.3) to feel relevance vs diversity trade-offs.
 - Compression keeps only sentences overlapping with query keywords (stand-in for LLM-based filters).
 - Compare 'similarity' vs 'mmr' ranks, and how 'compressed' previews get tighter.
