In [1]:
!pip -q install langchain langchain-community langchain-text-splitters
!pip -q install sentence-transformers faiss-cpu
!pip -q install pypdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.0/51.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.4, but you have requests 2.32.5 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[

In [2]:
import langchain
import faiss
from sentence_transformers import SentenceTransformer

print("LangChain OK")
print("FAISS OK")
print("SentenceTransformers OK")


LangChain OK
FAISS OK
SentenceTransformers OK


In [3]:
import os

os.makedirs("data", exist_ok=True)

with open("data/policy_refunds.txt", "w") as f:
    f.write(
        "Refund Policy:\n"
        "Refunds are available within 30 days of purchase with a valid receipt.\n"
        "Digital goods are non-refundable unless required by law.\n"
    )

with open("data/policy_shipping.txt", "w") as f:
    f.write(
        "Shipping Policy:\n"
        "Standard shipping takes 3-5 business days.\n"
        "Expedited shipping takes 1-2 business days.\n"
        "International shipping times vary by destination.\n"
    )

print("Created sample docs in ./data")


Created sample docs in ./data


In [4]:
from langchain_community.document_loaders import TextLoader
from langchain_core.documents import Document

docs = []
for filename in os.listdir("data"):
    if filename.endswith(".txt"):
        path = os.path.join("data", filename)
        loader = TextLoader(path, encoding="utf-8")
        docs.extend(loader.load())

print("Loaded documents:", len(docs))
print("First doc preview:\n", docs[0].page_content[:200])


Loaded documents: 2
First doc preview:
 Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.



In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=50,
)

chunks = text_splitter.split_documents(docs)

print("Total chunks:", len(chunks))
print("Sample chunk:\n", chunks[0].page_content)


Total chunks: 2
Sample chunk:
 Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.


In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    encode_kwargs={"normalize_embeddings": True},
)

vectorstore = FAISS.from_documents(chunks, embedding_model)

print("Vectorstore created with", len(chunks), "chunks.")


  embedding_model = HuggingFaceEmbeddings(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Vectorstore created with 2 chunks.


In [7]:
query = "What is the refund window?"
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

results = retriever.invoke(query)

for i, doc in enumerate(results, 1):
    print(f"\n--- Result {i} ---")
    print(doc.page_content)



--- Result 1 ---
Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.

--- Result 2 ---
Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.


In [8]:
!pip -q install transformers accelerate


In [9]:
from transformers import pipeline
from langchain_community.llms import HuggingFacePipeline
from langchain_core.prompts import ChatPromptTemplate

hf_pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=200,
)

llm = HuggingFacePipeline(pipeline=hf_pipe)

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant. Use ONLY the context below to answer.
If the answer is not in the context, say "I don't know."

Context:
{context}

Question:
{question}

Answer:"""
)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=hf_pipe)


In [10]:
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    return "\n\n".join(d.page_content for d in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

answer = rag_chain.invoke("What is the refund window?")
print(answer)


30 days


In [11]:
def rag_with_sources(question: str, k: int = 3):
    retrieved_docs = vectorstore.similarity_search(question, k=k)
    context = format_docs(retrieved_docs)

    response = rag_chain.invoke(question)

    sources = []
    for d in retrieved_docs:
        sources.append({
            "source": d.metadata.get("source", "unknown"),
            "snippet": d.page_content[:160]
        })

    return {"answer": response, "sources": sources}

out = rag_with_sources("What is the refund window?", k=3)
print("ANSWER:\n", out["answer"])
print("\nSOURCES:")
for s in out["sources"]:
    print("-", s["source"], "|", s["snippet"])


ANSWER:
 30 days

SOURCES:
- data/policy_refunds.txt | Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.
- data/policy_shipping.txt | Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.


In [12]:
vectorstore.save_local("faiss_index")
print("Saved FAISS index to ./faiss_index")


Saved FAISS index to ./faiss_index


In [13]:
loaded_vs = FAISS.load_local(
    "faiss_index",
    embedding_model,
    allow_dangerous_deserialization=True
)

loaded_retriever = loaded_vs.as_retriever(search_kwargs={"k": 3})
results = loaded_retriever.invoke("What is the refund window?")

print("Loaded index retrieval results:", len(results))
print(results[0].page_content)


Loaded index retrieval results: 2
Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.


In [14]:
!pip -q install rank-bm25


In [15]:
from langchain_community.retrievers import BM25Retriever

bm25 = BM25Retriever.from_documents(chunks)
bm25.k = 3

bm25_results = bm25.invoke("What is the refund window?")
print("BM25 results:", len(bm25_results))
print(bm25_results[0].page_content)


BM25 results: 2
Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.


In [16]:
def hybrid_retrieve(query: str, k_dense: int = 3, k_bm25: int = 3):
    dense_docs = vectorstore.similarity_search(query, k=k_dense)
    bm25_docs = bm25.invoke(query)[:k_bm25]

    # de-duplicate by text content (simple + works well for demos)
    seen = set()
    merged = []
    for d in dense_docs + bm25_docs:
        key = d.page_content
        if key not in seen:
            seen.add(key)
            merged.append(d)
    return merged

hyb = hybrid_retrieve("What is the refund window?")
print("Hybrid results:", len(hyb))
for i, d in enumerate(hyb, 1):
    print(f"\n--- Hybrid {i} ---")
    print(d.page_content)


Hybrid results: 2

--- Hybrid 1 ---
Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.

--- Hybrid 2 ---
Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.


In [17]:
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

hybrid_runnable = RunnableLambda(lambda q: hybrid_retrieve(q, k_dense=3, k_bm25=3))

hybrid_rag_chain = (
    {"context": hybrid_runnable | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

print(hybrid_rag_chain.invoke("What is the refund window?"))


30 days


In [18]:
!pip -q install sentence-transformers


In [19]:
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("BAAI/bge-reranker-base")

def rerank(query: str, docs, top_n: int = 3):
    pairs = [(query, d.page_content) for d in docs]
    scores = reranker.predict(pairs)

    scored = list(zip(docs, scores))
    scored.sort(key=lambda x: x[1], reverse=True)
    return scored[:top_n]

candidates = hybrid_retrieve("What is the refund window?", k_dense=5, k_bm25=5)
top = rerank("What is the refund window?", candidates, top_n=3)

for i, (d, s) in enumerate(top, 1):
    print(f"\n--- Reranked {i} | score={s:.4f} ---")
    print(d.page_content)


config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]


--- Reranked 1 | score=0.0262 ---
Refund Policy:
Refunds are available within 30 days of purchase with a valid receipt.
Digital goods are non-refundable unless required by law.

--- Reranked 2 | score=0.0002 ---
Shipping Policy:
Standard shipping takes 3-5 business days.
Expedited shipping takes 1-2 business days.
International shipping times vary by destination.


In [20]:
def enterprise_retrieve(query: str, k_dense=5, k_bm25=5, k_final=3):
    # 1. Hybrid retrieval
    candidates = hybrid_retrieve(query, k_dense=k_dense, k_bm25=k_bm25)

    # 2. Rerank
    reranked = rerank(query, candidates, top_n=k_final)

    # 3. Return Documents only (strip scores)
    return [doc for doc, _ in reranked]


In [21]:
enterprise_runnable = RunnableLambda(enterprise_retrieve)

enterprise_rag_chain = (
    {"context": enterprise_runnable | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

print(enterprise_rag_chain.invoke("What is the refund window?"))


30 days


In [22]:
import json, os

eval_set = [
    {
        "id": "refund_1",
        "question": "What is the refund window?",
        "relevant_sources": ["data/policy_refunds.txt"]
    },
    {
        "id": "shipping_1",
        "question": "How long does standard shipping take?",
        "relevant_sources": ["data/policy_shipping.txt"]
    },
    {
        "id": "shipping_2",
        "question": "How long does expedited shipping take?",
        "relevant_sources": ["data/policy_shipping.txt"]
    }
]

os.makedirs("eval", exist_ok=True)
with open("eval/eval.json", "w") as f:
    json.dump(eval_set, f, indent=2)

print("Saved eval dataset to eval/eval.json with", len(eval_set), "questions.")


Saved eval dataset to eval/eval.json with 3 questions.


In [23]:
def retrieve_sources(question: str, k: int = 5):
    docs = enterprise_retrieve(question, k_dense=5, k_bm25=5, k_final=k)
    return [d.metadata.get("source", "unknown") for d in docs]

# quick sanity check
print(retrieve_sources("What is the refund window?", k=3))


['data/policy_refunds.txt', 'data/policy_shipping.txt']


In [24]:
import math

def precision_at_k(retrieved, relevant_set, k):
    topk = retrieved[:k]
    hits = sum(1 for x in topk if x in relevant_set)
    return hits / k

def reciprocal_rank(retrieved, relevant_set):
    for i, x in enumerate(retrieved, start=1):
        if x in relevant_set:
            return 1.0 / i
    return 0.0

def ndcg_at_k(retrieved, relevant_set, k):
    # binary relevance: rel=1 if in relevant_set else 0
    def dcg(items):
        score = 0.0
        for i, x in enumerate(items, start=1):
            rel = 1.0 if x in relevant_set else 0.0
            score += rel / math.log2(i + 1)
        return score

    topk = retrieved[:k]
    dcg_val = dcg(topk)

    # ideal ranking = all relevant first
    ideal = list(relevant_set) + [x for x in topk if x not in relevant_set]
    idcg_val = dcg(ideal[:k])

    return (dcg_val / idcg_val) if idcg_val > 0 else 0.0


In [25]:
with open("eval/eval.json") as f:
    data = json.load(f)

k = 3
p_scores, rr_scores, ndcg_scores = [], [], []

for ex in data:
    q = ex["question"]
    relevant = set(ex["relevant_sources"])
    retrieved = retrieve_sources(q, k=k)

    p_scores.append(precision_at_k(retrieved, relevant, k))
    rr_scores.append(reciprocal_rank(retrieved, relevant))
    ndcg_scores.append(ndcg_at_k(retrieved, relevant, k))

print(f"Precision@{k} =", sum(p_scores) / len(p_scores))
print("MRR =", sum(rr_scores) / len(rr_scores))
print(f"nDCG@{k} =", sum(ndcg_scores) / len(ndcg_scores))


Precision@3 = 0.3333333333333333
MRR = 1.0
nDCG@3 = 1.0


In [26]:
k = 1
p_scores = []

for ex in data:
    relevant = set(ex["relevant_sources"])
    retrieved = retrieve_sources(ex["question"], k=k)
    p_scores.append(precision_at_k(retrieved, relevant, k))

print(f"Precision@{k} =", sum(p_scores) / len(p_scores))


Precision@1 = 1.0


In [27]:
def dense_sources(question: str, k: int = 3):
    docs = vectorstore.similarity_search(question, k=k)
    return [d.metadata.get("source", "unknown") for d in docs]

def bm25_sources(question: str, k: int = 3):
    docs = bm25.invoke(question)[:k]
    return [d.metadata.get("source", "unknown") for d in docs]

def hybrid_sources(question: str, k: int = 3):
    docs = hybrid_retrieve(question, k_dense=k, k_bm25=k)
    return [d.metadata.get("source", "unknown") for d in docs[:k]]

def enterprise_sources(question: str, k: int = 3):
    docs = enterprise_retrieve(question, k_dense=5, k_bm25=5, k_final=k)
    return [d.metadata.get("source", "unknown") for d in docs]


In [28]:
def eval_strategy(source_fn, data, k=3):
    p_scores, rr_scores, ndcg_scores = [], [], []
    for ex in data:
        relevant = set(ex["relevant_sources"])
        retrieved = source_fn(ex["question"], k=k)

        p_scores.append(precision_at_k(retrieved, relevant, k))
        rr_scores.append(reciprocal_rank(retrieved, relevant))
        ndcg_scores.append(ndcg_at_k(retrieved, relevant, k))

    return {
        f"Precision@{k}": sum(p_scores) / len(p_scores),
        "MRR": sum(rr_scores) / len(rr_scores),
        f"nDCG@{k}": sum(ndcg_scores) / len(ndcg_scores),
    }

k = 3
scores = {
    "Dense": eval_strategy(dense_sources, data, k=k),
    "BM25": eval_strategy(bm25_sources, data, k=k),
    "Hybrid": eval_strategy(hybrid_sources, data, k=k),
    "Hybrid+Rerank": eval_strategy(enterprise_sources, data, k=k),
}

for name, s in scores.items():
    print("\n==", name, "==")
    for metric, val in s.items():
        print(metric, "=", val)



== Dense ==
Precision@3 = 0.3333333333333333
MRR = 1.0
nDCG@3 = 1.0

== BM25 ==
Precision@3 = 0.3333333333333333
MRR = 0.6666666666666666
nDCG@3 = 0.7539531690476383

== Hybrid ==
Precision@3 = 0.3333333333333333
MRR = 1.0
nDCG@3 = 1.0

== Hybrid+Rerank ==
Precision@3 = 0.3333333333333333
MRR = 1.0
nDCG@3 = 1.0
