In [1]:
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import pandas as pd

In [105]:
# LlamaIndex core
from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    Document,
    Settings,
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.llms.openai import OpenAI

In [106]:
# Nodes & postprocessors
from llama_index.core.schema import TextNode, NodeWithScore
from llama_index.core.postprocessor import SentenceTransformerRerank

In [107]:
# Qdrant
from qdrant_client import QdrantClient

# BM25
from rank_bm25 import BM25Okapi

# Data
from datasets import load_dataset

# Similarity
from sentence_transformers import SentenceTransformer, util
import numpy as np

In [None]:
# =========================
# CONFIG
# =========================
# Qdrant endpoint and collection name; both can be overridden via environment variables.
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.getenv("QDRANT_COLLECTION", "maktek_faqs")
# Model name handed to HuggingFaceEmbedding in build_index.
EMBED_MODEL_NAME = "intfloat/multilingual-e5-base"
# Model name handed to SentenceTransformerRerank in build_reranker.
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

K = 5  # we’ll evaluate @5 as requested
# knobs
VEC_TOP_K = max(12, K)      # gather more for better recall
BM25_TOP_K = max(12, K)  # number of BM25 candidates fetched per query
RERANK_TOP_N = max(8, K)  # `top_n` given to the reranker
FINAL_TOP_N = K  # number of results kept after de-duplication

# Optional LLM
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")  # overridable via environment
LLM_TEMPERATURE = 0  # passed as `temperature` to the OpenAI LLM in query_with_llm

In [109]:
@dataclass
class CorpusItem:
    """One document of the BM25 corpus, as stored by build_bm25_corpus."""

    doc_id: str  # key under which the item is stored in the corpus dict
    text: str  # document text; this is what gets tokenized for BM25
    metadata: Dict  # the source document's metadata dict, carried along unchanged

In [None]:
# -----------------------------
# DATA LOADING
# -----------------------------
def load_maktek_dataset(csv_path: str = "../data/data.csv") -> List[Document]:
    """Load the MakTek customer-support FAQs and wrap every row in a Document.

    The JSON-lines file is read from the Hugging Face hub, rows with a repeated
    ``question`` are dropped, and the de-duplicated table is written to
    ``csv_path`` with an ``id`` column.

    Args:
        csv_path: Where the de-duplicated table is saved. Missing parent
            directories are created.

    Returns:
        One Document per remaining row. The text is ``"Q: ...\\n\\nA: ..."`` and
        the metadata carries ``question``, ``answer``, ``source`` and ``doc_id``.
        The numeric part of ``doc_id`` equals the row's ``id`` in the CSV.
    """
    df = pd.read_json(
        "hf://datasets/MakTek/Customer_support_faqs_dataset/train_expanded.json",
        lines=True,
    )
    # Renumber after de-duplication: otherwise the index keeps gaps and the CSV
    # `id` column no longer matches the position-based doc_id built below.
    df = df.drop_duplicates(subset="question").reset_index(drop=True)
    df.insert(0, "id", df.index)

    # to_csv does not create missing parent directories.
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    df.to_csv(csv_path, index=False)

    def _clean(value) -> str:
        # Missing cells arrive as None or NaN; NaN is truthy, so `value or ""`
        # would let it through and `.strip()` would fail on a float.
        return value.strip() if isinstance(value, str) else ""

    docs: List[Document] = []
    for i, row in enumerate(df.to_dict(orient="records")):
        q = _clean(row.get("question"))
        a = _clean(row.get("answer"))
        doc_id = f"faq-{i:04d}"
        text = f"Q: {q}\n\nA: {a}"
        metadata = {"question": q, "answer": a, "source": "MakTek", "doc_id": doc_id}
        docs.append(Document(text=text, metadata=metadata, doc_id=doc_id))
    return docs

In [111]:
# -----------------------------
# INDEX & BM25 BUILD
# -----------------------------
def build_index(docs: List[Document], recreate_collection: bool = False) -> VectorStoreIndex:
    """Embed ``docs`` and store them in the configured Qdrant collection.

    Side effect: sets the global ``Settings.embed_model`` to the Hugging Face
    model named by ``EMBED_MODEL_NAME``.

    Args:
        docs: Documents to index.
        recreate_collection: When True, an existing ``QDRANT_COLLECTION`` is
            deleted first so the index only contains ``docs``. The default
            (False) keeps the previous behaviour of inserting into whatever
            the collection already holds.

    Returns:
        The VectorStoreIndex built on top of the Qdrant vector store.

    NOTE(review): the stored notebook output lists ``faq-0109`` for a corpus of
    89 documents, which suggests points from an earlier run were still in the
    collection -- confirm, and pass ``recreate_collection=True`` when re-running.
    """
    Settings.embed_model = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
    client = QdrantClient(url=QDRANT_URL)
    if recreate_collection and client.collection_exists(QDRANT_COLLECTION):
        # Start clean so repeated runs do not pile up duplicate points.
        client.delete_collection(QDRANT_COLLECTION)
    vector_store = QdrantVectorStore(client=client, collection_name=QDRANT_COLLECTION)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex.from_documents(docs, storage_context=storage_context, show_progress=True)
    return index

In [None]:
def simple_tokenize(s: str) -> List[str]:
    """Lower-case ``s`` and split it into word tokens for BM25.

    Punctuation is stripped so that "password?" and "password" become the same
    token; a plain whitespace split would treat them as different terms.
    Apostrophes inside a word are kept ("don't" stays one token).

    Args:
        s: Text to tokenize.

    Returns:
        The tokens in order of appearance; empty when ``s`` has no word characters.
    """
    import re  # local import: none of the notebook's import cells bring in `re`

    return re.findall(r"\w+(?:'\w+)*", s.lower())


def build_bm25_corpus(docs: List[Document]) -> Tuple[BM25Okapi, Dict[str, CorpusItem]]:
    """Build a BM25 index over ``docs`` together with a doc_id -> CorpusItem map.

    The map's insertion order matches the row order of the BM25 corpus;
    ``bm25_retrieve`` relies on this to turn a score position back into a doc_id.

    Args:
        docs: Documents to index. The id is ``doc.doc_id`` and falls back to
            ``metadata["doc_id"]``.

    Returns:
        ``(bm25, corpus_items)``.

    Raises:
        ValueError: If ``docs`` is empty.
    """
    if not docs:
        raise ValueError("build_bm25_corpus needs at least one document.")

    corpus_items: Dict[str, CorpusItem] = {}
    tokenized_corpus = []
    for d in docs:
        doc_id = d.doc_id or d.metadata.get("doc_id")
        if doc_id in corpus_items:
            # A repeated id would overwrite the dict entry but still add a BM25
            # row, shifting every later position. Keep the first occurrence only.
            continue
        corpus_items[doc_id] = CorpusItem(doc_id=doc_id, text=d.text, metadata=d.metadata)
        tokenized_corpus.append(simple_tokenize(d.text))
    bm25 = BM25Okapi(tokenized_corpus)
    return bm25, corpus_items

In [None]:

# -----------------------------
# RETRIEVAL HELPERS
# -----------------------------
def bm25_retrieve(bm25: BM25Okapi, corpus_items: Dict[str, CorpusItem], query: str, top_k: int) -> List[NodeWithScore]:
    """Return the ``top_k`` highest-scoring corpus entries for ``query`` under BM25.

    Score positions are mapped back to doc_ids through the key order of
    ``corpus_items``. Each hit is wrapped in a fresh TextNode carrying the
    item's text and metadata, with the BM25 score as the node score.
    """
    ordered_ids = list(corpus_items)
    query_scores = bm25.get_scores(simple_tokenize(query))

    def _to_hit(position: int, raw_score) -> NodeWithScore:
        entry = corpus_items[ordered_ids[position]]
        text_node = TextNode(id_=ordered_ids[position], text=entry.text, metadata=entry.metadata)
        return NodeWithScore(node=text_node, score=float(raw_score))

    best_first = sorted(enumerate(query_scores), key=lambda pair: pair[1], reverse=True)
    return [_to_hit(position, raw_score) for position, raw_score in best_first[:top_k]]

In [114]:
def vector_retrieve(index: VectorStoreIndex, query: str, top_k: int) -> List[NodeWithScore]:
    """Fetch the ``top_k`` most similar nodes for ``query`` from the vector index."""
    return index.as_retriever(similarity_top_k=top_k).retrieve(query)

In [115]:
def merge_candidates(vec_nodes: List[NodeWithScore], bm25_nodes: List[NodeWithScore]) -> List[NodeWithScore]:
    """Union vector and BM25 hits, keeping one entry per document.

    Documents are identified by ``metadata["doc_id"]``. A node without that key
    falls back to its own node id, so unrelated nodes lacking a ``doc_id`` are
    no longer collapsed onto a single ``None`` key. When a document shows up
    more than once, the copy with the higher score is kept; a missing score
    ranks below any real score instead of raising on comparison.

    Args:
        vec_nodes: Hits from the vector retriever.
        bm25_nodes: Hits from BM25.

    Returns:
        The unique hits, ordered by first appearance (vector hits first).
    """

    def _score(hit: NodeWithScore) -> float:
        return float("-inf") if hit.score is None else hit.score

    merged: Dict[str, NodeWithScore] = {}
    for n in vec_nodes + bm25_nodes:
        key = (n.metadata or {}).get("doc_id") or n.node.node_id
        current = merged.get(key)
        if current is None or _score(current) < _score(n):
            merged[key] = n
    return list(merged.values())

In [116]:
def semantic_deduplicate(results: List[NodeWithScore], top_n: int = FINAL_TOP_N) -> List[NodeWithScore]:
    """Drop results whose question text repeats an earlier one, then cap at ``top_n``.

    Questions are compared after stripping whitespace and lower-casing, so only
    exact repeats are removed; paraphrases are kept. Results with no question
    text cannot be judged duplicates of each other and are all kept.

    Args:
        results: Candidates in ranked order.
        top_n: Maximum number of results to return; ``<= 0`` yields an empty list.

    Returns:
        At most ``top_n`` results, in their original order.
    """
    if top_n <= 0:
        # Without this guard the loop would still emit one result before stopping.
        return []

    seen_questions = set()
    deduped: List[NodeWithScore] = []

    for node in results:
        question = ((node.metadata or {}).get("question") or "").strip().lower()
        if question:
            if question in seen_questions:
                continue
            seen_questions.add(question)
        deduped.append(node)
        if len(deduped) >= top_n:
            break

    return deduped


In [117]:
def build_reranker() -> SentenceTransformerRerank:
    """Create the reranker from the notebook-level configuration.

    Single place to change the reranker model (``RERANK_MODEL_NAME``) or how
    many nodes it keeps (``RERANK_TOP_N``).
    """
    reranker_options = {"model": RERANK_MODEL_NAME, "top_n": RERANK_TOP_N}
    return SentenceTransformerRerank(**reranker_options)

In [118]:
# -----------------------------
# MAIN QUERY METHODS
# -----------------------------
def query_without_llm(
    index: VectorStoreIndex,
    bm25: BM25Okapi,
    corpus_items: Dict[str, CorpusItem],
    query: str,
    reranker: Optional[SentenceTransformerRerank] = None,
) -> List[Dict]:
    """Run hybrid retrieval for ``query`` and return the top FAQ matches.

    Pipeline: vector search + BM25 -> merge by doc_id -> rerank ->
    drop repeated questions -> keep ``FINAL_TOP_N``.

    Args:
        index: Vector index to search.
        bm25: BM25 index built over the same corpus.
        corpus_items: doc_id -> CorpusItem map that belongs to ``bm25``.
        query: User question.
        reranker: Reranker to reuse. When omitted, a new one is built for this
            call; pass one in when issuing many queries to avoid constructing
            it again each time.

    Returns:
        A list of dicts with ``doc_id``, ``question``, ``answer`` and ``score``,
        in reranked order.
    """
    vec_nodes = vector_retrieve(index, query, VEC_TOP_K)
    bm25_nodes = bm25_retrieve(bm25, corpus_items, query, BM25_TOP_K)
    merged = merge_candidates(vec_nodes, bm25_nodes)

    if reranker is None:
        reranker = build_reranker()
    reranked = reranker.postprocess_nodes(merged, query_str=query)

    deduped = semantic_deduplicate(reranked, top_n=FINAL_TOP_N)

    return [
        {
            "doc_id": n.metadata.get("doc_id"),
            "question": n.metadata.get("question"),
            "answer": n.metadata.get("answer"),
            "score": n.score,
        }
        for n in deduped
    ]

In [None]:
def query_with_llm(index: VectorStoreIndex, bm25: BM25Okapi, corpus_items: Dict[str, CorpusItem], query: str) -> Dict:
    """Answer ``query`` with the OpenAI LLM, grounded on the retrieved FAQ entries.

    The retrieved entries are numbered and placed in the prompt as context; the
    model is told to answer from that context only.

    Returns:
        A dict with the original ``query``, the model's ``answer`` (stripped)
        and the ``top_context`` entries that were put in the prompt.

    Raises:
        RuntimeError: If the ``OPENAI_API_KEY`` environment variable is unset or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY is not set.")

    retrieved = query_without_llm(index, bm25, corpus_items, query)

    # One numbered "Q/A" paragraph per retrieved entry, separated by blank lines.
    numbered_entries = []
    for position, entry in enumerate(retrieved, start=1):
        numbered_entries.append(f"[{position}] Q: {entry['question']}\nA: {entry['answer']}")
    context_block = "\n\n".join(numbered_entries)

    prompt = f"""You are a helpful support assistant. Answer the user's question using ONLY the context below.
If the answer is not present, say you don't have enough information.

User question:
{query}

Context:
{context_block}

Return a concise, direct answer.
"""

    response = OpenAI(model=OPENAI_MODEL, temperature=LLM_TEMPERATURE).complete(prompt)
    return {"query": query, "answer": response.text.strip(), "top_context": retrieved}


In [120]:
# Load the FAQ corpus (this also writes the de-duplicated table to CSV) and report its size.
print("📥 Loading MakTek dataset...")
docs = load_maktek_dataset()
print(f"Loaded {len(docs)} FAQ entries.")

📥 Loading MakTek dataset...
Loaded 89 FAQ entries.


In [121]:
# Embed the documents and store them in the Qdrant-backed vector index.
print("📊 Building vector index...")
index = build_index(docs)

📊 Building vector index...


Parsing nodes:   0%|          | 0/89 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/89 [00:00<?, ?it/s]

In [122]:
# Build the lexical (BM25) index from the same `docs` list used for the vector index.
print("Building BM25 over the same corpus ...")
bm25, corpus_items = build_bm25_corpus(docs)

Building BM25 over the same corpus ...


In [123]:
# Demo 1: Retrieval only
# Runs the hybrid pipeline (no LLM) on a few sample questions and prints the ranked hits.
print("\n=== Retrieval-Only ===")
for q in [
    "How do I reset my password?",
    "Do you offer international shipping?",
    "What is your refund policy?",
]:
    print("\nQ:", q)
    results = query_without_llm(index, bm25, corpus_items, q)
    # One line per hit: rank, doc_id, matched FAQ question and its score.
    for i, r in enumerate(results, 1):
        print(f"  {i}. [{r['doc_id']}] {r['question']} (score={r['score']:.4f})")


=== Retrieval-Only ===

Q: How do I reset my password?
  1. [faq-0109] How can I reset my password? (score=10.1189)

Q: Do you offer international shipping?
  1. [faq-0006] Do you offer international shipping? (score=10.9858)
  2. [faq-0033] Can I order a product for delivery to a different country? (score=9.1520)
  3. [faq-0022] Do you offer expedited shipping? (score=2.7228)
  4. [faq-0017] Do you offer bulk or wholesale discounts? (score=-4.5605)
  5. [faq-0085] Do you offer a satisfaction guarantee? (score=-7.7589)

Q: What is your refund policy?
  1. [faq-0003] What is your return policy? (score=6.4888)
  2. [faq-0054] Can I return a product if it was purchased with a discount code? (score=-3.8403)
  3. [faq-0014] What is your price adjustment policy? (score=-3.9959)
  4. [faq-0088] Can I request a refund if the price drops after my purchase? (score=-4.2463)
  5. [faq-0081] What is your privacy policy? (score=-6.1638)
