In [1]:
import os
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import numpy as np
import pandas as pd

# Data
from datasets import load_dataset

# LlamaIndex core
from llama_index.core import (
    Document,
    Settings,
    StorageContext,
    VectorStoreIndex,
)
# Nodes & postprocessors
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.schema import NodeWithScore, TextNode
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI
from llama_index.vector_stores.qdrant import QdrantVectorStore

# Qdrant
from qdrant_client import QdrantClient

# BM25
from rank_bm25 import BM25Okapi

# Similarity
from sentence_transformers import SentenceTransformer, util


In [2]:
# --- Config ---
# Qdrant connection: environment variables take precedence over local defaults.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
QDRANT_COLLECTION = os.environ.get("QDRANT_COLLECTION", "maktek_faqs")

# Model identifiers for embedding and cross-encoder reranking.
EMBED_MODEL_NAME = "intfloat/multilingual-e5-base"
RERANK_MODEL_NAME = "cross-encoder/ms-marco-MiniLM-L-6-v2"

# Evaluation cutoff (results are evaluated @5).
K = 5

# Retrieval knobs: candidate pools never drop below K.
VEC_TOP_K = max(K, 12)      # gather more vector hits for better recall
BM25_TOP_K = max(K, 12)     # same pool size for the lexical retriever
RERANK_TOP_N = max(K, 8)    # how many candidates the reranker keeps
FINAL_TOP_N = K             # how many rows are finally returned

In [3]:
@dataclass
class CorpusItem:
    """One entry of the BM25 corpus, as stored by ``build_bm25_corpus``.

    ``bm25_retrieve`` turns these back into ``TextNode`` objects, so the three
    fields mirror what a node needs: an id, its text and its metadata.
    """
    doc_id: str  # identifier used as the dictionary key and as the node id
    text: str  # document text; this is what gets tokenized for BM25
    metadata: Dict  # document metadata, passed through unchanged to the node

In [None]:
# -----------------------------
# DATA LOADING
# -----------------------------
def _clean_cell(value) -> str:
    """Return a CSV cell as stripped text, mapping missing values to ``""``."""
    # pandas encodes empty cells as NaN, which is a *truthy* float: the idiom
    # ``value or ""`` lets it through and ``.strip()`` then raises
    # AttributeError. Test for missing values explicitly instead.
    if value is None or (not isinstance(value, str) and pd.isna(value)):
        return ""
    return str(value).strip()


def load_maktek_dataset(csv_path: str = '../data/data.csv') -> List[Document]:
    """Read the FAQ CSV and wrap every row in a LlamaIndex ``Document``.

    NOTE(review): the CSV is presumably a local export of the
    ``MakTek/Customer_support_faqs_dataset`` Hugging Face dataset -- confirm.

    Args:
        csv_path: Location of the CSV file. It is read with
            ``pandas.read_csv`` and should provide ``question`` and
            ``answer`` columns; a missing column or empty cell becomes an
            empty string rather than an error.

    Returns:
        One ``Document`` per CSV row, in file order. Each document has
        id ``faq-NNNN`` (zero-padded row position), text ``"Q: ...\\n\\nA: ..."``
        and metadata holding ``question``, ``answer``, ``source`` and ``doc_id``.
    """
    df = pd.read_csv(csv_path)
    docs: List[Document] = []
    for i, row in enumerate(df.to_dict(orient='records')):
        q = _clean_cell(row.get("question"))
        a = _clean_cell(row.get("answer"))
        doc_id = f"faq-{i:04d}"
        text = f"Q: {q}\n\nA: {a}"
        # doc_id is duplicated into the metadata so it can be read back from
        # node metadata further down the pipeline.
        metadata = {"question": q, "answer": a, "source": "MakTek", "doc_id": doc_id}
        docs.append(Document(text=text, metadata=metadata, doc_id=doc_id))
    return docs

In [5]:
# --------------------------
# Connect to Qdrant collection
# --------------------------
def connect_to_index() -> VectorStoreIndex:
    """Open the configured Qdrant collection as a ``VectorStoreIndex``.

    Uses ``QDRANT_URL`` / ``QDRANT_COLLECTION`` for the connection and
    ``EMBED_MODEL_NAME`` for the embedding model. This function performs no
    ingestion itself; it assumes the collection has already been populated.

    Returns:
        An index backed by the Qdrant vector store.
    """
    # The embedding model is created first, before Qdrant is contacted.
    embedder = HuggingFaceEmbedding(model_name=EMBED_MODEL_NAME)
    qdrant = QdrantClient(url=QDRANT_URL)
    store = QdrantVectorStore(client=qdrant, collection_name=QDRANT_COLLECTION)
    return VectorStoreIndex.from_vector_store(
        vector_store=store,
        storage_context=StorageContext.from_defaults(vector_store=store),
        embed_model=embedder,
    )


In [6]:
# --------------------------
# BM25 index built from document text (optional, if needed)
# --------------------------
def simple_tokenize(s: str) -> List[str]:
    """Lower-case ``s`` and split it on runs of whitespace.

    Punctuation is left attached to the tokens. The same tokenizer is applied
    to documents and to queries so that both sides agree.

    Args:
        s: Text to tokenize.

    Returns:
        The lower-cased tokens; an empty list for empty or blank input.
    """
    return s.lower().split()

def build_bm25_corpus(docs: List[Document]) -> Tuple[BM25Okapi, Dict[str, CorpusItem]]:
    """Build a BM25 index over ``docs`` plus a lookup table of its entries.

    The i-th row of the BM25 index corresponds to the i-th key of the returned
    dictionary (insertion order); ``bm25_retrieve`` relies on that alignment
    to map a score position back to a document.

    Args:
        docs: Documents to index. Each id is taken from ``doc_id``, falling
            back to the ``doc_id`` metadata entry.

    Returns:
        ``(bm25, corpus_items)`` where ``corpus_items`` maps id -> ``CorpusItem``.

    Raises:
        ValueError: If ``docs`` yields no indexable document.
    """
    corpus_items: Dict[str, CorpusItem] = {}
    tokenized_corpus: List[List[str]] = []
    for d in docs:
        doc_id = d.doc_id or d.metadata.get("doc_id")
        if doc_id in corpus_items:
            # A repeated id would add a BM25 row without adding a dict key,
            # shifting every later position off by one. Keep the first one.
            continue
        corpus_items[doc_id] = CorpusItem(doc_id=doc_id, text=d.text, metadata=d.metadata)
        tokenized_corpus.append(simple_tokenize(d.text))

    if not tokenized_corpus:
        # rank_bm25 averages document length over the corpus size, so an empty
        # corpus would otherwise surface as an opaque division error.
        raise ValueError("Cannot build a BM25 index from an empty document list.")

    bm25 = BM25Okapi(tokenized_corpus)
    return bm25, corpus_items

In [7]:

# -----------------------------
# RETRIEVAL HELPERS
# -----------------------------
def bm25_retrieve(bm25: BM25Okapi, corpus_items: Dict[str, CorpusItem], query: str, top_k: int) -> List[NodeWithScore]:
    """Return the ``top_k`` best BM25 matches for ``query`` as scored nodes.

    Args:
        bm25: Index whose rows follow the key order of ``corpus_items``.
        corpus_items: Id -> corpus entry table built alongside ``bm25``.
        query: Free-text query; tokenized with ``simple_tokenize``.
        top_k: Maximum number of hits to return.

    Returns:
        Hits ordered by descending BM25 score, each wrapping a ``TextNode``
        rebuilt from the stored corpus entry.
    """
    query_terms = simple_tokenize(query)
    scored_positions = list(enumerate(bm25.get_scores(query_terms)))
    scored_positions.sort(key=lambda pair: pair[1], reverse=True)

    # Score positions are translated to ids through the dict's key order.
    ordered_ids = list(corpus_items)

    def _to_hit(position: int, raw_score) -> NodeWithScore:
        entry = corpus_items[ordered_ids[position]]
        node = TextNode(id_=ordered_ids[position], text=entry.text, metadata=entry.metadata)
        return NodeWithScore(node=node, score=float(raw_score))

    return [_to_hit(position, raw_score) for position, raw_score in scored_positions[:top_k]]

In [8]:
def vector_retrieve(index: VectorStoreIndex, query: str, top_k: int) -> List[NodeWithScore]:
    """Run a similarity search for ``query`` and return up to ``top_k`` nodes.

    Args:
        index: Vector index to search.
        query: Free-text query.
        top_k: Passed to the retriever as ``similarity_top_k``.

    Returns:
        The scored nodes produced by the index's retriever.
    """
    return index.as_retriever(similarity_top_k=top_k).retrieve(query)

In [9]:
# --------------------------
# Reranker builder
# --------------------------
def build_reranker() -> SentenceTransformerRerank:
    """Create the reranker configured by ``RERANK_MODEL_NAME`` and ``RERANK_TOP_N``."""
    reranker_options = {"model": RERANK_MODEL_NAME, "top_n": RERANK_TOP_N}
    return SentenceTransformerRerank(**reranker_options)

In [10]:
def as_results(nodes: List[NodeWithScore]) -> List[Dict]:
    """Convert NodeWithScore → dict rows.

    Only the first ``FINAL_TOP_N`` nodes are converted.

    Args:
        nodes: Scored nodes, best first.

    Returns:
        One dict per node with keys ``doc_id``, ``question``, ``answer`` and
        ``score``. ``doc_id`` falls back to the node's own id whenever the
        metadata does not carry one; a missing score is reported as ``0.0``.
    """
    out = []
    for n in nodes[:FINAL_TOP_N]:
        meta = n.metadata or {}
        out.append({
            # Fall back per key, not per dict: metadata can be present and
            # still lack "doc_id", which previously produced doc_id=None.
            "doc_id": meta.get("doc_id") or n.node.node_id,
            "question": meta.get("question", ""),
            "answer": meta.get("answer", ""),
            "score": float(n.score or 0.0),
        })
    return out

def dedup_exact_question(rows: List[Dict]) -> List[Dict]:
    """Drop rows whose question repeats an earlier one, keeping the first.

    Questions are compared after trimming whitespace and lower-casing; a
    missing or empty question compares as the empty string. Collection stops
    as soon as ``FINAL_TOP_N`` rows have been kept.
    """
    kept: List[Dict] = []
    seen_questions = set()
    for row in rows:
        normalized = (row.get("question") or "").strip().lower()
        if normalized in seen_questions:
            continue
        seen_questions.add(normalized)
        kept.append(row)
        if len(kept) >= FINAL_TOP_N:
            break
    return kept

In [11]:

# --------------------------
# Query (Retrieval Only)
# --------------------------
def query_without_llm(
    index: VectorStoreIndex,
    query: str,
    reranker: Optional[SentenceTransformerRerank] = None,
) -> List[Dict]:
    """Retrieve, rerank and de-duplicate FAQ entries for ``query`` (no LLM).

    Args:
        index: Vector index to search; ``VEC_TOP_K`` candidates are fetched.
        query: Free-text user question.
        reranker: Optional pre-built reranker. Pass one when issuing many
            queries so the reranking model is not rebuilt on every call;
            when omitted, ``build_reranker()`` creates one.

    Returns:
        At most ``FINAL_TOP_N`` dicts with keys ``doc_id``, ``question``,
        ``answer`` and ``score`` (a plain float), in reranked order, with
        repeated questions (case-insensitive) removed.
    """
    candidates = vector_retrieve(index, query, VEC_TOP_K)
    if not candidates:
        # Nothing to rerank: skip building the reranker altogether.
        return []

    if reranker is None:
        reranker = build_reranker()
    reranked = reranker.postprocess_nodes(candidates, query_str=query)

    rows = [
        {
            "doc_id": n.metadata.get("doc_id"),
            "question": n.metadata.get("question"),
            "answer": n.metadata.get("answer"),
            # Same score convention as as_results(): always a built-in float.
            "score": float(n.score or 0.0),
        }
        for n in reranked
    ]
    # Shared helper: first occurrence of each question wins, capped at FINAL_TOP_N.
    return dedup_exact_question(rows)

In [12]:

# --------------------------
# Query (With LLM)
# --------------------------
def query_with_llm(
    index: VectorStoreIndex,
    query: str,
    model: str = "gpt-4o-mini",
    temperature: float = 0,
) -> Dict:
    """Answer ``query`` with an OpenAI model grounded in the retrieved FAQs.

    Args:
        index: Vector index handed to ``query_without_llm``.
        query: Free-text user question.
        model: OpenAI model name used for the completion.
        temperature: Sampling temperature passed to the model.

    Returns:
        A dict with ``query``, the model's stripped ``answer`` and
        ``top_context`` (the retrieval rows the answer was grounded in).

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    # Fail fast, before any retrieval or model loading is paid for.
    if not os.getenv("OPENAI_API_KEY"):
        raise RuntimeError("OPENAI_API_KEY is not set.")

    results = query_without_llm(index, query)
    # Numbered Q/A pairs form the context block of the prompt.
    context_text = "\n\n".join(
        f"[{i+1}] Q: {r['question']}\nA: {r['answer']}" for i, r in enumerate(results)
    )

    prompt = f"""You are a helpful support assistant. Answer the user's question using ONLY the context below.
If the answer is not present, say you don't have enough information.

User question:
{query}

Context:
{context_text}

Return a concise, direct answer.
"""

    llm = OpenAI(model=model, temperature=temperature)
    completion = llm.complete(prompt)

    return {
        "query": query,
        "answer": completion.text.strip(),
        "top_context": results
    }

In [None]:
def prepare_search() -> Dict:
    """Load everything the search helpers need and bundle it in one dict.

    Returns:
        A dict with keys ``docs`` (loaded documents), ``index`` (the Qdrant
        backed vector index), ``bm25`` (the BM25 index) and ``corpus_items``
        (the id -> corpus entry table that accompanies ``bm25``).
    """
    docs = load_maktek_dataset()
    index = connect_to_index()
    bm25, corpus_items = build_bm25_corpus(docs)
    return dict(docs=docs, index=index, bm25=bm25, corpus_items=corpus_items)
