
# 🧠 Semantic Search & RAG Engine for Enterprise Docs

This notebook replicates the Streamlit app for performing semantic search and retrieval-augmented generation (RAG) on uploaded PDF documents. The app supports both local Hugging Face models and OpenAI GPT; the cells below demonstrate the local Hugging Face path.

Developed by **Dr. Al Rey Villagracia**


In [None]:

!pip install PyMuPDF sentence-transformers scikit-learn torch transformers accelerate


In [None]:

import fitz  # PyMuPDF
import numpy as np
import os
import re
import time
import csv
from datetime import datetime
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch


In [None]:

def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_path: Path of the PDF to read; passed straight to ``fitz.open``.

    Returns:
        One string containing each page's ``get_text()`` output, with a
        newline between consecutive pages.
    """
    # Use the document as a context manager so it is closed even when a page
    # fails to extract; a bare open()/close() pair leaks it on error.
    with fitz.open(file_path) as doc:
        return "\n".join(page.get_text() for page in doc)

def chunk_text(text, chunk_size=200, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the one before it.

    Args:
        text: The text to split. Words are whatever ``str.split()`` yields.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of chunk strings (words re-joined with single spaces). Empty
        or whitespace-only text yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        # overlap == chunk_size would make the stride zero; a larger overlap
        # would make it negative and silently produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), stride):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would be a
        # strict suffix of this one and only add duplicate content.
        if start + chunk_size >= len(words):
            break
    return chunks

def embed_chunks(chunks, model):
    """Encode every chunk with the given embedding model.

    Args:
        chunks: The chunk strings to embed.
        model: Any object exposing ``encode(chunks)``.

    Returns:
        Whatever ``model.encode`` returns for ``chunks``.
    """
    vectors = model.encode(chunks)
    return vectors

def search(query, model, index, chunks, top_k=5):
    """Return the chunks most similar to the query, best match first.

    Args:
        query: The search string.
        model: Embedding model exposing ``encode``; it is called with a
            one-element list containing ``query``.
        index: Chunk embeddings to compare against (presumably one row per
            entry of ``chunks`` -- confirm against the caller).
        chunks: The chunk texts that ``index`` was built from.
        top_k: Maximum number of results to return.

    Returns:
        A list of ``(chunk, similarity)`` tuples ordered by descending
        cosine similarity.
    """
    query_embedding = model.encode([query])
    # cosine_similarity returns one row per query; there is only one query.
    scores = cosine_similarity(query_embedding, index)[0]
    ranked = np.argsort(scores)[::-1]
    results = []
    for position in ranked[:top_k]:
        results.append((chunks[position], scores[position]))
    return results

def heuristic_recall(query, top_chunks):
    """Return the fraction of retrieved chunks that contain the query text.

    This is a rough proxy rather than true recall: a chunk counts as a hit
    only when the whole query appears in it as a case-insensitive substring.

    Args:
        query: The search string.
        top_chunks: Sequence of ``(chunk_text, score)`` pairs; the score is
            ignored.

    Returns:
        ``hits / len(top_chunks)`` as a float, or ``0.0`` when ``top_chunks``
        is empty.
    """
    # Nothing retrieved means nothing matched; avoid dividing by zero.
    if not top_chunks:
        return 0.0
    needle = query.lower()
    hits = sum(1 for chunk, _ in top_chunks if needle in chunk.lower())
    return hits / len(top_chunks)

def measure_latency(func, *args, **kwargs):
    """Call ``func`` and report how long the call took.

    Args:
        func: The callable to time.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        A ``(result, elapsed)`` tuple, where ``result`` is what ``func``
        returned and ``elapsed`` is the ``time.perf_counter()`` difference
        in seconds.
    """
    started_at = time.perf_counter()
    outcome = func(*args, **kwargs)
    elapsed = time.perf_counter() - started_at
    return outcome, elapsed

def generate_answer_local(query, top_chunks, tokenizer, model, max_tokens=256):
    """Generate an answer to ``query`` from retrieved chunks with a local LLM.

    Args:
        query: The user's question.
        top_chunks: Sequence of ``(chunk_text, score)`` pairs; the chunk
            texts are joined with blank lines to form the prompt context.
        tokenizer: Tokenizer for ``model``; called with the prompt and
            ``return_tensors="pt"``, and used to decode the output.
        model: Causal language model exposing ``device`` and ``generate``.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The generated continuation as a stripped string. Sampling is enabled,
        so repeated calls may return different answers.
    """
    context = "\n\n".join(chunk for chunk, _ in top_chunks)
    prompt = f"""Use the following context to answer the question:

    ---CONTEXT START---
    {context}
    ---CONTEXT END---

    Question: {query}
    Answer:"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=True, top_p=0.95, temperature=0.7)
    # A causal LM returns the prompt tokens followed by the completion. Slice
    # off the prompt rather than splitting the decoded text on "Answer:",
    # which truncates the reply whenever the model emits "Answer:" itself.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()

def log_to_csv(query, llm_answer, retrieval_latency, llm_latency, recall, feedback=None, model_used=None, log_file="rag_query_log.csv"):
    """Append one query record to a CSV log, writing the header when needed.

    Args:
        query: The user's question.
        llm_answer: The generated answer.
        retrieval_latency: Retrieval time to record.
        llm_latency: Generation time to record.
        recall: Recall score to record.
        feedback: Optional feedback label.
        model_used: Optional name of the model that produced the answer.
        log_file: Path of the CSV file to append to.

    The row also gets a ``timestamp`` column holding the current local time
    in ISO 8601 format.
    """
    fieldnames = ["timestamp", "query", "llm_answer", "retrieval_latency", "llm_latency", "recall", "feedback", "model_used"]
    row = {
        "timestamp": datetime.now().isoformat(),
        "query": query,
        "llm_answer": llm_answer,
        "retrieval_latency": retrieval_latency,
        "llm_latency": llm_latency,
        "recall": recall,
        "feedback": feedback,
        "model_used": model_used,
    }
    # An existing but empty file (e.g. one that was pre-created or truncated)
    # still needs a header; checking existence alone leaves it header-less.
    needs_header = not os.path.exists(log_file) or os.path.getsize(log_file) == 0
    with open(log_file, mode='a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)


In [None]:

# End-to-end demo: read one PDF, index its chunks, retrieve the chunks most
# similar to the query, and have a local LLM answer from them.
# The names defined here (query, answer, retrieval_latency, llm_latency,
# recall, hf_model_name) are reused by the logging cell that follows.
pdf_path = "sample_test_doc.pdf"  # Replace with your own
query = "What are the KPIs mentioned?"

# Sentence-embedding model used for both the chunks and the query
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# Local causal LLM and its tokenizer, loaded in half precision (float16)
hf_model_name = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(hf_model_name)
llm = AutoModelForCausalLM.from_pretrained(hf_model_name, torch_dtype=torch.float16, device_map="auto")

# Extract the PDF text, split it into overlapping word chunks, and embed them
text = extract_text_from_pdf(pdf_path)
chunks = chunk_text(text)
embeddings = embed_chunks(chunks, embed_model)
index = np.array(embeddings)

# Semantic search; measure_latency returns (result, elapsed seconds)
top_chunks, retrieval_latency = measure_latency(search, query, embed_model, index, chunks)

# Generate the answer from the retrieved chunks, timed the same way
answer, llm_latency = measure_latency(generate_answer_local, query, top_chunks, tokenizer, llm)

# Heuristic score: share of retrieved chunks containing the query verbatim
# (case-insensitive); printed below as "Recall@k"
recall = heuristic_recall(query, top_chunks)

# Show results
print("Query:", query)
print("Answer:", answer)
print(f"Retrieval Latency: {retrieval_latency:.2f}s | LLM Latency: {llm_latency:.2f}s | Recall@k: {recall:.2f}")


In [None]:

# Append this run's query, answer, timings and recall score to the CSV log
# (log_to_csv's default path, rag_query_log.csv). The feedback value is
# hard-coded to "positive" for this demo run.
log_to_csv(query, answer, retrieval_latency, llm_latency, recall, feedback="positive", model_used=hf_model_name)
print("✅ Logged to rag_query_log.csv")
