In [1]:
import os
from pypdf import PdfReader

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# ==============================
# CONFIG
# ==============================

# Relative base directory that every path below is joined onto.
ROOT = "../"

# Input folders listed by collect_law_documents(), one per jurisdiction.
CENTRAL_PATH = os.path.join(ROOT, "central")
KARNATAKA_PATH = os.path.join(ROOT, "karnataka")

# Output folder that build_vector_db() saves the FAISS index into.
VECTOR_PATH = os.path.join(ROOT, "vector_db")

# ==============================
# PDF LOADER
# ==============================

def load_pdf_text(path):
    """Return the text of every page of the PDF at ``path`` as one string.

    Pages are read in order and each page's text is followed by a newline.
    Pages for which ``extract_text()`` returns nothing (``None`` or ``""``)
    contribute nothing to the result.
    """
    page_texts = (page.extract_text() for page in PdfReader(path).pages)
    return "".join(page_text + "\n" for page_text in page_texts if page_text)

# ==============================
# COLLECT LAW DOCUMENTS
# ==============================

def _collect_pdfs_from(folder, law_type):
    """Return one ``Document`` per PDF file found directly inside ``folder``.

    Each document holds the file's extracted text and the metadata
    ``{"law_type": law_type, "source": <file name>}``.
    """
    documents = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document order (and so the index build order) the same on every run.
    for file in sorted(os.listdir(folder)):
        # Compare the extension case-insensitively so "ACT.PDF" is not skipped.
        if not file.lower().endswith(".pdf"):
            continue

        text = load_pdf_text(os.path.join(folder, file))
        documents.append(
            Document(
                page_content=text,
                metadata={
                    "law_type": law_type,
                    "source": file
                }
            )
        )

    return documents


def collect_law_documents():
    """Load every law PDF from the central and Karnataka folders.

    Returns:
        A list of ``Document`` objects, central laws first and Karnataka laws
        second, each group ordered by file name. ``metadata["law_type"]`` is
        ``"central"`` or ``"karnataka"`` and ``metadata["source"]`` is the
        PDF's file name.

    Raises:
        OSError: if ``CENTRAL_PATH`` or ``KARNATAKA_PATH`` cannot be listed.
    """
    return [
        *_collect_pdfs_from(CENTRAL_PATH, "central"),
        *_collect_pdfs_from(KARNATAKA_PATH, "karnataka"),
    ]

# ==============================
# CHUNKING
# ==============================

def chunk_documents(documents, chunk_size=700, chunk_overlap=120):
    """Split documents into overlapping chunks for embedding.

    Args:
        documents: the ``Document`` objects to split.
        chunk_size: maximum size of each chunk, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 700, the value
            chosen for legal text.
        chunk_overlap: overlap between neighbouring chunks, passed to
            ``RecursiveCharacterTextSplitter``. Defaults to 120.

    Returns:
        Whatever ``splitter.split_documents(documents)`` returns, i.e. the
        chunked documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )

    return splitter.split_documents(documents)

# ==============================
# BUILD VECTOR DATABASE
# ==============================

def build_vector_db():
    """Build the FAISS vector database from the law PDFs and save it.

    Steps: collect the law documents, split them into chunks, embed the
    chunks with the local ``sentence-transformers/all-MiniLM-L6-v2`` model,
    and save the resulting index to ``VECTOR_PATH``. Progress is printed at
    each step.

    Raises:
        ValueError: if chunking produced no chunks, so there is nothing to
            index.
    """
    print("Collecting laws...")
    docs = collect_law_documents()

    print(f"Loaded {len(docs)} law documents")

    print("Chunking...")
    chunks = chunk_documents(docs)

    print(f"Created {len(chunks)} chunks")

    # Stop before loading the embedding model: building an index from an
    # empty chunk list would otherwise fail later with an unhelpful error.
    if not chunks:
        raise ValueError(
            "No text chunks were produced; check that the law folders "
            "contain PDFs with extractable text."
        )

    print("Creating local HF embeddings...")

    # LOCAL EMBEDDINGS (NO API KEY)
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    db = FAISS.from_documents(chunks, embeddings)

    db.save_local(VECTOR_PATH)

    print("Vector DB saved at:", VECTOR_PATH)

# ==============================
# RUN
# ==============================

# Entry point: runs the full build when this code executes as the main module
# (which includes executing it as a notebook cell).
if __name__ == "__main__":
    build_vector_db()


  from .autonotebook import tqdm as notebook_tqdm


Collecting laws...
Loaded 6 law documents
Chunking...
Created 2274 chunks
Creating local HF embeddings...


  embeddings = HuggingFaceEmbeddings(
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 336.58it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Vector DB saved at: ../vector_db


## Loading the DB

In [2]:
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Must be the same embedding model that was used when the index was built.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# NOTE(security): allow_dangerous_deserialization=True lets load_local()
# unpickle index.pkl, and unpickling can execute arbitrary code. Only load a
# vector_db folder that you built yourself or otherwise trust.
db = FAISS.load_local(
    "../vector_db",
    embeddings,
    allow_dangerous_deserialization=True
)


Loading weights: 100%|██████████| 103/103 [00:00<00:00, 184.81it/s, Materializing param=pooler.dense.weight]                             
[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Loading the DB takes about 7 seconds on average.

# Legal Law Vector Database – Handover Summary

1. This vector database is used for **legal compliance retrieval** and contains embeddings built from **Central law PDFs** and **Karnataka law PDFs** only (no templates or extracted JSONs). 

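2. The embeddings were generated using **sentence-transformers/all-MiniLM-L6-v2**, and the same model **must** be used when loading the DB; with a different model, similarity search will either raise an error (embedding-dimension mismatch) or silently return irrelevant results.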

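3. The database files are `index.faiss` and `index.pkl`; both are required and must be kept together in the same folder. Load the DB using `HuggingFaceEmbeddings` with the same model and FAISS `load_local()`. Because `index.pkl` is a pickle file, `load_local()` needs `allow_dangerous_deserialization=True`, so only load DB files from a trusted source.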

4. Chunking settings used during creation: `chunk_size = 700`, `chunk_overlap = 120`; rebuilding with different settings changes retrieval behavior. 

5. First run downloads the MiniLM model (~90MB), after which everything runs locally and offline (no HuggingFace API key required). Do not rebuild the vector DB unless law PDFs change. Correct usage is to generate **legal queries** from extracted JSON and run similarity search; do not send full JSON directly into vector search. This DB is strictly a semantic retrieval layer — Gemini only reads retrieved text, not embeddings.

6. Required packages: `pip install sentence-transformers langchain-core langchain-community langchain-text-splitters faiss-cpu pypdf`