In [None]:
from llama_index.core import Document, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from qdrant_client import QdrantClient
import json
from tqdm import tqdm
import os

In [None]:
# ---------- CONFIG ----------
# Input: the parsed articles live in a single JSON file under DATA_DIR.
DATA_DIR = "data/labor_law"
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")

# Vector store: name of the Qdrant collection and the size of its vectors.
COLLECTION = "saudi_labor_law"
# Used as the collection's vector size, so it has to equal the output
# dimension of whichever embedding model is configured.
EMBEDDING_DIM = 768

In [None]:
# ---------- LOAD PROCESSED DATA ----------
# The parsed law is expected to be a JSON array with one object per article:
# later cells iterate over `articles` and read each item's fields with .get().
# `articles` is deliberately not pre-initialised, so a failed load cannot leave
# an empty placeholder behind for later cells to index.
with open(JSON_PATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

# Fail here with a clear message rather than deep inside the document loop.
if not isinstance(articles, list):
    raise TypeError(
        f"Expected a JSON array of article objects in {JSON_PATH}, "
        f"got {type(articles).__name__}"
    )

In [None]:
# ------------------------------------------------------
# Embedding + Qdrant storage: embed each article and store its vector in Qdrant
# ------------------------------------------------------

In [None]:
# ---------- SETUP EMBEDDING MODEL ----------
# A single multilingual model is registered as the global LlamaIndex default;
# no later cell passes an embedding model explicitly, so they rely on this.
# NOTE(review): the E5 model card describes "query: " / "passage: " input
# prefixes — confirm whether they need to be supplied for this model here.
embed_model = HuggingFaceEmbedding(
    model_name="intfloat/multilingual-e5-base",
)
Settings.embed_model = embed_model

# Disabled experiment, kept for reference:
#Settings.embed_metadata = False  # we embed manually controlled header instead

# Large chunk size, presumably so each article stays in a single chunk.
# NOTE(review): 5096 is an unusual value (4096 may have been intended), and the
# embedding model may truncate inputs well below this length — confirm.
Settings.chunk_size = 5096  # or even 8192 if your system has RAM

In [None]:
print("🔗 Connecting to Qdrant...")
qdrant_client = QdrantClient(url="http://localhost:6333")

# Clean & create collection: drop any previous copy so the index is rebuilt
# from scratch. The delete stays best-effort (a failure does not stop the
# notebook), but the error is reported instead of being swallowed silently, so
# an unreachable server or a permissions problem is visible right here.
try:
    qdrant_client.delete_collection(COLLECTION)
except Exception as exc:
    print(f"⚠️ Could not delete collection '{COLLECTION}': {exc!r}")

In [None]:
# Create the collection that will hold one vector per indexed chunk:
# EMBEDDING_DIM-sized vectors compared with cosine distance.
qdrant_client.create_collection(
    vectors_config=dict(size=EMBEDDING_DIM, distance="Cosine"),
    collection_name=COLLECTION,
)

In [None]:
# Expose the Qdrant collection as a LlamaIndex vector store and wrap it in a
# storage context, which is what the index-building cell receives.
vector_store = QdrantVectorStore(
    collection_name=COLLECTION,
    client=qdrant_client,
)
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

In [None]:

# ---------- BUILD DOCUMENTS ----------
# One Document per article:
#   * text     -> the Arabic body, preceded by the structural header when one
#                 is built. This is the only content meant to be embedded.
#   * metadata -> the full parsed article record under its original keys
#                 (e.g. part_title_ar, index, english_content), so retrieval
#                 results can display any field.
docs = []

for art in articles:
    # Build short searchable header.
    # NOTE: every header component below is currently disabled, so
    # header_parts stays empty and the text is just the Arabic article body.
    header_parts = []
    #if art.get("part_number"):
    #    header_parts.append(f"الباب {art.get('part_number_ar','')} ({art.get('part_number','')}) {art.get('part_title_ar','')}")
    #if art.get("chapter_number"):
    #    header_parts.append(f"الفصل {art['chapter_number_ar']} ({art.get('chapter_number','')}) {art.get('chapter_title_ar','')}")
    #if art.get("index"):
    #    header_parts.append(f"{art['arabic_name']} ({art.get('index','')})")

    header_text = "\n".join(header_parts)

    # Embed Arabic content only (header + article body).
    # A KeyError here means the article record has no Arabic text at all.
    body_text = art["arabic_content"]
    # Only prepend the header (and its blank-line separator) when there is one,
    # so the text does not start with stray empty lines.
    doc_text = f"{header_text}\n\n{body_text}" if header_text else body_text

    docs.append(
        Document(
            text=doc_text,
            metadata=art,
            # LlamaIndex adds metadata to the embedded text by default. Every
            # key is excluded so the vector reflects doc_text only, instead of
            # also embedding the duplicated Arabic body and English translation.
            excluded_embed_metadata_keys=list(art),
        )
    )

In [None]:
# ---------- BUILD INDEX ----------
# Embeds every document and writes the resulting vectors into the Qdrant
# collection behind storage_context.
print(f"🔗 Embedding {len(docs)} articles (Arabic + structural header)...")
index = VectorStoreIndex.from_documents(
    docs,
    storage_context=storage_context,
)
print("✅ Successfully indexed all articles into Qdrant!")

In [None]:
# Retriever that returns the 3 most similar indexed entries for each query.
retriever = VectorIndexRetriever(
    similarity_top_k=3,
    index=index,
)

# Sample queries for a quick sanity check: an article lookup in Arabic, an
# article lookup in English, and a chapter title in Arabic.
queries = [
    "ما هي المادة 28؟",
    "What does Article 23 say?",
    "الفصل الثاني الإجازات",
]

In [None]:
# Smoke-test retrieval: run each sample query and print a short summary of
# every hit from the metadata stored alongside its vector.
for q in queries:
    print(f"\n🔍 Query: {q}")
    results = retriever.retrieve(q)
    for r in results:
        md = r.node.metadata
        # The stored metadata is the raw article record, where the chapter
        # title is read as `chapter_title_ar`; fall back to it so the column
        # is not always None.
        chapter_title = md.get('chapter_title') or md.get('chapter_title_ar')
        # english_content is optional; slicing None would raise TypeError.
        english_preview = (md.get('english_content') or "")[:120]
        print(f"📘 {md.get('article_name')} | {chapter_title} |  {md.get('part_title_ar')}")
        # NOTE(review): the "AR:" label prints the article index, not Arabic
        # text — confirm this is the intended field.
        print(f"AR:  {md.get('index')}")
        print(f"EN: {english_preview}...\n")