In [1]:
from llama_index.core import Document, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from qdrant_client import QdrantClient
import json
from tqdm import tqdm
import os

In [2]:
# ---------- CONFIG ----------
COLLECTION = "saudi_labor_law"
EMBEDDING_DIM = 768
DATA_DIR = "data/labor_law"
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")

In [3]:
# ---------- LOAD PROCESSED DATA ----------
articles = {}
with open(JSON_PATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

In [4]:
# ------------------------------------------------------
# 6️⃣ Embedding + Qdrant Storage
# ------------------------------------------------------

In [5]:
# ---------- SETUP EMBEDDING MODEL ----------
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
Settings.embed_model = embed_model
#Settings.embed_metadata = False  # we embed manually controlled header instead
Settings.chunk_size = 5096  # or even 8192 if your system has RAM

In [8]:
print("🔗 Connecting to Qdrant...")
qdrant_client = QdrantClient(url="http://localhost:6333")
# Clean & create collection
try:
    qdrant_client.delete_collection(COLLECTION)
except Exception:
    pass

🔗 Connecting to Qdrant...


In [9]:
qdrant_client.create_collection(
    collection_name=COLLECTION,
    vectors_config={"size": EMBEDDING_DIM, "distance": "Cosine"}
)

True

In [10]:
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=COLLECTION)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [11]:

# ---------- BUILD DOCUMENTS ----------
docs = []

for art in articles:
    # Build short searchable header
    header_parts = []
    #if art.get("part_number"):
    #    header_parts.append(f"الباب {art.get('part_number_ar','')} ({art.get('part_number','')}) {art.get('part_title_ar','')}")
    #if art.get("chapter_number"):
    #    header_parts.append(f"الفصل {art['chapter_number_ar']} ({art.get('chapter_number','')}) {art.get('chapter_title_ar','')}")
    #if art.get("index"):
    #    header_parts.append(f"{art['arabic_name']} ({art.get('index','')})")

    header_text = "\n".join(header_parts)

    # Embed Arabic content only (header + article body)
    doc_text = f"{header_text}\n\n{art['arabic_content']}"

    # Metadata for display and structured filtering
    metadata = {
        "part_title": art.get("part_title_ar"),
        "part_number": art.get("part_number"),
        "chapter_title": art.get("chapter_title_ar"),
        "chapter_number": art.get("chapter_number"),
        "article_name": art.get("article_name"),
        "article_number_ar": art.get("number_ar"),
        "article_number": art.get("index"),
        "english_number": art.get("english_number"),
        "arabic": art.get("arabic_content", ""),
        "english": art.get("english_content", "")
    }
 

    docs.append(Document(text=doc_text, metadata=art))

In [12]:
# ---------- STEP 5: Build the index ----------
# ---------- BUILD INDEX ----------
print(f"🔗 Embedding {len(docs)} articles (Arabic + structural header)...")
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
print("✅ Successfully indexed all articles into Qdrant!")

🔗 Embedding 249 articles (Arabic + structural header)...
✅ Successfully indexed all articles into Qdrant!


In [13]:
retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
queries = [
    "ما هي المادة 28؟",
    "What does Article 23 say?",
    "الفصل الثاني الإجازات"
]

In [14]:
for q in queries:
    print(f"\n🔍 Query: {q}")
    results = retriever.retrieve(q)
    for r in results:
        md = r.node.metadata
        print(f"📘 {md.get('article_name')} | {md.get('chapter_title')} |  {md.get('part_title_ar')}")
        print(f"AR:  {md.get('index')}")
        print(f"EN: {md.get('english_content')[:120]}...\n")


🔍 Query: ما هي المادة 28؟
📘 None | None |  العقوبات
AR:  242
EN: 105 (Cancelled)...

📘 None | None |  تفتيش العمل
AR:  211
EN: 74 (Cancelled)...

📘 None | None |  التعريفات والأحكام العامة
AR:  8
EN: Any condition that contravenes the provisions of these Regulations, and any discharge or conciliation of the rights aris...


🔍 Query: What does Article 23 say?
📘 None | None |  العقوبات
AR:  233
EN: 96. A fine of not less than 200,000 rials and not more than 500,000 rials shall be imposed on anyone who contravenes the...

📘 None | None |  توظيف غير السعوديين
AR:  33
EN: Proceedings for the purpose of work may be made only with the approval of the Ministry....

📘 None | None |  التعريفات والأحكام العامة
AR:  22
EN: In order to implement the provisions of this regulation, the Minister shall coordinate with the relevant authorities whe...


🔍 Query: الفصل الثاني الإجازات
📘 None | None |  شروط العمل وظروفه
AR:  114
EN: Every worker has the right to leave with full pay during the holidays and

In [60]:
index = list()
for art in articles:
    index.append({art['index']:art['number_ar']})

In [61]:
index

[{1: 'الأولى'},
 {2: 'الثانية'},
 {3: 'الثالثة'},
 {4: 'الرابعة'},
 {5: 'الخامسة'},
 {6: 'السادسة'},
 {7: 'السابعة'},
 {8: 'الثامنة'},
 {9: 'التاسعة'},
 {10: 'العاشرة'},
 {11: 'الحادية عشرة'},
 {11: 'الحادية عشرة مكرر'},
 {12: 'الثانية عشرة'},
 {13: 'الثالثة عشرة'},
 {14: 'الرابعة عشرة'},
 {15: 'الخامسة عشرة'},
 {16: 'السادسة عشرة'},
 {17: 'السابعة عشرة'},
 {18: 'الثامنة عشرة'},
 {19: 'التاسعة عشرة'},
 {20: 'العشرون'},
 {21: 'الحادية والعشرون'},
 {22: 'الثانية والعشرون'},
 {23: 'الثالثة والعشرون'},
 {24: 'الرابعة والعشرون'},
 {25: 'الخامسة والعشرون'},
 {26: 'خلال شهر محرم من كل عام. \n \nالمادة السادسة والعشرون'},
 {27: 'السابعة والعشرون'},
 {28: 'الثامنة والعشرون'},
 {29: 'التاسعة والعشرون'},
 {30: 'الثلاثون'},
 {31: 'الحادية والثلاثون'},
 {32: 'الثانية والثلاثون'},
 {33: 'الثالثة والثلاثون'},
 {34: 'الرابعة والثلاثون'},
 {35: 'الخامسة والثلاثون'},
 {36: 'السادسة والثلاثون'},
 {37: 'السابعة والثلاثون'},
 {38: 'الثامنة والثلاثون'},
 {39: 'التاسعة والثلاثون'},
 {40: 'الأربعون'},
 {41: '