In [1]:
from llama_index.core import Document, Settings
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from qdrant_client import QdrantClient
import json
from tqdm import tqdm
import os

In [2]:
# ---------- CONFIG ----------
COLLECTION = "saudi_labor_law"
EMBEDDING_DIM = 768
DATA_DIR = "data/labor_law"
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")

In [3]:
# ---------- LOAD PROCESSED DATA ----------
articles = {}
with open(JSON_PATH, "r", encoding="utf-8") as f:
    articles = json.load(f)

In [4]:
# ------------------------------------------------------
# 6๏ธโฃ Embedding + Qdrant Storage
# ------------------------------------------------------

In [5]:
# ---------- SETUP EMBEDDING MODEL ----------
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
Settings.embed_model = embed_model
#Settings.embed_metadata = False  # we embed manually controlled header instead
Settings.chunk_size = 5096  # or even 8192 if your system has RAM

In [8]:
print("๐ Connecting to Qdrant...")
qdrant_client = QdrantClient(url="http://localhost:6333")
# Clean & create collection
try:
    qdrant_client.delete_collection(COLLECTION)
except Exception:
    pass

๐ Connecting to Qdrant...


In [9]:
qdrant_client.create_collection(
    collection_name=COLLECTION,
    vectors_config={"size": EMBEDDING_DIM, "distance": "Cosine"}
)

True

In [10]:
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=COLLECTION)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [11]:

# ---------- BUILD DOCUMENTS ----------
docs = []

for art in articles:
    # Build short searchable header
    header_parts = []
    #if art.get("part_number"):
    #    header_parts.append(f"ุงูุจุงุจ {art.get('part_number_ar','')} ({art.get('part_number','')}) {art.get('part_title_ar','')}")
    #if art.get("chapter_number"):
    #    header_parts.append(f"ุงููุตู {art['chapter_number_ar']} ({art.get('chapter_number','')}) {art.get('chapter_title_ar','')}")
    #if art.get("index"):
    #    header_parts.append(f"{art['arabic_name']} ({art.get('index','')})")

    header_text = "\n".join(header_parts)

    # Embed Arabic content only (header + article body)
    doc_text = f"{header_text}\n\n{art['arabic_content']}"

    # Metadata for display and structured filtering
    metadata = {
        "part_title": art.get("part_title_ar"),
        "part_number": art.get("part_number"),
        "chapter_title": art.get("chapter_title_ar"),
        "chapter_number": art.get("chapter_number"),
        "article_name": art.get("article_name"),
        "article_number_ar": art.get("number_ar"),
        "article_number": art.get("index"),
        "english_number": art.get("english_number"),
        "arabic": art.get("arabic_content", ""),
        "english": art.get("english_content", "")
    }
 

    docs.append(Document(text=doc_text, metadata=art))

In [12]:
# ---------- STEP 5: Build the index ----------
# ---------- BUILD INDEX ----------
print(f"๐ Embedding {len(docs)} articles (Arabic + structural header)...")
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
print("โ Successfully indexed all articles into Qdrant!")

๐ Embedding 249 articles (Arabic + structural header)...
โ Successfully indexed all articles into Qdrant!


In [13]:
retriever = VectorIndexRetriever(index=index, similarity_top_k=3)
queries = [
    "ูุง ูู ุงููุงุฏุฉ 28ุ",
    "What does Article 23 say?",
    "ุงููุตู ุงูุซุงูู ุงูุฅุฌุงุฒุงุช"
]

In [14]:
for q in queries:
    print(f"\n๐ Query: {q}")
    results = retriever.retrieve(q)
    for r in results:
        md = r.node.metadata
        print(f"๐ {md.get('article_name')} | {md.get('chapter_title')} |  {md.get('part_title_ar')}")
        print(f"AR:  {md.get('index')}")
        print(f"EN: {md.get('english_content')[:120]}...\n")


๐ Query: ูุง ูู ุงููุงุฏุฉ 28ุ
๐ None | None |  ุงูุนููุจุงุช
AR:  242
EN: 105 (Cancelled)...

๐ None | None |  ุชูุชูุด ุงูุนูู
AR:  211
EN: 74 (Cancelled)...

๐ None | None |  ุงูุชุนุฑููุงุช ูุงูุฃุญูุงู ุงูุนุงูุฉ
AR:  8
EN: Any condition that contravenes the provisions of these Regulations, and any discharge or conciliation of the rights aris...


๐ Query: What does Article 23 say?
๐ None | None |  ุงูุนููุจุงุช
AR:  233
EN: 96. A fine of not less than 200,000 rials and not more than 500,000 rials shall be imposed on anyone who contravenes the...

๐ None | None |  ุชูุธูู ุบูุฑ ุงูุณุนูุฏููู
AR:  33
EN: Proceedings for the purpose of work may be made only with the approval of the Ministry....

๐ None | None |  ุงูุชุนุฑููุงุช ูุงูุฃุญูุงู ุงูุนุงูุฉ
AR:  22
EN: In order to implement the provisions of this regulation, the Minister shall coordinate with the relevant authorities whe...


๐ Query: ุงููุตู ุงูุ

In [60]:
index = list()
for art in articles:
    index.append({art['index']:art['number_ar']})

In [61]:
index

[{1: 'ุงูุฃููู'},
 {2: 'ุงูุซุงููุฉ'},
 {3: 'ุงูุซุงูุซุฉ'},
 {4: 'ุงูุฑุงุจุนุฉ'},
 {5: 'ุงูุฎุงูุณุฉ'},
 {6: 'ุงูุณุงุฏุณุฉ'},
 {7: 'ุงูุณุงุจุนุฉ'},
 {8: 'ุงูุซุงููุฉ'},
 {9: 'ุงูุชุงุณุนุฉ'},
 {10: 'ุงูุนุงุดุฑุฉ'},
 {11: 'ุงูุญุงุฏูุฉ ุนุดุฑุฉ'},
 {11: 'ุงูุญุงุฏูุฉ ุนุดุฑุฉ ููุฑุฑ'},
 {12: 'ุงูุซุงููุฉ ุนุดุฑุฉ'},
 {13: 'ุงูุซุงูุซุฉ ุนุดุฑุฉ'},
 {14: 'ุงูุฑุงุจุนุฉ ุนุดุฑุฉ'},
 {15: 'ุงูุฎุงูุณุฉ ุนุดุฑุฉ'},
 {16: 'ุงูุณุงุฏุณุฉ ุนุดุฑุฉ'},
 {17: 'ุงูุณุงุจุนุฉ ุนุดุฑุฉ'},
 {18: 'ุงูุซุงููุฉ ุนุดุฑุฉ'},
 {19: 'ุงูุชุงุณุนุฉ ุนุดุฑุฉ'},
 {20: 'ุงูุนุดุฑูู'},
 {21: 'ุงูุญุงุฏูุฉ ูุงูุนุดุฑูู'},
 {22: 'ุงูุซุงููุฉ ูุงูุนุดุฑูู'},
 {23: 'ุงูุซุงูุซุฉ ูุงูุนุดุฑูู'},
 {24: 'ุงูุฑุงุจุนุฉ ูุงูุนุดุฑูู'},
 {25: 'ุงูุฎุงูุณุฉ ูุงูุนุดุฑูู'},
 {26: 'ุฎูุงู ุดูุฑ ูุญุฑู ูู ูู ุนุงู. \n \nุงููุงุฏุฉ ุงูุณุงุฏุณุฉ ูุงูุนุดุฑูู'},
 {27: 'ุงูุณุงุจุนุฉ ูุงูุนุดุฑูู'},
 {28: 'ุงูุซุงููุฉ ูุงูุนุด