In [7]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, Settings , StorageContext
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models
from llama_index.core.retrievers import VectorIndexRetriever

In [8]:
# ---------- CONFIG ----------
POLICY_DIR = "data/policies"  # put your PDFs here
COLLECTION = "labor_law_ar"
EMBEDDING_DIMENSIONALITY = 768

In [9]:
### ---------- STEP 1: Load documents ----------####
### SimpleDirectoryReader auto-reads PDFs, DOCX, TXT###

In [10]:
raw_docs = SimpleDirectoryReader(POLICY_DIR).load_data()



In [40]:
### For multi-service tagging, you can enrich metadata here ###
### Example: tag by keywords (leave, overtime, payroll, etc.)
raw_docs

[Document(id_='7af00579-728a-4e89-9b76-6759554d8e39', embedding=None, metadata={'page_label': '1', 'file_name': 'labor_law_ar.pdf', 'file_path': 'c:\\Users\\zaina\\OneDrive\\llm-zoomcamp\\llm-zoomcamp\\hr_assistant_LlamaIndex\\data\\policies\\labor_law_ar.pdf', 'file_type': 'application/pdf', 'file_size': 1141447, 'creation_date': '2025-10-06', 'last_modified_date': '2025-10-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='0  \n      \nرداصلا  موسرملاب  يكلملا  مقر ) م /51( خيراتو  23/8/1426 ه لدعملا  موسرملاب  يكلملا  مقر ) م /24( خيراتو  12/5/1434 ـه   لدعملا  موسرملاب  يكلملا  مقر ) م /١ ( خيراتو  22/1/1435 ـه   لدعملا  موسرملاب  ي

In [12]:
### ---------- STEP 2: Define embedding model ----------

In [13]:
embed_model = HuggingFaceEmbedding(model_name="intfloat/multilingual-e5-base")
Settings.embed_model = embed_model  # set default embedding model globally

2025-10-06 15:35:20,157 - INFO - Load pretrained SentenceTransformer: intfloat/multilingual-e5-base


In [14]:
### ---------- STEP 3: Connect to Qdrant ----------

In [15]:
qdrant_client = QdrantClient(url="http://localhost:6333")

2025-10-06 15:35:28,297 - INFO - HTTP Request: GET http://localhost:6333 "HTTP/1.1 200 OK"


In [16]:
#### Create the collection with specified vector parameters

In [17]:
qdrant_client.delete_collection(COLLECTION)
qdrant_client.create_collection(
    collection_name=COLLECTION,
    vectors_config={"size":EMBEDDING_DIMENSIONALITY, "distance": "Cosine"}
)

2025-10-06 15:35:31,837 - INFO - HTTP Request: DELETE http://localhost:6333/collections/labor_law_ar "HTTP/1.1 200 OK"
2025-10-06 15:35:33,065 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar "HTTP/1.1 200 OK"


True

In [18]:
vector_store = QdrantVectorStore(client=qdrant_client, collection_name=COLLECTION)

2025-10-06 15:36:00,526 - INFO - HTTP Request: GET http://localhost:6333/collections/labor_law_ar/exists "HTTP/1.1 200 OK"
2025-10-06 15:36:00,555 - INFO - HTTP Request: GET http://localhost:6333/collections/labor_law_ar "HTTP/1.1 200 OK"


In [19]:
### ---------- STEP 4: Create index ----------

In [20]:
storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [25]:
raw_docs[35]

Document(id_='8507b7cd-1f5c-4a32-bd1a-a88f056e69ae', embedding=None, metadata={'page_label': '36', 'file_name': 'labor_law_ar.pdf', 'file_path': 'c:\\Users\\zaina\\OneDrive\\llm-zoomcamp\\llm-zoomcamp\\hr_assistant_LlamaIndex\\data\\policies\\labor_law_ar.pdf', 'file_type': 'application/pdf', 'file_size': 1141447, 'creation_date': '2025-10-06', 'last_modified_date': '2025-10-03'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text=' 33 \nلصفلا  عبارلا  تازاجلإا  ةداملا  ةعساتلا  دعب  ةئاملا  :  1. قحتــسي  لــماعلا  نــع  لــك  ماــع  ةزاــجإ  ةيونــس  لا  لــقت  اهتدــم  نــع  دــحاو  نيرــشعو  يومــاً ، تــُزاد إلــى مــدة لا تقــل عــن ثلاث

In [26]:
index = VectorStoreIndex.from_documents(raw_docs, vector_store=vector_store,storage_context=storage_context)

2025-10-06 15:45:41,387 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:41,693 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:41,897 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:42,092 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:42,296 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:42,539 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:42,870 - INFO - HTTP Request: PUT http://localhost:6333/collections/labor_law_ar/points?wait=true "HTTP/1.1 200 OK"
2025-10-06 15:45:43,168 - INFO - HTTP Request: PUT http://localhost:6

In [27]:
# ---------- STEP 5: Query with service filter ----------

In [28]:
# Create retriever from your index
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=3  # number of chunks to return
)

In [38]:
results = retriever.retrieve("المادة 45")

2025-10-06 15:58:48,685 - INFO - HTTP Request: POST http://localhost:6333/collections/labor_law_ar/points/search "HTTP/1.1 200 OK"


In [39]:
for r in results:
    print("Score:", r.score)
    print("Text:", r.node.text[:300], "...\n")

Score: 0.8359781
Text: 46 
      10 بابلا  رشاعلا   ليغشت  ثادحلأا ...

Score: 0.8312733
Text: 49- عدلت هذه المادة بناءً على المرسوم الملكي رقم )م/46( خيراتو  5/6/1436هـ، وعدلت هذه المادة بناءً على المرسوم يكلملا  مقر ) م /44( خيراتب  8/2/1446ـه .  50-تلدع  هذه  ةداملا  موسرملاب  يكلملا  مقر ) م/46( خيراتو  5/6/1436ـه . ...

Score: 0.8291223
Text: 19- ﺖﻟﺪﻋ  هﺬﻫ  ةدﺎﳌا  ﺑﻨﺎءً1 ﻋلى اﳌﺮﺳﻮم اﳌﻠكى:1 رﻗﻢ )م /44( ﺦﻳرﺎﺘﺑ  8/2/1446ـﻫ 20- ﻋﺪﻟﺖ ﻫﺬه اﳌﺎدة ﺑﻨﺎءً1 ﻋلى اﳌﺮﺳﻮم اﳌﻠكى:1 رﻗﻢ )م/24( ﺦﻳرﺎﺗو  12/5/1434ـﻫ، وﺖﻟﺪﻋ  هﺬﻫ  ةدﺎﳌا  ﺑﻨﺎءً1 ﻋلى اﳌﺮﺳﻮم اﳌﻠكى:1 رﻗﻢ )م /44( ﺦﻳرﺎﺘﺑ  8/2/1446ـﻫ  . 21- ﺖﻟﺪﻋ  هﺬﻫ  ةدﺎﳌا  ﺑﻨﺎءً1 ﻋلى اﳌﺮﺳﻮم اﳌﻠكى:1 رﻗﻢ )م /44( ﺦﻳرﺎﺘ ...



In [41]:
# ---------- STEP 5: Query with service filter ----------
response = retriever.retrieve("سياسة الإجازات")

print("\n--- RESPONSE ---")
print(response)

2025-10-06 18:25:10,991 - INFO - HTTP Request: POST http://localhost:6333/collections/labor_law_ar/points/search "HTTP/1.1 200 OK"



--- RESPONSE ---
[NodeWithScore(node=TextNode(id_='ba97df73-9c44-49eb-a083-5869c87f8456', embedding=None, metadata={'page_label': '36', 'file_name': 'New-Labour-Law-in-Saudi-Arabia-pdf.pdf', 'file_path': 'c:\\Users\\zaina\\OneDrive\\llm-zoomcamp\\llm-zoomcamp\\hr_assistant_LlamaIndex\\data\\policies\\New-Labour-Law-in-Saudi-Arabia-pdf.pdf', 'file_type': 'application/pdf', 'file_size': 1113983, 'creation_date': '2025-09-30', 'last_modified_date': '2025-09-24'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='6a53e5e9-b3ac-4178-85a9-1982fa9af012', node_type='4', metadata={'page_label': '36', 'file_name': 'New-Labour-Law-in-Saudi-Arabia-pdf.pdf', 'file_path': 'c:\\Users\\zaina\\OneDrive\\llm-zoomcamp\\llm-zoo

In [None]:
len(tagged_docs)

In [None]:
# Arabic query
results_ar = retriever.retrieve("ما هي أيام الإجازة السنوية بعد ٣ سنوات من الخدمة؟")
print("\n--- Arabic Query Results ---")
for r in results_ar:
    print("Score:", r.score)
    print("Text:", r.node.text[:200], "...\n")

In [None]:
# Arabic query
results_ar = retriever.retrieve("العمل الإضافي")
print("\n--- Arabic Query Results ---")
for r in results_ar:
    print("Score:", r.score)
    print("Text:", r.node.text, "...\n")