In [None]:
import json
from tqdm import tqdm
import os
import pandas as pd

In [None]:
# ---------- CONFIG ----------
# Settings for this notebook. Only DATA_DIR / JSON_PATH are read by the cells
# visible here; COLLECTION and EMBEDDING_DIM are not referenced in this part of
# the notebook (presumably consumed by an indexing step elsewhere - TODO confirm).
COLLECTION = "saudi_labor_law"
EMBEDDING_DIM = 768
# Directory holding the labor-law data files.
DATA_DIR = "data/labor_law"
# JSON file from which the articles are loaded.
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")

In [None]:
# ---------- LOAD PROCESSED DATA ----------
# Start from an empty list, then replace it with whatever the JSON file at
# JSON_PATH decodes to (read as UTF-8 text).
articles = []
with open(JSON_PATH, mode="r", encoding="utf-8") as articles_file:
    articles = json.loads(articles_file.read())

In [None]:
# Prompt sent to the chat model for each article. It is filled in with
# str.format, so the record passed to it must provide the `arabic_name` and
# `arabic_content` keys. The model is asked to answer with a JSON list of five
# Arabic questions. Leading/trailing whitespace of the literal is removed by
# the final .strip().
# NOTE(review): the wording has a few typos ("proived", "as fewer words") -
# left as-is here because any change to the text changes what the model sees.
prompt_template = """
You act as HR professional, employer, or employee who want to ask about the Saudi Labor Law (نظام العمل السعودي) .
Formulate 5 questions that HR professional, employer, or employee might ask based on provided articles of 
Saudi Labor Law (نظام العمل السعودي) . The record should contain the answer to the questions which is the article, 
and the questions should be complete and not too short and in arabic.
If possible, use as fewer words as possible from the record. 
use only our proived articles

The record:

article number: {arabic_name}
article : {arabic_content}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [None]:
# Client object through which the chat-completion requests are sent.
# No API key is passed explicitly; presumably the library picks up its
# credentials from the environment (e.g. OPENAI_API_KEY) - TODO confirm.
from openai import OpenAI
client = OpenAI()

In [None]:
def generate_questions(doc):
    """Ask the chat model for questions about a single article record.

    The module-level ``prompt_template`` is filled in from ``doc`` (every
    placeholder of the template must be a key of ``doc``; extra keys are
    ignored), sent to ``gpt-4o`` as one user message, and the text of the
    first returned choice is handed back untouched. No JSON decoding is done
    here.

    Args:
        doc: Mapping with string keys describing one article.

    Returns:
        The ``message.content`` of the first completion choice, as returned
        by the client.

    Raises:
        KeyError: If ``doc`` lacks a key that the template refers to.
    """
    user_message = {"role": "user", "content": prompt_template.format(**doc)}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [None]:
# Generate the questions for every article, one API call per article.
#
# `results` maps an article's 'index' to the raw model reply. It is created
# only when it does not exist yet: resetting it on every run of this cell made
# the "already done" check below useless, so re-running after an interruption
# (e.g. a network error halfway through) paid for every finished article again.
# On a fresh kernel the behavior is unchanged. Run `del results` first to
# regenerate everything from scratch, e.g. after editing the prompt.
try:
    results
except NameError:
    results = {}

for doc in tqdm(articles):
    doc_id = doc['index']
    if doc_id in results:
        # Already answered in an earlier run of this cell, or a repeated index.
        continue

    results[doc_id] = generate_questions(doc)

In [None]:
def _decode_model_reply(raw_reply):
    """Decode one model reply into Python data, tolerating a Markdown fence.

    Chat models sometimes wrap the JSON in a ```json ... ``` fence even when
    told not to; ``json.loads`` rejects that, so the fence is removed first.
    Replies without a fence, and non-string values, are passed to
    ``json.loads`` exactly as before.

    Args:
        raw_reply: The text returned by the model for one article.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the (unfenced) text is not valid JSON.
        TypeError: If ``raw_reply`` is not something ``json.loads`` accepts.
    """
    text = raw_reply
    if isinstance(text, str):
        text = text.strip()
        if text.startswith("```"):
            # Drop the opening fence line (``` or ```json) ...
            _, _, text = text.partition("\n")
            # ... and everything from the closing fence on, if there is one.
            text = text.rsplit("```", 1)[0]
    return json.loads(text)


# Decode every raw reply collected in `results`, keyed by article index.
# NOTE: the name is misspelled ("resulst") but the later cells read it under
# this exact name, so it is kept.
parsed_resulst = {}

for doc_id, json_questions in results.items():
    try:
        parsed_resulst[doc_id] = _decode_model_reply(json_questions)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError; add which article is affected
        # so the broken reply can be inspected or regenerated.
        raise ValueError(
            f"Reply for article index {doc_id!r} is not valid JSON: {exc}"
        ) from exc

In [None]:
# Some replies arrive as an object of the form {"questions": [...]} instead of
# a bare list; replace those entries with the inner list.
# The isinstance check matters: on a list (or string) value, the bare test
# `'questions' in questions` is a membership/substring test, and the key
# lookup that follows it would then raise TypeError.
# Iterating over a snapshot keeps the loop independent of the reassignment.
for doc_id, questions in list(parsed_resulst.items()):
    if isinstance(questions, dict) and 'questions' in questions:
        parsed_resulst[doc_id] = questions['questions']

In [None]:
# Lookup table: article 'index' -> full article record. If two articles share
# an index, the one appearing later in `articles` is the one kept.
doc_index = dict((article['index'], article) for article in articles)

In [None]:


# Flatten the parsed questions into one tuple per question:
# (question, article_number, arabic_content, doc_id).
final_results = []

for doc_id, questions in parsed_resulst.items():
    record = doc_index[doc_id]
    article_number = record['article_number']
    arabic_content = record['arabic_content']
    final_results.extend(
        (question, article_number, arabic_content, doc_id)
        for question in questions
    )



In [None]:
# One row per entry of `final_results`; the four tuple positions become the
# columns named below, in that order.
df = pd.DataFrame(
    data=final_results,
    columns=['question', 'article_number', 'article_orig', 'index'],
)

In [None]:
# Write the table to CSV, leaving out the DataFrame's own row index.
df.to_csv(
    path_or_buf='data/ground-truth-data.csv',
    index=False,
)