In [23]:
import os
import re
import json
import fitz  # PyMuPDF
import numpy as np
import requests
from transformers import MarianTokenizer, MarianMTModel
from tqdm import tqdm

In [3]:
# ---------- CONFIG ----------
# Token budget used by the translation helpers: MarianMT maximum token
# limit (safe upper bound).
MAX_TOKENS = 512

# Working directory for every file this notebook reads or writes; created
# up front so later cells can write into it without further checks.
DATA_DIR = "data/labor_law"
os.makedirs(DATA_DIR, exist_ok=True)

# Arabic source document: remote location and its local copy.
AR_URL = "https://www.hrsd.gov.sa/sites/default/files/2025-07/nzam-al-ml----wfq-alhwyt-aljdydt-2.pdf"
AR_PDF = os.path.join(DATA_DIR, "labor_law_ar.pdf")

# Destination of the JSON dump of the parsed article records.
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")

In [4]:
# -----------------------------
# Arabic Number Conversion (up to 999)
# -----------------------------
# Lookup tables mapping Arabic number words (cardinal and ordinal, with and
# without the definite article) to their integer value.  The tables hold
# both single words and ready-made multi-word phrases for 11-19.  The bare
# compound parts ("عشر", "الحادي", ...) are listed as well so that a
# word-by-word sum such as "الحادي" + "عشر" still yields 11.

UNITS = {
    "صفر": 0, "واحد": 1, "واحدة": 1, "أول": 1, "الأول": 1, "الأولى": 1, "الحادية" : 1,
    # "الحادي" / "إحدى" only occur inside compounds (e.g. "الحادي عشر").
    "الحادي": 1, "إحدى": 1,
    "اثنان": 2, "اثنين": 2, "إثنان": 2, "الثاني": 2, "الثانية": 2,
    # "اثنا" is the compound form used in "اثنا عشر".
    "اثنا": 2,
    "ثلاثة": 3, "الثالث": 3, "الثالثة": 3,
    "أربعة": 4, "الرابع": 4, "الرابعة": 4,
    "خمسة": 5, "الخامس": 5, "الخامسة": 5,
    "ستة": 6, "السادس": 6, "السادسة": 6,
    "سبعة": 7, "السابع": 7, "السابعة": 7,
    "ثمانية": 8, "الثامن": 8, "الثامنة": 8,
    "تسعة": 9, "التاسع": 9, "التاسعة": 9,
    "عشرة": 10, "العاشر": 10, "العاشرة": 10,
    # Masculine second element of 11-19 ("الثالث عشر" = 3 + 10).
    "عشر": 10,
    "إحدى عشرة": 11, "الحادي عشر": 11, "الحادية عشرة": 11,
    "اثنا عشر": 12, "الثاني عشر": 12, "الثانية عشرة": 12,
    "ثلاثة عشر": 13, "الثالث عشر": 13,
    "أربعة عشر": 14, "الرابع عشر": 14,
    "خمسة عشر": 15, "الخامس عشر": 15,
    "ستة عشر": 16, "السادس عشر": 16,
    "سبعة عشر": 17, "السابع عشر": 17,
    "ثمانية عشر": 18, "الثامن عشر": 18,
    "تسعة عشر": 19, "التاسع عشر": 19
}

TENS = {
    "عشرون": 20, "العشرون": 20,"عشرين": 20, "العشرين": 20,
    "ثلاثون": 30, "الثلاثون" : 30, "ثلاثين": 30, "الثلاثين" : 30,
    "أربعون": 40, "الأربعون": 40,"أربعين": 40, "الأربعين": 40,
    "خمسون": 50,"الخمسون" : 50,"خمسين": 50,"الخمسين" : 50,
    "ستون": 60, "الستون": 60,"ستين": 60, "الستين": 60,
    "سبعون": 70, "السبعون": 70,"سبعين": 70, "السبعين": 70,
    "ثمانون": 80, "الثمانون": 80,"ثمانين": 80, "الثمانين": 80,
    "تسعون": 90 , "التسعون" : 90,"تسعين": 90 , "التسعين" : 90
}

HUNDREDS = {
    "مائة": 100, "المائة": 100,
    # Alternative spelling without the silent alef.
    "مئة": 100, "المئة": 100,
    "مائتان": 200, "المائتين": 200,"المئتان":200,
    # Remaining case / spelling variants of 200.
    "مائتين": 200, "المائتان": 200, "مئتان": 200, "مئتين": 200, "المئتين": 200,
    "ثلاثمائة": 300, "أربعمائة": 400,
    "خمسمائة": 500, "ستمائة": 600,
    "سبعمائة": 700, "ثمانمائة": 800,
    "تسعمائة": 900
}

In [5]:
def arabic_text_to_number(text: str) -> int:
    """Convert an Arabic number phrase (cardinal or ordinal) to an integer.

    The phrase is resolved against the module-level ``UNITS``, ``TENS`` and
    ``HUNDREDS`` tables and the values of all recognised parts are summed.

    * ``"X بعد Y"`` ("X after Y", e.g. "الأولى بعد المائة") is evaluated as
      ``value(Y) + value(X)``.
    * Multi-word table keys (e.g. "الحادي عشر") are matched as a whole,
      longest phrase first, before falling back to single words.
    * A leading conjunction "و" is ignored only when the word is not itself
      a table entry, so "والعشرون" resolves to 20 while "واحد" stays 1.
    * Unrecognised words contribute nothing.

    Args:
        text: The number phrase, words separated by whitespace.

    Returns:
        The summed value, or 0 when no part of the phrase is recognised.
    """
    if "بعد" in text:
        left, right = text.split("بعد", 1)
        return arabic_text_to_number(right.strip()) + arabic_text_to_number(left.strip())

    # HUNDREDS overrides TENS overrides UNITS on (hypothetical) key clashes.
    table = {**UNITS, **TENS, **HUNDREDS}
    longest_key = max((len(key.split()) for key in table), default=1)

    def lookup(phrase):
        """Return the value of ``phrase``, retrying without a leading 'و'."""
        value = table.get(phrase)
        if value is None and phrase.startswith("و"):
            value = table.get(phrase[1:])
        return value

    words = text.split()
    total = 0
    pos = 0
    while pos < len(words):
        # Greedy longest-phrase match so compound keys win over their parts.
        for span in range(min(longest_key, len(words) - pos), 0, -1):
            value = lookup(" ".join(words[pos:pos + span]))
            if value is not None:
                total += value
                pos += span
                break
        else:
            # Unknown word: skip it.
            pos += 1
    return total

In [6]:
# ---------- STEP 1: Download Arabic PDF ----------
def download_pdf(url, path, timeout=60):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The response status is checked before anything is written, and the
    payload is first written to ``<path>.part`` and then moved into place.
    A failed or interrupted download therefore never leaves a file at
    ``path`` that the existence check would accept on the next run.

    Args:
        url: Address of the PDF.
        path: Local destination file.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    if os.path.exists(path):
        return

    print(f"📥 Downloading {url} ...")
    response = requests.get(url, timeout=timeout)
    # Do not store an HTML error page under the PDF's name.
    response.raise_for_status()

    partial_path = f"{path}.part"
    with open(partial_path, "wb") as f:
        f.write(response.content)
    # Move into place only once the file is completely written.
    os.replace(partial_path, path)
    print("✅ PDF downloaded")

In [7]:
# Fetch the Arabic source PDF to AR_PDF; download_pdf skips the request when
# the file is already present.
download_pdf(AR_URL, AR_PDF)

In [8]:
# ---------- STEP 2: Improved Text Extraction ----------
def extract_text_by_page(pdf_path):
    """Extract the text of a PDF as one string, using PyMuPDF block extraction.

    For every page the blocks returned by ``page.get_text("blocks")`` are
    ordered by their second tuple item ascending and their first tuple item
    descending (top-to-bottom, then right-to-left) to keep Arabic reading
    order stable.  Block texts are joined with newlines, and so are pages.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    page_texts = []
    # Context manager closes the document even if extraction fails midway.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            blocks = page.get_text("blocks")
            blocks.sort(key=lambda b: (b[1], -b[0]))  # sort top-to-bottom, right-to-left
            page_texts.append("\n".join(b[4] for b in blocks))
    return "\n".join(page_texts)

In [9]:
# Full text of the Arabic PDF as a single string; input to the structure parser.
ar_text = extract_text_by_page(AR_PDF)

In [24]:
# ------------------------------------------------------
# 2️⃣  Split Level 3 — Articles (lowest level)
# ------------------------------------------------------

In [25]:
# ---------- STEP 4: Split Articles ----------
def split_articles(text: str):
    """Cut ``text`` into article records.

    An article starts on a line beginning with 'المادة <label> :' and its
    body runs (non-greedily) up to the next such line or the end of the text.

    Returns:
        A list of dicts, one per article in document order, with the keys
        ``article_number`` (label converted by ``arabic_text_to_number``),
        ``arabic_name`` ('المادة ' + label), ``number_ar`` (the stripped
        label) and ``arabic_content`` (the stripped body).
    """
    article_re = re.compile(
        r"^\s*(المادة\s+([^:\n]{1,80})\s*:\s*(.*?))(?=^\s*المادة\s+[^:\n]{1,80}\s*:|\Z)",
        flags=re.DOTALL | re.MULTILINE,
    )

    def to_record(found):
        # Group 2 is the number label, group 3 the article body.
        label = found.group(2).strip()
        body = found.group(3).strip()
        return {
            "article_number": arabic_text_to_number(label),
            "arabic_name": f"المادة {label}",
            "number_ar": label,
            "arabic_content": body,
        }

    return [to_record(found) for found in article_re.finditer(text)]

In [26]:

# ------------------------------------------------------
# 3️⃣  Split Level 2 — Chapters (each chapter contains articles)
# ------------------------------------------------------

In [27]:
# ---------- STEP 5: Split Chapters ----------
def split_chapters(text: str):
    """Split the text of one part into chapters.

    A chapter heading is a line that *starts* with 'الفصل <number>' (at most
    30 characters after the keyword); the next non-empty line is taken as
    the chapter title.  Anchoring at line start keeps in-sentence mentions
    such as 'أحكام هذا الفصل ...' from being mistaken for headings, in line
    with how ``split_articles`` anchors article headings.

    Each chapter's ``text`` runs from its heading up to the next heading or
    the end of ``text``.  Text before the first heading is not returned.

    Returns:
        A list of dicts with the keys ``chapter_title_ar``,
        ``chapter_number_ar``, ``chapter_number`` and ``text``.  When no
        heading is found, a single placeholder chapter ('بدون فصل', numbers
        ``None``) holding the whole input is returned.
    """
    chapter_pattern = r"^[ \t]*(الفصل\s+[^\n]{1,30})\s*\n\s*([^\n]*)"
    matches = list(re.finditer(chapter_pattern, text, flags=re.MULTILINE))
    chapters = []
    if not matches:
        chapters.append({
            "chapter_title_ar": "بدون فصل",
            "chapter_number_ar" : None,
            "chapter_number": None,
            "text": text
        })
        return chapters
    for i, match in enumerate(matches):
        start = match.start()
        # A chapter ends where the next heading begins.
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        chapter_header = match.group(1).strip()
        chapter_title = match.group(2).strip()
        chapter_number = chapter_header.replace("الفصل", "").strip()
        number_value = arabic_text_to_number(chapter_number)
        chapter_text = text[start:end]
        chapters.append({
            "chapter_title_ar": chapter_title,
            "chapter_number_ar": chapter_number,
            "chapter_number": number_value,
            "text": chapter_text
        })
    return chapters

In [28]:
# ------------------------------------------------------
# 4️⃣  Split Level 1 — Parts (each part contains chapters)
# ------------------------------------------------------

In [29]:
# ---------- STEP 6: Split Parts ----------
def split_parts(text: str):
    """Split the whole document text into parts.

    A part heading is a line that *starts* with 'الباب <number>' (at most 30
    characters after the keyword); the next non-empty line is taken as the
    part title.  Anchoring at line start keeps in-sentence mentions such as
    'أحكام هذا الباب ...' from being mistaken for headings, in line with how
    ``split_articles`` anchors article headings.

    Each part's ``text`` runs from its heading up to the next heading or the
    end of ``text``.  Text before the first heading is not returned.

    Returns:
        A list of dicts with the keys ``part_title_ar``, ``part_number_ar``,
        ``part_number`` and ``text``.  When no heading is found, a single
        placeholder part ('الجزء العام', numbers ``None``) holding the whole
        input is returned.
    """
    part_pattern = r"^[ \t]*(الباب\s+[^\n]{1,30})\s*\n\s*([^\n]*)"
    matches = list(re.finditer(part_pattern, text, flags=re.MULTILINE))
    parts = []
    if not matches:
        parts.append({
            "part_title_ar": "الجزء العام",
            "part_number_ar": None,
            "part_number": None,
            "text": text
        })
        return parts
    for i, match in enumerate(matches):
        start = match.start()
        # A part ends where the next heading begins.
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        part_header = match.group(1).strip()
        part_title = match.group(2).strip()
        part_number = part_header.replace("الباب", "").strip()
        number_value = arabic_text_to_number(part_number)
        part_text = text[start:end]
        parts.append({
            "part_title_ar": part_title,
            "part_number_ar": part_number,
            "part_number": number_value,
            "text": part_text
        })
    return parts

In [30]:
# ------------------------------------------------------
# 5️⃣  Main Aggregation — Combine all levels
# ------------------------------------------------------

In [31]:
# ---------- STEP 7: Combine All ----------
def extract_structure(text: str):
    """Flatten the part → chapter → article hierarchy into one list.

    ``text`` is split with ``split_parts``; every part's text is split with
    ``split_chapters`` and every chapter's text with ``split_articles``.

    Returns:
        One dict per article, in document order, carrying a 1-based running
        ``index``, the enclosing part's and chapter's title/number fields,
        and the article's own name, number and content fields.
    """
    part_keys = ("part_title_ar", "part_number_ar", "part_number")
    chapter_keys = ("chapter_title_ar", "chapter_number_ar", "chapter_number")
    article_keys = ("arabic_name", "article_number", "number_ar", "arabic_content")

    records = []
    for part in split_parts(text):
        part_fields = {key: part[key] for key in part_keys}
        for chapter in split_chapters(part["text"]):
            chapter_fields = {key: chapter[key] for key in chapter_keys}
            for article in split_articles(chapter["text"]):
                records.append({
                    # Running counter across all parts and chapters.
                    "index": len(records) + 1,
                    **part_fields,
                    **chapter_fields,
                    **{key: article[key] for key in article_keys},
                })
    return records

In [50]:
# Parse the extracted Arabic text into a flat list of article records
# (one dict per article, tagged with its part and chapter fields).
print("🔍 Parsing structure ...")
articles_data = extract_structure(ar_text)
print(f"✅ Articles detected: {len(articles_data)}")

🔍 Parsing structure ...
✅ Articles detected: 249


In [34]:
# ------------------------------------------------------
# 7️⃣ Translation setup
# ------------------------------------------------------
# Load the MarianMT checkpoint named below once.  The translation helpers
# defined in the following cell read `tokenizer` and `translator` as
# module-level globals, so this cell must run before them being called.
print("🌐 Loading translation model...")
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
translator = MarianMTModel.from_pretrained(model_name)

🌐 Loading translation model...


In [35]:
def translate_short(text):
    """Translate one piece of text that is expected to fit the token limit.

    The input is tokenized with truncation at ``MAX_TOKENS``, so anything
    beyond that limit is cut off rather than translated.  Generation is
    capped at ``MAX_TOKENS`` as well.

    Returns:
        The decoded first generated sequence, without special tokens.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )
    generated = translator.generate(**encoded, max_length=MAX_TOKENS)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def split_long_by_tokens(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into pieces that each fit within ``max_tokens`` tokens.

    Text whose full encoding (including any special tokens the tokenizer
    appends) already fits is returned unchanged as a one-element list.
    Otherwise the *content* token ids are cut into consecutive slices and
    each slice is decoded back to text.

    The slice size leaves room for the special tokens that are added again
    when a piece is re-tokenized for translation.  Slicing the full id list
    instead would make a full-size piece one token too long once re-encoded,
    and could produce a final piece consisting of the end-of-sequence token
    only, which decodes to an empty string.

    NOTE(review): decoding and re-encoding a slice is presumably not always
    an exact round trip, so a piece may still differ by a token or two from
    the budget — verify against the tokenizer in use.

    Args:
        text: Text to split.
        max_tokens: Maximum number of tokens per piece, special tokens
            included.

    Returns:
        List of text pieces in original order.

    Raises:
        ValueError: if ``max_tokens`` leaves no room for content tokens.
    """
    full_ids = tokenizer(text).input_ids
    if len(full_ids) <= max_tokens:
        return [text]

    content_ids = tokenizer(text, add_special_tokens=False).input_ids
    # Number of tokens the tokenizer adds on its own (e.g. end-of-sequence).
    special_overhead = len(full_ids) - len(content_ids)
    budget = max_tokens - special_overhead
    if budget < 1:
        raise ValueError(
            f"max_tokens={max_tokens} leaves no room for content tokens"
        )

    return [
        tokenizer.decode(content_ids[start:start + budget], skip_special_tokens=True)
        for start in range(0, len(content_ids), budget)
    ]

def translate_long(text, max_tokens=MAX_TOKENS):
    """Translate text of arbitrary length, chunk by chunk, preserving order.

    The text is split into paragraphs on runs of two or more newlines.
    Consecutive paragraphs are packed into chunks whose summed token counts
    stay within ``max_tokens``; a paragraph that alone exceeds the limit is
    cut further with ``split_long_by_tokens``.  Paragraphs collected so far
    are emitted *before* the pieces of such an oversized paragraph, so the
    chunks — and therefore the translation — follow the source order.

    Each chunk is translated with ``translate_short``.  The per-call
    progress bar is removed once finished (``leave=False``) so that calling
    this function in a loop does not flood the output with completed bars.

    Args:
        text: Source text.
        max_tokens: Token budget per chunk.

    Returns:
        The translated chunks joined by blank lines; an empty string when
        ``text`` contains no non-blank paragraph.
    """
    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
    chunks = []
    pending = []       # paragraphs collected for the chunk being built
    pending_len = 0    # summed token count of `pending`

    def flush_pending():
        """Emit the paragraphs collected so far as one chunk."""
        nonlocal pending, pending_len
        if pending:
            chunks.append("\n\n".join(pending))
            pending = []
            pending_len = 0

    for p in paragraphs:
        n_tokens = len(tokenizer(p).input_ids)
        if n_tokens > max_tokens:
            # Keep source order: earlier paragraphs go out first.
            flush_pending()
            chunks.extend(split_long_by_tokens(p, max_tokens))
            continue

        if pending_len + n_tokens > max_tokens:
            flush_pending()
        pending.append(p)
        pending_len += n_tokens

    flush_pending()

    translated_parts = [
        translate_short(chunk)
        for chunk in tqdm(chunks, desc="Translating", ncols=90, leave=False)
    ]
    return "\n\n".join(translated_parts)

In [36]:
# ------------------------------------------------------
# 8️⃣ Translate all Arabic articles
# ------------------------------------------------------
# Adds two keys to every record of `articles_data`, in place:
#   english_content - translation of arabic_content via translate_long
#   english_number  - the string "Article <article_number>"
# NOTE(review): the recorded progress output of this cell estimates hours of
# runtime for the full list; consider caching results before re-running.
print("🚀 Translating articles...")
for art in tqdm(articles_data, desc="Translating"):
    art["english_content"] = translate_long(art["arabic_content"])
    art["english_number"] = f'Article {art["article_number"]}'

🚀 Translating articles...


Translating:   0%|                                                                             | 0/249 [00:00<?, ?it/s]
Translating:   0%|                                                  | 0/1 [00:00<?, ?it/s][A
Translating: 100%|██████████████████████████████████████████| 1/1 [00:09<00:00,  9.41s/it][A
Translating:   0%|▎                                                                    | 1/249 [00:09<38:59,  9.44s/it]
Translating:   0%|                                                  | 0/2 [00:00<?, ?it/s][A
Translating:  50%|█████████████████████                     | 1/2 [00:43<00:43, 43.32s/it][A
Translating: 100%|██████████████████████████████████████████| 2/2 [02:01<00:00, 60.62s/it][A
Translating:   1%|▌                                                                  | 2/249 [02:10<5:09:44, 75.24s/it]
Translating:   0%|                                                  | 0/1 [00:00<?, ?it/s][A
Translating: 100%|██████████████████████████████████████████| 1/1 [00:06<00:

In [49]:
# ---------- STEP 8: Save JSON (Arabic readable) ----------
# ensure_ascii=False writes the Arabic characters as-is instead of \uXXXX
# escapes; the file is opened as UTF-8 to match.
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(articles_data, f, ensure_ascii=False, indent=2)
print(f"✅ Saved parsed data → {JSON_PATH} ({len(articles_data)} articles)")

✅ Saved parsed data → data/labor_law\labor_law_parsed.json (249 articles)


In [51]:
# Preview only the first few records; echoing the whole list would dump
# every article's full text into the notebook output.
articles_data[:3]

[{'index': 1,
  'part_title_ar': 'التعريفات والأحكام العامة',
  'part_number_ar': 'الأول',
  'part_number': 1,
  'chapter_title_ar': 'التعريفات',
  'chapter_number_ar': 'الأول',
  'chapter_number': 1,
  'arabic_name': 'المادة الأولى',
  'article_number': 1,
  'number_ar': 'الأولى',
  'arabic_content': 'يسمى هذا النظام نظام العمل.'},
 {'index': 2,
  'part_title_ar': 'التعريفات والأحكام العامة',
  'part_number_ar': 'الأول',
  'part_number': 1,
  'chapter_title_ar': 'التعريفات',
  'chapter_number_ar': 'الأول',
  'chapter_number': 1,
  'arabic_name': 'المادة الثانية',
  'article_number': 2,
  'number_ar': 'الثانية',
  'arabic_content': '1 \n \nيقصد بالألفاظ والعبارات الآتية – أينما وردت في هذا النظام – المعاني المبينة أمامها ما لم يقتض السياق خلاف ذلك:  \n\n-الوزارة: وزارة الموارد البشرية والتنمية الاجتماعية.  \n\n-الوزير: وزير الموارد البشرية والتنمية الاجتماعية.  \n\n-مكتب العمل: الجهة الإدارية المنوط بها شؤون العمل في النطاق المكاني الذي يحدد بقرار من الوزير. \n\n-صاحب العمل: كل شخص طبي