In [23]:
import os
import re
import json
import fitz  # PyMuPDF
import numpy as np
import requests
from transformers import MarianTokenizer, MarianMTModel
from tqdm import tqdm

In [3]:
# ---------- CONFIG ----------
# Directory that holds both the downloaded PDF and the parsed JSON output.
DATA_DIR = "data/labor_law"
# Used further down as the tokenizer/generation max_length and as the chunk
# size when long text is split for translation.
MAX_TOKENS = 512  # MarianMT maximum token limit (safe upper bound)
# Local path the source PDF is written to and read from.
AR_PDF = os.path.join(DATA_DIR, "labor_law_ar.pdf")
# Remote location of the source PDF.
AR_URL = "https://www.hrsd.gov.sa/sites/default/files/2025-07/nzam-al-ml----wfq-alhwyt-aljdydt-2.pdf"
# Destination of the final JSON dump of the parsed articles.
JSON_PATH = os.path.join(DATA_DIR, "labor_law_parsed.json")
# Create the data directory up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(DATA_DIR, exist_ok=True)

In [4]:
# -----------------------------
# Arabic Number Conversion (up to 999)
# -----------------------------

# Number-word lookup tables consumed by arabic_text_to_number.
# NOTE(review): the keys look mis-encoded in this view — confirm the notebook
# is stored and read as UTF-8 so they match the text extracted from the PDF.

# Words (several spellings per value) mapped to the integers 0-19.
# The entries for 11-19 include keys made of two space-separated words.
UNITS = {
    "ÿµŸÅÿ±": 0, "Ÿàÿßÿ≠ÿØ": 1, "Ÿàÿßÿ≠ÿØÿ©": 1, "ÿ£ŸàŸÑ": 1, "ÿßŸÑÿ£ŸàŸÑ": 1, "ÿßŸÑÿ£ŸàŸÑŸâ": 1, "ÿßŸÑÿ≠ÿßÿØŸäÿ©" : 1,
    "ÿßÿ´ŸÜÿßŸÜ": 2, "ÿßÿ´ŸÜŸäŸÜ": 2, "ÿ•ÿ´ŸÜÿßŸÜ": 2, "ÿßŸÑÿ´ÿßŸÜŸä": 2, "ÿßŸÑÿ´ÿßŸÜŸäÿ©": 2,
    "ÿ´ŸÑÿßÿ´ÿ©": 3, "ÿßŸÑÿ´ÿßŸÑÿ´": 3, "ÿßŸÑÿ´ÿßŸÑÿ´ÿ©": 3,
    "ÿ£ÿ±ÿ®ÿπÿ©": 4, "ÿßŸÑÿ±ÿßÿ®ÿπ": 4, "ÿßŸÑÿ±ÿßÿ®ÿπÿ©": 4,
    "ÿÆŸÖÿ≥ÿ©": 5, "ÿßŸÑÿÆÿßŸÖÿ≥": 5, "ÿßŸÑÿÆÿßŸÖÿ≥ÿ©": 5,
    "ÿ≥ÿ™ÿ©": 6, "ÿßŸÑÿ≥ÿßÿØÿ≥": 6, "ÿßŸÑÿ≥ÿßÿØÿ≥ÿ©": 6,
    "ÿ≥ÿ®ÿπÿ©": 7, "ÿßŸÑÿ≥ÿßÿ®ÿπ": 7, "ÿßŸÑÿ≥ÿßÿ®ÿπÿ©": 7,
    "ÿ´ŸÖÿßŸÜŸäÿ©": 8, "ÿßŸÑÿ´ÿßŸÖŸÜ": 8, "ÿßŸÑÿ´ÿßŸÖŸÜÿ©": 8,
    "ÿ™ÿ≥ÿπÿ©": 9, "ÿßŸÑÿ™ÿßÿ≥ÿπ": 9, "ÿßŸÑÿ™ÿßÿ≥ÿπÿ©": 9,
    "ÿπÿ¥ÿ±ÿ©": 10, "ÿßŸÑÿπÿßÿ¥ÿ±": 10, "ÿßŸÑÿπÿßÿ¥ÿ±ÿ©": 10,
    "ÿ•ÿ≠ÿØŸâ ÿπÿ¥ÿ±ÿ©": 11, "ÿßŸÑÿ≠ÿßÿØŸä ÿπÿ¥ÿ±": 11, "ÿßŸÑÿ≠ÿßÿØŸäÿ© ÿπÿ¥ÿ±ÿ©": 11,
    "ÿßÿ´ŸÜÿß ÿπÿ¥ÿ±": 12, "ÿßŸÑÿ´ÿßŸÜŸä ÿπÿ¥ÿ±": 12, "ÿßŸÑÿ´ÿßŸÜŸäÿ© ÿπÿ¥ÿ±ÿ©": 12,
    "ÿ´ŸÑÿßÿ´ÿ© ÿπÿ¥ÿ±": 13, "ÿßŸÑÿ´ÿßŸÑÿ´ ÿπÿ¥ÿ±": 13,
    "ÿ£ÿ±ÿ®ÿπÿ© ÿπÿ¥ÿ±": 14, "ÿßŸÑÿ±ÿßÿ®ÿπ ÿπÿ¥ÿ±": 14,
    "ÿÆŸÖÿ≥ÿ© ÿπÿ¥ÿ±": 15, "ÿßŸÑÿÆÿßŸÖÿ≥ ÿπÿ¥ÿ±": 15,
    "ÿ≥ÿ™ÿ© ÿπÿ¥ÿ±": 16, "ÿßŸÑÿ≥ÿßÿØÿ≥ ÿπÿ¥ÿ±": 16,
    "ÿ≥ÿ®ÿπÿ© ÿπÿ¥ÿ±": 17, "ÿßŸÑÿ≥ÿßÿ®ÿπ ÿπÿ¥ÿ±": 17,
    "ÿ´ŸÖÿßŸÜŸäÿ© ÿπÿ¥ÿ±": 18, "ÿßŸÑÿ´ÿßŸÖŸÜ ÿπÿ¥ÿ±": 18,
    "ÿ™ÿ≥ÿπÿ© ÿπÿ¥ÿ±": 19, "ÿßŸÑÿ™ÿßÿ≥ÿπ ÿπÿ¥ÿ±": 19
}

# Words mapped to the multiples of ten from 20 to 90 (four spellings each).
TENS = {
    "ÿπÿ¥ÿ±ŸàŸÜ": 20, "ÿßŸÑÿπÿ¥ÿ±ŸàŸÜ": 20,"ÿπÿ¥ÿ±ŸäŸÜ": 20, "ÿßŸÑÿπÿ¥ÿ±ŸäŸÜ": 20,
    "ÿ´ŸÑÿßÿ´ŸàŸÜ": 30, "ÿßŸÑÿ´ŸÑÿßÿ´ŸàŸÜ" : 30, "ÿ´ŸÑÿßÿ´ŸäŸÜ": 30, "ÿßŸÑÿ´ŸÑÿßÿ´ŸäŸÜ" : 30,
    "ÿ£ÿ±ÿ®ÿπŸàŸÜ": 40, "ÿßŸÑÿ£ÿ±ÿ®ÿπŸàŸÜ": 40,"ÿ£ÿ±ÿ®ÿπŸäŸÜ": 40, "ÿßŸÑÿ£ÿ±ÿ®ÿπŸäŸÜ": 40,
    "ÿÆŸÖÿ≥ŸàŸÜ": 50,"ÿßŸÑÿÆŸÖÿ≥ŸàŸÜ" : 50,"ÿÆŸÖÿ≥ŸäŸÜ": 50,"ÿßŸÑÿÆŸÖÿ≥ŸäŸÜ" : 50,
    "ÿ≥ÿ™ŸàŸÜ": 60, "ÿßŸÑÿ≥ÿ™ŸàŸÜ": 60,"ÿ≥ÿ™ŸäŸÜ": 60, "ÿßŸÑÿ≥ÿ™ŸäŸÜ": 60,
    "ÿ≥ÿ®ÿπŸàŸÜ": 70, "ÿßŸÑÿ≥ÿ®ÿπŸàŸÜ": 70,"ÿ≥ÿ®ÿπŸäŸÜ": 70, "ÿßŸÑÿ≥ÿ®ÿπŸäŸÜ": 70,
    "ÿ´ŸÖÿßŸÜŸàŸÜ": 80, "ÿßŸÑÿ´ŸÖÿßŸÜŸàŸÜ": 80,"ÿ´ŸÖÿßŸÜŸäŸÜ": 80, "ÿßŸÑÿ´ŸÖÿßŸÜŸäŸÜ": 80,
    "ÿ™ÿ≥ÿπŸàŸÜ": 90 , "ÿßŸÑÿ™ÿ≥ÿπŸàŸÜ" : 90,"ÿ™ÿ≥ÿπŸäŸÜ": 90 , "ÿßŸÑÿ™ÿ≥ÿπŸäŸÜ" : 90
}

# Words mapped to the multiples of one hundred from 100 to 900.
HUNDREDS = {
    "ŸÖÿßÿ¶ÿ©": 100, "ÿßŸÑŸÖÿßÿ¶ÿ©": 100,
    "ŸÖÿßÿ¶ÿ™ÿßŸÜ": 200, "ÿßŸÑŸÖÿßÿ¶ÿ™ŸäŸÜ": 200,"ÿßŸÑŸÖÿ¶ÿ™ÿßŸÜ":200,
    "ÿ´ŸÑÿßÿ´ŸÖÿßÿ¶ÿ©": 300, "ÿ£ÿ±ÿ®ÿπŸÖÿßÿ¶ÿ©": 400,
    "ÿÆŸÖÿ≥ŸÖÿßÿ¶ÿ©": 500, "ÿ≥ÿ™ŸÖÿßÿ¶ÿ©": 600,
    "ÿ≥ÿ®ÿπŸÖÿßÿ¶ÿ©": 700, "ÿ´ŸÖÿßŸÜŸÖÿßÿ¶ÿ©": 800,
    "ÿ™ÿ≥ÿπŸÖÿßÿ¶ÿ©": 900
}

In [5]:
def arabic_text_to_number(text: str) -> int:
    """Convert a spelled-out number phrase into an integer (0-999).

    The phrase is normalised by replacing the connector prefix that follows a
    space with a plain space. If the separator word is present, the phrase is
    split once around it and the values of both sides are added. Otherwise the
    words are looked up in HUNDREDS, TENS and UNITS and their values summed.

    Two-word phrases are tried before single words, because UNITS stores the
    values 11-19 under keys made of two space-separated words. Looking up
    single words only could never hit those keys, so such a phrase yielded
    just its first word's value (or 0).

    Args:
        text: The number phrase, without the leading heading word.

    Returns:
        The summed value. Words found in none of the tables contribute 0, so
        an unrecognised phrase yields 0.
    """
    text = text.replace(" Ÿà", " ").strip()
    if "ÿ®ÿπÿØ" in text:
        left, right = text.split("ÿ®ÿπÿØ", 1)
        return arabic_text_to_number(right.strip()) + arabic_text_to_number(left.strip())

    tables = (HUNDREDS, TENS, UNITS)
    words = text.split()
    total = 0
    pos = 0
    while pos < len(words):
        # Longest match first: a two-word key must win over its first word alone.
        if pos + 1 < len(words):
            phrase = f"{words[pos]} {words[pos + 1]}"
            pair_table = next((t for t in tables if phrase in t), None)
            if pair_table is not None:
                total += pair_table[phrase]
                pos += 2
                continue
        word = words[pos]
        # Same precedence as before for single words: hundreds, then tens, then units.
        word_table = next((t for t in tables if word in t), None)
        if word_table is not None:
            total += word_table[word]
        pos += 1
    return total

In [6]:
# ---------- STEP 1: Download Arabic PDF ----------
def download_pdf(url, path):
    """Download ``url`` to ``path`` unless a file already exists there.

    Args:
        url: Address of the PDF to fetch.
        path: Local file path to write the response body to.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    if os.path.exists(path):
        return
    print(f"üì• Downloading {url} ...")
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=60)
    # Fail loudly instead of saving an error page under a .pdf name; because of
    # the existence check above, such a file would never be re-downloaded.
    r.raise_for_status()
    with open(path, "wb") as f:
        f.write(r.content)
    print("‚úÖ PDF downloaded")

In [7]:
# Fetch the source PDF to AR_PDF; download_pdf does nothing when the file already exists.
download_pdf(AR_URL, AR_PDF)

In [8]:
# ---------- STEP 2: Improved Text Extraction ----------
def extract_text_by_page(pdf_path):
    """Extract the text of every page of a PDF as one newline-joined string.

    Each page is read as PyMuPDF "blocks". The blocks are ordered by their
    second coordinate ascending, then by their first coordinate descending
    (top-to-bottom, right-to-left), and their text fields are joined with
    newlines. Pages are joined with newlines as well.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated text of all pages.
    """
    texts = []
    # The context manager closes the document (and its file handle) even if
    # extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            blocks = page.get_text("blocks")
            blocks.sort(key=lambda b: (b[1], -b[0]))  # sort top-to-bottom, right-to-left
            texts.append("\n".join(b[4] for b in blocks))
    return "\n".join(texts)

In [9]:
# Flatten the whole PDF into a single string; the structure parsing below works on this text.
ar_text = extract_text_by_page(AR_PDF)

In [24]:
# ------------------------------------------------------
# 2️⃣  Split Level 3 — Articles (lowest level)
# ------------------------------------------------------

In [25]:
# ---------- STEP 4: Split Articles ----------
def split_articles(text: str):
    """Split a text into article records.

    An article starts at a line beginning with the article heading word,
    followed by its spelled-out number (1-80 characters, no colon or newline)
    and a colon. Its body runs up to the next such heading or the end of the
    text.

    Returns:
        A list of dicts with the keys ``article_number`` (the integer from
        arabic_text_to_number), ``arabic_name`` (heading word plus number
        text), ``number_ar`` (number text) and ``arabic_content`` (the
        stripped body). The list is empty when no heading is found.
    """
    # Group 2 is the spelled-out number and group 3 the lazily matched body;
    # the lookahead stops the body at the next heading or at end of text.
    pattern = r"^\s*(ÿßŸÑŸÖÿßÿØÿ©\s+([^:\n]{1,80})\s*:\s*(.*?))(?=^\s*ÿßŸÑŸÖÿßÿØÿ©\s+[^:\n]{1,80}\s*:|\Z)"

    articles = []
    for found in re.finditer(pattern, text, flags=re.DOTALL | re.MULTILINE):
        label = found.group(2).strip()
        body = found.group(3).strip()
        articles.append({
            "article_number": arabic_text_to_number(label),
            "arabic_name": f"ÿßŸÑŸÖÿßÿØÿ© {label}",
            "number_ar": label,
            "arabic_content": body
        })
    return articles

In [26]:

# ------------------------------------------------------
# 3️⃣  Split Level 2 — Chapters (each chapter contains articles)
# ------------------------------------------------------

In [27]:
# ---------- STEP 5: Split Chapters ----------
def split_chapters(text: str):
    """Split a text into chapter records.

    A chapter header is the chapter heading word followed by up to 30
    characters on the same line; the next line is taken as the chapter title.
    Each chapter's text runs from its header to the next header, or to the end
    of the text for the last one. Text before the first header is not part of
    any returned chapter.

    Returns:
        A list of dicts with the keys ``chapter_title_ar``,
        ``chapter_number_ar``, ``chapter_number`` and ``text``. When no header
        is found, a single placeholder record covering the whole text is
        returned, with both number fields set to None.
    """
    chapter_pattern = r"(ÿßŸÑŸÅÿµŸÑ\s+[^\n]{1,30})\s*\n\s*([^\n]*)"
    headers = list(re.finditer(chapter_pattern, text))

    if not headers:
        return [{
            "chapter_title_ar": "ÿ®ÿØŸàŸÜ ŸÅÿµŸÑ",
            "chapter_number_ar" : None,
            "chapter_number": None,
            "text": text
        }]

    # Pair every header start with the start of the following header.
    starts = [header.start() for header in headers]
    stops = starts[1:] + [len(text)]

    chapters = []
    for header, begin, stop in zip(headers, starts, stops):
        number_ar = header.group(1).strip().replace("ÿßŸÑŸÅÿµŸÑ", "").strip()
        chapters.append({
            "chapter_title_ar": header.group(2).strip(),
            "chapter_number_ar": number_ar,
            "chapter_number": arabic_text_to_number(number_ar),
            "text": text[begin:stop]
        })
    return chapters

In [28]:
# ------------------------------------------------------
# 4️⃣  Split Level 1 — Parts (each part contains chapters)
# ------------------------------------------------------

In [29]:
# ---------- STEP 6: Split Parts ----------
def split_parts(text: str):
    """Split a text into part records.

    A part header is the part heading word followed by up to 30 characters on
    the same line; the next line is taken as the part title. Each part's text
    runs from its header to the next header, or to the end of the text for the
    last one. Text before the first header is not part of any returned part.

    Returns:
        A list of dicts with the keys ``part_title_ar``, ``part_number_ar``,
        ``part_number`` and ``text``. When no header is found, a single
        placeholder record covering the whole text is returned, with both
        number fields set to None.
    """
    part_pattern = r"(ÿßŸÑÿ®ÿßÿ®\s+[^\n]{1,30})\s*\n\s*([^\n]*)"
    headers = list(re.finditer(part_pattern, text))

    if not headers:
        return [{
            "part_title_ar": "ÿßŸÑÿ¨ÿ≤ÿ° ÿßŸÑÿπÿßŸÖ",
            "part_number_ar": None,
            "part_number": None,
            "text": text
        }]

    # Pair every header start with the start of the following header.
    starts = [header.start() for header in headers]
    stops = starts[1:] + [len(text)]

    parts = []
    for header, begin, stop in zip(headers, starts, stops):
        number_ar = header.group(1).strip().replace("ÿßŸÑÿ®ÿßÿ®", "").strip()
        parts.append({
            "part_title_ar": header.group(2).strip(),
            "part_number_ar": number_ar,
            "part_number": arabic_text_to_number(number_ar),
            "text": text[begin:stop]
        })
    return parts

In [30]:
# ------------------------------------------------------
# 5️⃣  Main Aggregation — Combine all levels
# ------------------------------------------------------

In [31]:
# ---------- STEP 7: Combine All ----------
def extract_structure(text: str):
    """Flatten the part -> chapter -> article hierarchy into one record per article.

    The text is split with split_parts, each part's text with split_chapters,
    and each chapter's text with split_articles. Every article record carries
    the title and number fields of its enclosing part and chapter, plus a
    1-based running ``index`` across the whole document.

    Returns:
        A list of dicts, in document order.
    """
    records = []
    for part in split_parts(text):
        for chapter in split_chapters(part["text"]):
            for article in split_articles(chapter["text"]):
                records.append({
                    # Running counter: position of this article in the output.
                    "index": len(records) + 1,
                    "part_title_ar": part["part_title_ar"],
                    "part_number_ar": part["part_number_ar"],
                    "part_number": part["part_number"],
                    "chapter_title_ar": chapter["chapter_title_ar"],
                    "chapter_number_ar": chapter["chapter_number_ar"],
                    "chapter_number": chapter["chapter_number"],
                    "arabic_name": article["arabic_name"],
                    "article_number": article["article_number"],
                    "number_ar": article["number_ar"],
                    "arabic_content": article["arabic_content"]
                })
    return records

In [50]:
# Build one flat record per article from the extracted text.
# NOTE(review): this rebuilds articles_data from scratch. The execution counts
# suggest the cell was last run after the translation and save cells, which
# would discard the english_* keys added there — confirm by running the
# notebook top to bottom on a fresh kernel.
print("üîç Parsing structure ...")
articles_data = extract_structure(ar_text)
print(f"‚úÖ Articles detected: {len(articles_data)}")

üîç Parsing structure ...
‚úÖ Articles detected: 249


In [34]:
# ------------------------------------------------------
# 7️⃣ Translation setup
# ------------------------------------------------------
# Load the MarianMT tokenizer and model for the checkpoint named below.
# `tokenizer` and `translator` are module-level globals that the translate_*
# helpers in the next cell read directly.
print("üåê Loading translation model...")
model_name = "Helsinki-NLP/opus-mt-ar-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
translator = MarianMTModel.from_pretrained(model_name)

üåê Loading translation model...


In [35]:
def translate_short(text):
    """Translate one piece of text with the module-level tokenizer and model.

    The input is tokenized with truncation at MAX_TOKENS, so anything beyond
    that limit is dropped rather than translated. Generation is capped at
    MAX_TOKENS as well.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_TOKENS,
    )
    generated = translator.generate(**encoded, max_length=MAX_TOKENS)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)


def split_long_by_tokens(text, max_tokens=MAX_TOKENS):
    """Split a long text into pieces that each fit within ``max_tokens``.

    The text is tokenized without special tokens, cut into consecutive runs of
    token ids, and each run is decoded back to a string. Every run leaves room
    for the special tokens the tokenizer adds when the piece is tokenized
    again for translation. Cutting at exactly ``max_tokens`` would push each
    full piece over the limit and cause silent truncation later.

    Args:
        text: The text to split.
        max_tokens: Upper bound on tokens per piece, special tokens included.

    Returns:
        ``[text]`` unchanged when it already fits, otherwise the decoded
        pieces in order.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1 (the splitting loop
            could not make progress).
    """
    if max_tokens < 1:
        raise ValueError(f"max_tokens must be at least 1, got {max_tokens}")

    reserved = tokenizer.num_special_tokens_to_add()
    token_ids = tokenizer(text, add_special_tokens=False).input_ids
    if len(token_ids) + reserved <= max_tokens:
        return [text]

    # Keep at least one content token per piece so the loop always advances.
    # NOTE: decoding and re-tokenizing a piece is not guaranteed to give the
    # exact same token count; the reserved room covers the special tokens only.
    budget = max(1, max_tokens - reserved)
    return [
        tokenizer.decode(token_ids[start:start + budget], skip_special_tokens=True)
        for start in range(0, len(token_ids), budget)
    ]

def translate_long(text, max_tokens=MAX_TOKENS):
    """Translate a text of any length, chunk by chunk, preserving its order.

    The text is split into paragraphs on blank lines. Consecutive paragraphs
    are packed into chunks whose summed token counts stay within
    ``max_tokens``. A paragraph that is too long on its own is cut with
    split_long_by_tokens. Each chunk is translated with translate_short and
    the results are joined with blank lines.

    Pending short paragraphs are flushed before an oversized paragraph's
    pieces are emitted. Without that flush, earlier paragraphs would end up
    after the long one in the output.

    Args:
        text: The source text.
        max_tokens: Token budget per chunk.

    Returns:
        The translated text, or an empty string when ``text`` has no
        non-blank paragraph.
    """
    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
    chunks = []
    current_chunk = []
    current_len = 0

    for p in paragraphs:
        p_len = len(tokenizer(p).input_ids)
        oversized = p_len > max_tokens

        # Close the pending chunk when this paragraph cannot join it: either
        # it would overflow the budget or it has to be split on its own.
        if current_chunk and (oversized or current_len + p_len > max_tokens):
            chunks.append("\n\n".join(current_chunk))
            current_chunk = []
            current_len = 0

        if oversized:
            chunks.extend(split_long_by_tokens(p, max_tokens))
        else:
            current_chunk.append(p)
            current_len += p_len

    if current_chunk:
        chunks.append("\n\n".join(current_chunk))

    translated_parts = []
    for chunk in tqdm(chunks, desc="Translating", ncols=90):
        translated_parts.append(translate_short(chunk))

    return "\n\n".join(translated_parts)

In [36]:
# ------------------------------------------------------
# 8️⃣ Translate all Arabic articles
# ------------------------------------------------------
print("üöÄ Translating articles...")
# Adds two keys to every record in place: the translated body and an
# "Article N" label built from the parsed article number.
for art in tqdm(articles_data, desc="Translating"):
    art["english_content"] = translate_long(art["arabic_content"])
    art["english_number"] = f'Article {art["article_number"]}'

üöÄ Translating articles...


Translating:   0%|                                                                             | 0/249 [00:00<?, ?it/s]
Translating:   0%|                                                  | 0/1 [00:00<?, ?it/s][A
Translating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:09<00:00,  9.41s/it][A
Translating:   0%|‚ñé                                                                    | 1/249 [00:09<38:59,  9.44s/it]
Translating:   0%|                                                  | 0/2 [00:00<?, ?it/s][A
Translating:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà                     | 1/2 [00:43<00:43, 43.32s/it][A
Translating: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [02:01<00:00, 60.62s/it][A
Translating:   1%|‚ñå                                                         

In [49]:
# ---------- STEP 8: Save JSON (Arabic readable) ----------
# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes,
# which is why the file is opened with an explicit UTF-8 encoding.
# The file holds whatever articles_data contains when this cell runs, so run
# it after the translation cell if the english_* fields should be included.
with open(JSON_PATH, "w", encoding="utf-8") as f:
    json.dump(articles_data, f, ensure_ascii=False, indent=2)
print(f"‚úÖ Saved parsed data ‚Üí {JSON_PATH} ({len(articles_data)} articles)")

‚úÖ Saved parsed data ‚Üí data/labor_law\labor_law_parsed.json (249 articles)


In [51]:
# Preview only the first records; displaying the whole list floods the notebook output.
articles_data[:2]

[{'index': 1,
  'part_title_ar': 'ÿßŸÑÿ™ÿπÿ±ŸäŸÅÿßÿ™ ŸàÿßŸÑÿ£ÿ≠ŸÉÿßŸÖ ÿßŸÑÿπÿßŸÖÿ©',
  'part_number_ar': 'ÿßŸÑÿ£ŸàŸÑ',
  'part_number': 1,
  'chapter_title_ar': 'ÿßŸÑÿ™ÿπÿ±ŸäŸÅÿßÿ™',
  'chapter_number_ar': 'ÿßŸÑÿ£ŸàŸÑ',
  'chapter_number': 1,
  'arabic_name': 'ÿßŸÑŸÖÿßÿØÿ© ÿßŸÑÿ£ŸàŸÑŸâ',
  'article_number': 1,
  'number_ar': 'ÿßŸÑÿ£ŸàŸÑŸâ',
  'arabic_content': 'Ÿäÿ≥ŸÖŸâ Ÿáÿ∞ÿß ÿßŸÑŸÜÿ∏ÿßŸÖ ŸÜÿ∏ÿßŸÖ ÿßŸÑÿπŸÖŸÑ.'},
 {'index': 2,
  'part_title_ar': 'ÿßŸÑÿ™ÿπÿ±ŸäŸÅÿßÿ™ ŸàÿßŸÑÿ£ÿ≠ŸÉÿßŸÖ ÿßŸÑÿπÿßŸÖÿ©',
  'part_number_ar': 'ÿßŸÑÿ£ŸàŸÑ',
  'part_number': 1,
  'chapter_title_ar': 'ÿßŸÑÿ™ÿπÿ±ŸäŸÅÿßÿ™',
  'chapter_number_ar': 'ÿßŸÑÿ£ŸàŸÑ',
  'chapter_number': 1,
  'arabic_name': 'ÿßŸÑŸÖÿßÿØÿ© ÿßŸÑÿ´ÿßŸÜŸäÿ©',
  'article_number': 2,
  'number_ar': 'ÿßŸÑÿ´ÿßŸÜŸäÿ©',
  'arabic_content': '1 \n \nŸäŸÇÿµÿØ ÿ®ÿßŸÑÿ£ŸÑŸÅÿßÿ∏ ŸàÿßŸÑÿπÿ®ÿßÿ±ÿßÿ™ ÿßŸÑÿ¢ÿ™Ÿäÿ© ‚Äì ÿ£ŸäŸÜŸÖÿß Ÿàÿ±ÿØÿ™ ŸÅŸä Ÿáÿ∞ÿß ÿßŸÑŸÜÿ∏ÿßŸÖ ‚Äì ÿßŸÑŸÖÿπÿßŸÜŸä ÿßŸÑŸÖÿ®ŸäŸÜÿ© ÿ£ŸÖÿßŸÖŸáÿß ŸÖÿß ŸÑŸÖ ŸäŸÇÿ™ÿ∂ ÿßŸÑÿ≥ŸäÿßŸÇ ÿÆŸÑÿßŸÅ ÿ∞ŸÑŸÉ:  \n\n