### Extract procedures
Load the HTML file and extract its content while keeping the original structure:
- Parts (חלק)
- Chapters (פרק)
- Clauses (סעיפים)
- Subclauses (תתי סעיפים)
- Sub-subclauses (תת-תתי סעיפים)

Save the output as JSON and YAML.

In [56]:
import json
import os

import yaml
from bs4 import BeautifulSoup

# Load the raw HTML of the document as UTF-8 text
with open('takanon.htm', encoding='utf-8') as file:
    content = file.read()

# Build the parse tree using the 'html.parser' backend
soup = BeautifulSoup(content, 'html.parser')

# Top-level list of parts; populated later by assign_clauses_to_structure()
document_structure = []

# Function to extract sub-subclauses
def extract_sub_subclauses(subclause):
    """Collect the sub-subclauses that belong to one subclause marker.

    Scans forward over every later ``div.law-number2`` element and keeps
    those whose nearest preceding ``div.law-number1`` is this very
    subclause. Scanning stops at the first one owned by another subclause.

    Args:
        subclause: The ``div.law-number1`` element of the subclause.

    Returns:
        A list of dicts with the keys ``"sub_subclause"`` (marker text
        without parentheses) and ``"sub_subclause text"`` (text of the
        sibling ``div.law-content3``, or ``''`` when there is none).
    """
    sub_subclauses = []
    for marker in subclause.find_all_next('div', class_='law-number2'):
        # Identity, not equality: BeautifulSoup tags compare equal whenever
        # their markup matches, and markers such as "(א)" repeat in every
        # clause, so `!=` could claim sub-subclauses of a later clause.
        if marker.find_previous('div', class_='law-number1') is not subclause:
            break
        text_div = marker.find_next_sibling('div', class_='law-content3')
        sub_subclauses.append({
            "sub_subclause": marker.get_text(strip=True).strip('()'),
            "sub_subclause text": text_div.get_text(strip=True) if text_div else '',
        })
    return sub_subclauses

# Function to extract subclauses
def extract_subclauses(clause):
    """Collect the subclauses (with their sub-subclauses) of one clause.

    Scans forward over every later ``div.law-number1`` element and keeps
    those whose nearest preceding ``div.law-number`` is this very clause.
    Scanning stops at the first one owned by another clause.

    Args:
        clause: The ``div.law-number`` element of the clause.

    Returns:
        A list of dicts with the keys ``"subclause"`` (marker text without
        parentheses), ``"subclause text"`` (text of the sibling
        ``div.law-content1``, or ``''`` when there is none) and
        ``"sub_subclauses"`` (result of ``extract_sub_subclauses``).
    """
    subclauses = []
    for subclause in clause.find_all_next('div', class_='law-number1'):
        # Identity, not equality: BeautifulSoup tags compare equal whenever
        # their markup matches, so `!=` cannot tell apart two clause markers
        # that happen to render identically.
        if subclause.find_previous('div', class_='law-number') is not clause:
            break
        text_div = subclause.find_next_sibling('div', class_='law-content1')
        subclauses.append({
            "subclause": subclause.get_text(strip=True).strip('()'),
            "subclause text": text_div.get_text(strip=True) if text_div else '',
            "sub_subclauses": extract_sub_subclauses(subclause),
        })
    return subclauses

# Build one record per clause marker, in document order
all_clauses = []
clause_divs = soup.find_all('div', class_='law-number')
for clause in clause_divs:
    clause_number = clause.get_text(strip=True).strip('.')

    # The title lives in the sibling "law-desc" div; the body, when
    # present, is the "law-main" div that follows the title.
    sibling_div = clause.find_next_sibling('div', class_='law-desc')
    if sibling_div:
        title = sibling_div.get_text(strip=True)
        main_content_div = sibling_div.find_next_sibling('div', class_='law-main')
    else:
        title = ''
        main_content_div = None
    main_content = main_content_div.get_text(strip=True) if main_content_div else ''

    # Keep only the text that precedes the first "(א)" marker. When the
    # body starts with the marker this leaves an empty string.
    if '(א)' in main_content:
        main_content = main_content.split('(א)')[0].strip()

    all_clauses.append({
        "clause": clause_number,
        "clause title": title,
        "clause text": main_content,
        "subclauses": extract_subclauses(clause),
    })

# Assign clauses to parts and chapters based on their position in the HTML
def assign_clauses_to_structure():
    """Populate ``document_structure`` with parts, chapters and clauses.

    Walks the ``h1``/``h2``/``div`` elements of ``soup`` in document order.
    Each ``h1.law-part`` opens a new part, each ``h2.law-section`` opens a
    new chapter inside the current part, and each ``div.law-number`` places
    the matching record from ``all_clauses`` under the current chapter, or
    directly under the current part when no chapter is open. Clauses that
    appear before any part are not attached anywhere.

    Reads the module-level ``soup`` and ``all_clauses`` and appends to the
    module-level ``document_structure``. Returns nothing.
    """
    # Queue the extracted records per clause number, in extraction order.
    # A number that occurs more than once then yields its records one by
    # one instead of always resolving to the first, and each lookup avoids
    # rescanning the whole list.
    pending = {}
    for record in all_clauses:
        pending.setdefault(record["clause"], []).append(record)

    current_part = None
    current_chapter = None

    for element in soup.find_all(['h1', 'h2', 'div']):
        classes = element.get('class', [])
        if element.name == 'h1' and 'law-part' in classes:
            current_part = {
                "title": element.get_text(strip=True),
                "chapters": [],
                "clauses": [],
            }
            document_structure.append(current_part)
            # A new part closes whatever chapter was open.
            current_chapter = None
        elif element.name == 'h2' and 'law-section' in classes:
            current_chapter = {"title": element.get_text(strip=True), "clauses": []}
            if current_part is not None:
                current_part["chapters"].append(current_chapter)
        elif element.name == 'div' and 'law-number' in classes:
            queue = pending.get(element.get_text(strip=True).strip('.'))
            if not queue:
                continue
            clause = queue.pop(0)
            if current_chapter is not None:
                current_chapter["clauses"].append(clause)
            elif current_part is not None:
                current_part["clauses"].append(clause)


assign_clauses_to_structure()

# Function to clean empty sub-subclauses, subclauses, clauses, and chapters
def clean_structure(structure):
    """Remove empty sub-subclauses, subclauses, clauses, chapters and parts.

    Pruning runs bottom-up, so a container whose children were all removed
    is itself treated as empty:

    * a sub-subclause is kept when it has text;
    * a subclause is kept when it has text or a surviving sub-subclause;
    * a clause is kept when it has text or a surviving subclause;
    * a chapter is kept when it has a surviving clause;
    * a part is kept when it has a surviving chapter or clause.

    Args:
        structure: List of part dicts as built by the extraction step.

    Returns:
        A new list with the surviving parts. The part, chapter, clause and
        subclause dicts themselves are updated in place.
    """
    kept_parts = []
    for part in structure:
        kept_chapters = []
        for chapter in part.get("chapters", []):
            chapter["clauses"] = _prune_clauses(chapter.get("clauses", []))
            if chapter["clauses"]:
                kept_chapters.append(chapter)
        part["chapters"] = kept_chapters
        part["clauses"] = _prune_clauses(part.get("clauses", []))
        if part["chapters"] or part["clauses"]:
            kept_parts.append(part)
    return kept_parts


def _prune_clauses(clauses):
    """Return the clauses that still carry content after pruning children."""
    kept_clauses = []
    for clause in clauses:
        kept_subclauses = []
        for subclause in clause.get("subclauses", []):
            subclause["sub_subclauses"] = [
                sub_subclause
                for sub_subclause in subclause.get("sub_subclauses", [])
                if sub_subclause["sub_subclause text"]
            ]
            if subclause["subclause text"] or subclause["sub_subclauses"]:
                kept_subclauses.append(subclause)
        clause["subclauses"] = kept_subclauses
        if clause["clause text"] or clause["subclauses"]:
            kept_clauses.append(clause)
    return kept_clauses

# Clean the document structure
document_structure = clean_structure(document_structure)

# Make sure the target folder exists; open(..., 'w') does not create
# missing directories and would fail with FileNotFoundError otherwise.
os.makedirs('output', exist_ok=True)

# Save as JSON (ensure_ascii=False keeps the Hebrew text readable)
with open('output/document_structure.json', 'w', encoding='utf-8') as json_file:
    json.dump(document_structure, json_file, ensure_ascii=False, indent=4)

# Save as YAML (sort_keys=False keeps the keys in insertion order)
with open('output/document_structure.yaml', 'w', encoding='utf-8') as yaml_file:
    yaml.dump(document_structure, yaml_file, allow_unicode=True, sort_keys=False)


### Split into per-clause Markdown files
Load the JSON file produced by the previous step and split its content into Markdown files; each file contains one clause with all of its relevant details.

In [61]:
import os
import json

# Folder that receives one markdown file per clause (created on demand)
output_dir = 'output_markdown'
os.makedirs(output_dir, exist_ok=True)

# Read back the structure saved by the extraction step
with open('output/document_structure.json', encoding='utf-8') as json_file:
    document_structure = json.loads(json_file.read())

# Function to format clause details in Markdown
def format_clause_details_md(clause, ancestors):
    """Render one clause, preceded by its part/chapter headings, as Markdown.

    Args:
        clause: Clause dict; the keys "clause", "clause title",
            "clause text" and "subclauses" are each rendered when present.
        ancestors: Dicts of the enclosing part/chapter. A title containing
            "חלק" becomes a level-2 heading, otherwise one containing "פרק"
            becomes a level-3 heading; other entries are skipped.

    Returns:
        The rendered pieces joined by blank lines.
    """
    lines = []

    for node in ancestors:
        if "title" not in node:
            continue
        if "חלק" in node["title"]:
            lines.append(f'## {node["title"]}')
        elif "פרק" in node["title"]:
            lines.append(f'### {node["title"]}')

    if "clause" in clause:
        lines.append(f'#### סעיף {clause["clause"]}')
    if "clause title" in clause:
        lines.append(f'**{clause["clause title"]}**')
    if "clause text" in clause:
        lines.append(clause["clause text"])

    subclauses = clause.get("subclauses")
    if subclauses:
        lines.append('#### תתי-סעיפים:')
        lines.extend(format_subclause_details_md(item) for item in subclauses)

    return '\n\n'.join(lines)

# Function to format subclause details in Markdown
def format_subclause_details_md(subclause):
    """Render one subclause and its sub-subclauses as Markdown.

    Args:
        subclause: Subclause dict; the keys "subclause", "subclause text"
            and "sub_subclauses" are each rendered when present.

    Returns:
        The rendered pieces joined by blank lines.
    """
    pieces = []
    if "subclause" in subclause:
        pieces.append(f'##### תת-סעיף {subclause["subclause"]}')
    if "subclause text" in subclause:
        pieces.append(subclause["subclause text"])

    # A missing or empty list contributes nothing.
    for child in subclause.get("sub_subclauses") or ():
        pieces.append(format_sub_subclause_details_md(child))

    return '\n\n'.join(pieces)

# Function to format sub-subclause details in Markdown
def format_sub_subclause_details_md(sub_subclause):
    """Render one sub-subclause as a level-6 heading followed by its text.

    Args:
        sub_subclause: Dict whose keys "sub_subclause" and
            "sub_subclause text" are each rendered when present.

    Returns:
        The rendered pieces joined by a blank line.
    """
    heading = (
        [f'###### תת-תת-סעיף {sub_subclause["sub_subclause"]}']
        if "sub_subclause" in sub_subclause
        else []
    )
    body = (
        [sub_subclause["sub_subclause text"]]
        if "sub_subclause text" in sub_subclause
        else []
    )
    return '\n\n'.join(heading + body)

# Function to generate one markdown file per clause (parts -> chapters -> clauses)
def generate_clauses_markdown_files(structure, ancestors=None):
    """Write one Markdown file per clause into ``output_dir``.

    For every part, the clauses of each chapter are written first (with
    the part and chapter as heading context), then the clauses attached
    directly to the part (with only the part as context). Each file is
    named ``<clause number>.md``; a later clause with the same number
    overwrites the earlier file.

    Args:
        structure: List of part dicts.
        ancestors: Optional list of extra ancestor dicts placed before
            each part in the heading context. It is never modified.

    Returns:
        None.
    """
    # None instead of a mutable `[]` default, which would be one list
    # object shared by every call.
    base_ancestors = [] if ancestors is None else list(ancestors)
    for part in structure:
        part_ancestors = base_ancestors + [part]
        for chapter in part.get("chapters", []):
            chapter_ancestors = part_ancestors + [chapter]
            for clause in chapter.get("clauses", []):
                _write_clause_markdown(clause, chapter_ancestors)
        for clause in part.get("clauses", []):
            _write_clause_markdown(clause, part_ancestors)


def _write_clause_markdown(clause, ancestors):
    """Render one clause and write it to ``<output_dir>/<clause number>.md``."""
    file_content = format_clause_details_md(clause, ancestors)
    with open(f'{output_dir}/{clause["clause"]}.md', 'w', encoding='utf-8') as file:
        file.write(file_content + '\n\n----\n\n')

# Generate the markdown files
generate_clauses_markdown_files(document_structure)

# Report the folder actually used instead of repeating its name by hand
print(f"Markdown files have been generated in the '{output_dir}' folder.")


Markdown files have been generated in the 'output_markdown' folder.


In [62]:
import json

# Read back the structure saved by the extraction step
with open('output/document_structure.json', encoding='utf-8') as json_file:
    document_structure = json.loads(json_file.read())

# Function to format clause details in Markdown
def format_clause_details_md(clause, ancestors):
    """Build the Markdown for a clause together with its heading context.

    Args:
        clause: Clause dict; "clause", "clause title", "clause text" and
            "subclauses" are emitted in that order when present.
        ancestors: Enclosing part/chapter dicts. Titles containing "חלק"
            are emitted as ``##`` headings, otherwise titles containing
            "פרק" as ``###`` headings; anything else is ignored.

    Returns:
        All emitted pieces joined by blank lines.
    """
    blocks = []

    titled = (entry for entry in ancestors if "title" in entry)
    for entry in titled:
        if "חלק" in entry["title"]:
            blocks.append(f'## {entry["title"]}')
        elif "פרק" in entry["title"]:
            blocks.append(f'### {entry["title"]}')

    if "clause" in clause:
        blocks.append(f'#### סעיף {clause["clause"]}')
    if "clause title" in clause:
        blocks.append(f'**{clause["clause title"]}**')
    if "clause text" in clause:
        blocks.append(clause["clause text"])

    if clause.get("subclauses"):
        blocks.append('#### תתי-סעיפים:')
        blocks += [format_subclause_details_md(sub) for sub in clause["subclauses"]]

    return '\n\n'.join(blocks)

# Function to format subclause details in Markdown
def format_subclause_details_md(subclause):
    """Build the Markdown for a subclause followed by its sub-subclauses.

    Args:
        subclause: Subclause dict; "subclause", "subclause text" and
            "sub_subclauses" are emitted in that order when present.

    Returns:
        All emitted pieces joined by blank lines.
    """
    blocks = []
    if "subclause" in subclause:
        blocks.append(f'##### תת-סעיף {subclause["subclause"]}')
    if "subclause text" in subclause:
        blocks.append(subclause["subclause text"])

    children = subclause.get("sub_subclauses")
    if children:
        blocks.extend(map(format_sub_subclause_details_md, children))

    return '\n\n'.join(blocks)

# Function to format sub-subclause details in Markdown
def format_sub_subclause_details_md(sub_subclause):
    """Build the Markdown for a sub-subclause: level-6 heading, then text.

    Args:
        sub_subclause: Dict; "sub_subclause" and "sub_subclause text" are
            emitted in that order when present.

    Returns:
        The emitted pieces joined by a blank line.
    """
    has_marker = "sub_subclause" in sub_subclause
    has_text = "sub_subclause text" in sub_subclause

    blocks = []
    if has_marker:
        blocks.append(f'###### תת-תת-סעיף {sub_subclause["sub_subclause"]}')
    if has_text:
        blocks.append(sub_subclause["sub_subclause text"])
    return '\n\n'.join(blocks)

# Function to generate the full markdown content (parts -> chapters -> clauses)
def generate_full_markdown_content(structure, ancestors=None):
    """Render every clause of the structure into one Markdown string.

    For every part, the clauses of each chapter are rendered first (with
    the part and chapter as heading context), then the clauses attached
    directly to the part (with only the part as context).

    Args:
        structure: List of part dicts.
        ancestors: Optional list of extra ancestor dicts placed before
            each part in the heading context. It is never modified.

    Returns:
        The rendered clauses joined by a ``----`` separator line, or an
        empty string when there are no clauses.
    """
    # None instead of a mutable `[]` default, which would be one list
    # object shared by every call.
    base_ancestors = [] if ancestors is None else list(ancestors)
    sections = []
    for part in structure:
        part_ancestors = base_ancestors + [part]
        for chapter in part.get("chapters", []):
            chapter_ancestors = part_ancestors + [chapter]
            sections.extend(
                format_clause_details_md(clause, chapter_ancestors)
                for clause in chapter.get("clauses", [])
            )
        sections.extend(
            format_clause_details_md(clause, part_ancestors)
            for clause in part.get("clauses", [])
        )
    return '\n\n----\n\n'.join(sections)

# Render the whole document into one markdown string
full_markdown_content = generate_full_markdown_content(document_structure)

# Persist it as a single file next to the JSON/YAML outputs
with open('output/document_structure.md', mode='w', encoding='utf-8') as file:
    file.write(full_markdown_content)

print("A single structured Markdown file has been generated: 'output/document_structure.md'")


A single structured Markdown file has been generated: 'output/document_structure.md'
