### Extract procedures
Load the HTML file and extract its content while keeping the original structure:
- Parts (חלק)
- Chapters (פרק)
- Clauses (סעיפים)
- Subclauses (תתי סעיפים)
- Sub-subclauses (תת-תתי סעיפים)

Save the output as JSON and YAML.

In [34]:
import json
import os

import yaml
from bs4 import BeautifulSoup

# --- Helpers -----------------------------------------------------------------
# The same clause / subclause / sub-subclause extraction is needed both for
# parts that contain chapters and for parts that do not, so it lives in
# helpers instead of being copy-pasted into each branch.
#
# NOTE: ownership checks use ``is`` / ``is not`` on purpose. Beautiful Soup's
# ``==`` compares tags by markup, so two different ``(א)`` number divs (or two
# headings with the same title) compare equal and items could be attached to
# the wrong parent.

def _sibling_text(anchor, css_class):
    """Return ``(div, text)`` for the next sibling ``<div class=css_class>``.

    ``anchor`` may be ``None``; in that case, or when no such sibling exists,
    ``(None, '')`` is returned. ``text`` is the div's whitespace-stripped text.
    """
    div = anchor.find_next_sibling('div', class_=css_class) if anchor is not None else None
    return div, (div.get_text(strip=True) if div is not None else '')


def _owned_items(owner, item_class, owner_class):
    """Yield the following ``div.item_class`` elements that belong to *owner*.

    Iteration stops at the first item whose nearest preceding
    ``div.owner_class`` is not *owner* itself.
    """
    for item in owner.find_all_next('div', class_=item_class):
        if item.find_previous('div', class_=owner_class) is not owner:
            return
        yield item


def _extract_sub_subclauses(subclause):
    """Return the list of sub-subclause dicts found under *subclause*."""
    extracted = []
    for item in _owned_items(subclause, 'law-number2', 'law-number1'):
        body_div, body_text = _sibling_text(item, 'law-content3')
        _, main_text = _sibling_text(body_div, 'law-main')
        extracted.append({
            "sub_subclause": item.get_text(strip=True).strip('()'),
            "sub_subclause text": f"{body_text} {main_text}",
        })
    return extracted


def _extract_subclauses(clause):
    """Return the list of subclause dicts (with nested sub-subclauses) under *clause*."""
    extracted = []
    for item in _owned_items(clause, 'law-number1', 'law-number'):
        body_div, body_text = _sibling_text(item, 'law-content1')
        _, main_text = _sibling_text(body_div, 'law-main')
        extracted.append({
            "subclause": item.get_text(strip=True).strip('()'),
            "subclause text": f"{body_text} {main_text}",
            "sub_subclauses": _extract_sub_subclauses(item),
        })
    return extracted


def _extract_clause(clause):
    """Build the dict for one clause from its ``div.law-number`` element.

    The result holds the clause number, its title (``div.law-desc``), its own
    text (``div.law-main``) and its nested subclauses.
    """
    desc_div, title = _sibling_text(clause, 'law-desc')
    _, main_text = _sibling_text(desc_div, 'law-main')
    # Text that opens with "(א)" is discarded here; the subclauses are
    # extracted separately below.
    if main_text.startswith('(א)'):
        main_text = ''
    return {
        "clause": clause.get_text(strip=True).strip('.'),
        "clause title": title,
        "clause text": main_text,
        "subclauses": _extract_subclauses(clause),
    }


# --- Script ------------------------------------------------------------------

# Read the HTML content
with open('takanon.htm', 'r', encoding='utf-8') as file:
    content = file.read()

soup = BeautifulSoup(content, 'html.parser')

# Nested structure: part title -> chapter title -> list of clauses,
# or part title -> {"clauses": [...]} for parts without chapters.
document_structure = {}

# Extract Parts (חלק)
for part in soup.find_all('h1'):
    part_title = part.get_text(strip=True)
    current_part = document_structure[part_title] = {}

    # Extract Chapters (פרק) within the part
    chapters = part.find_all_next('h2')
    if not chapters or chapters[0].find_previous('h1') is not part:
        # No chapter directly under this part: collect its clauses under the
        # placeholder key "clauses".
        print(f"No chapters found under {part_title}")
        part_clauses = current_part["clauses"] = []
        for clause in part.find_all_next('div', class_='law-number'):
            if clause.find_previous('h1') is not part:
                break
            part_clauses.append(_extract_clause(clause))
        continue

    for chapter in chapters:
        if chapter.find_previous('h1') is not part:
            break
        chapter_title = chapter.get_text(strip=True)
        current_chapter = current_part[chapter_title] = []

        # Extract Clauses (סעיף) within the chapter. Stop as soon as a clause
        # falls under another chapter OR another part; requiring both (as an
        # ``and`` would) lets a chapter swallow the clauses of every later
        # chapter in the same part.
        for clause in chapter.find_all_next('div', class_='law-number'):
            if clause.find_previous('h2') is not chapter or clause.find_previous('h1') is not part:
                break
            current_chapter.append(_extract_clause(clause))

# Handle misplaced clauses under "תוכן עניינים" (tolerates the part being absent)
document_structure.get('תקנון הכנסת', {}).pop("תוכן עניינים", None)

# Make sure the target directory exists before writing into it
os.makedirs('output', exist_ok=True)

# Save as JSON
with open('output/document_structure.json', 'w', encoding='utf-8') as json_file:
    json.dump(document_structure, json_file, ensure_ascii=False, indent=4)

# Save as YAML
with open('output/document_structure.yaml', 'w', encoding='utf-8') as yaml_file:
    yaml.dump(document_structure, yaml_file, allow_unicode=True, sort_keys=False)

### Split into per-clause TXT files
Load the JSON file produced in the previous step and split its content into TXT files; each file contains a single clause together with all of its relevant details.

In [10]:
import os
import json

# Read back the nested part -> chapter -> clause structure from the JSON file
with open('output/document_structure.json', 'r', encoding='utf-8') as structure_file:
    document_structure = json.load(structure_file)

# Function to ensure the output directory exists
def ensure_dir(directory):
    """Create *directory* (including missing parents) if it does not exist.

    Uses ``exist_ok=True`` rather than a separate existence check, so there is
    no window between "check" and "create" in which another process could
    create the directory and make ``os.makedirs`` fail.
    """
    os.makedirs(directory, exist_ok=True)

# Function to create a filename based on part, chapter, and clause
def create_filename(part, chapter, clause_id):
    """Return the ``.txt`` filename for a clause.

    Args:
        part: Title of the part the clause belongs to.
        chapter: Title of the chapter, or an empty/falsy value when the clause
            sits directly under the part.
        clause_id: Clause number.

    Returns:
        ``<part>_<chapter>_<clause_id>.txt`` (or ``<part>_<clause_id>.txt``
        without a chapter), with spaces turned into underscores and the
        characters ``:``, ``?`` and ``!`` removed.
    """
    # Both branches use the same extension; previously the chapter-less branch
    # produced ".json" while the other produced ".txt".
    stem = f"{part}_{chapter}_{clause_id}" if chapter else f"{part}_{clause_id}"
    # One pass: space -> underscore, drop ":", "?" and "!".
    return f"{stem}.txt".translate(str.maketrans(" ", "_", ":?!"))

# Ensure the output directory exists
output_dir = 'output'
ensure_dir(output_dir)

# Iterate over parts, chapters, and clauses to create the files.
# A part either maps the placeholder key "clauses" to its clause list (no
# chapters) or maps each chapter title to that chapter's clause list.
for part, chapters in document_structure.items():
    if "clauses" in chapters:
        # ``None`` marks "no chapter" so that no "chapter" key is emitted.
        groups = [(None, chapters["clauses"])]
    else:
        groups = chapters.items()

    for chapter, clauses in groups:
        for clause in clauses:
            filename = create_filename(part, chapter or "", clause["clause"])
            filepath = os.path.join(output_dir, filename)

            clause_with_ancestors = {"part": part}
            if chapter is not None:
                clause_with_ancestors["chapter"] = chapter
            clause_with_ancestors["clause"] = clause["clause"]
            # Keep the clause title when the extraction step recorded one, so
            # the per-clause file does not lose it.
            if "clause title" in clause:
                clause_with_ancestors["clause title"] = clause["clause title"]
            clause_with_ancestors["clause text"] = clause["clause text"]
            clause_with_ancestors["subclauses"] = clause["subclauses"]

            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(clause_with_ancestors, f, ensure_ascii=False, indent=4)

print("Clauses have been split into separate TXT files.")


Clauses have been split into separate TXT files.


In [19]:
import json
import os

# Create the output directory if it doesn't exist
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Load the JSON data from where the extraction step saves it
# (output/document_structure.json), not from the working directory.
with open(os.path.join(output_dir, 'document_structure.json'), 'r', encoding='utf-8') as json_file:
    document_structure = json.load(json_file)

def write_clause_to_file(clause_id, clause_content, part_title=None, chapter_title=None):
    """Write one clause to ``<output_dir>/<clause_id>.txt``.

    The file lists the optional part and chapter titles, the clause number and
    text, then any subclauses and their sub-subclauses as an indented list,
    and ends with a ``----`` separator line.

    Args:
        clause_id: Clause number; also used as the file name stem.
        clause_content: Clause dict with "clause text" and, optionally,
            "subclauses" (each with "subclause", "subclause text" and an
            optional "sub_subclauses" list).
        part_title: Title of the enclosing part; omitted from the file if falsy.
        chapter_title: Title of the enclosing chapter; omitted if falsy.
    """
    def render():
        # Ancestor headers come first and are skipped when not provided.
        if part_title:
            yield f'"חלק": "{part_title}"\n\n'
        if chapter_title:
            yield f'"פרק": "{chapter_title}"\n\n'

        yield f'"סעיף": "{clause_id}"\n\n'
        yield f'"תוכן הסעיף": "{clause_content["clause text"]}"\n\n'

        nested = clause_content.get("subclauses")
        if nested:
            yield '"תתי-סעיפים":\n\n'
            for subclause in nested:
                yield f'   - "תת-סעיף": "{subclause["subclause"]}"\n'
                yield f'     "תוכן תת-סעיף": "{subclause["subclause text"]}"\n\n'
                deeper = subclause.get("sub_subclauses")
                if not deeper:
                    continue
                yield '     "תתי תת-סעיף":\n'
                for sub_subclause in deeper:
                    yield f'       - "תת-תת-סעיף": "{sub_subclause["sub_subclause"]}"\n'
                    yield f'         "תוכן תת-תת-סעיף": "{sub_subclause["sub_subclause text"]}"\n\n'

        yield "----\n"

    # Define the filename
    target_path = os.path.join(output_dir, f"{clause_id}.txt")

    # The generator is consumed lazily, so lines are written in order as they
    # are produced.
    with open(target_path, 'w', encoding='utf-8') as out:
        out.writelines(render())

# Walk every part and each of its clause groups, writing one file per clause.
for part_title, part_content in document_structure.items():
    for chapter_title, chapter_content in part_content.items():
        # The key "clauses" holds clauses that sit directly under the part,
        # so those files get no chapter header.
        chapter_for_file = None if chapter_title == "clauses" else chapter_title
        for clause_content in chapter_content:
            clause_id = clause_content["clause"]
            write_clause_to_file(
                clause_id,
                clause_content,
                part_title=part_title,
                chapter_title=chapter_for_file,
            )
