### Extract procedures
Load the HTML file and extract its content while keeping the original structure:
- Parts (חלק)
- Chapters (פרק)
- Clauses (סעיפים)
- Subclauses (תתי סעיפים)
- Sub-subclauses (תת-תתי סעיפים)

Save the output as JSON and YAML.

In [20]:
import json
import yaml
from bs4 import BeautifulSoup
import re

# Matches any run of whitespace; clean() uses it to collapse such runs
# into a single space.
WS = re.compile(r'\s+', re.UNICODE | re.MULTILINE)

# Read the whole source HTML document into memory as UTF-8 text.
with open('takanon.htm', 'r', encoding='utf-8') as file:
    content = file.read()

# Parse the document with the 'html.parser' backend.
soup = BeautifulSoup(content, 'html.parser')

# Top-level list of extracted parts. The parsing loop below appends to it,
# and it is serialized to JSON and YAML at the end of this cell.
document_structure = []

def clean(text):
    """Return *text* with every whitespace run replaced by one space and both ends trimmed."""
    single_spaced = WS.sub(' ', text)
    return single_spaced.strip()

def clean_el(el):
    """Return the normalized text of element *el*, without its amendment note.

    The text is read with ``el.get_text(strip=True)``; everything from the
    first occurrence of the marker ``'[תיקון'`` onwards is discarded, and the
    remainder is passed through :func:`clean`.
    """
    raw_text = el.get_text(strip=True)
    before_marker, _, _ = raw_text.partition('[תיקון')
    return clean(before_marker)

# Walk the heading/div elements in document order and rebuild the hierarchy
#   part -> chapter -> clause -> subclause -> sub-subclause
# from the CSS class carried by each element.
#
# The names below are cursors: the singular ones point at the record that is
# currently being filled, the plural ones at the list new children go into.
# Whenever a new record is opened at some level, every cursor *below* that
# level is reset to None so that a stray child element can never be written
# into a record belonging to an earlier part/chapter/clause/subclause.
part = None
chapter = None
clause = None
subclause = None
sub_subclause = None
chapters = None
clauses = None
subclauses = None
sub_subclauses = None

for el in soup.find_all(['div', 'h2', 'h1', 'h3'], recursive=True):
    cls = el.attrs.get('class') or []
    if 'law-part' in cls:
        # Heading text is "<number>:<title>"; partition() tolerates a
        # heading without a colon (the title is then empty) instead of
        # raising ValueError.
        number, _, title = clean_el(el).partition(':')
        part = dict(
            number=number,
            title=clean(title),
            chapters=[]
        )
        chapters = part['chapters']
        document_structure.append(part)
        chapter = None
        clauses = None
        clause = None
        subclauses = None
        subclause = None
        sub_subclauses = None
        sub_subclause = None
        # print('>>', part['title'])

    elif 'law-section' in cls:
        if chapters is None:
            print('*** No part for chapter *** {}'.format(el.get_text(strip=True),))
            continue
        number, _, title = clean_el(el).partition(':')
        chapter = dict(
            number=number,
            title=clean(title),
            clauses=[]
        )
        clauses = chapter['clauses']
        chapters.append(chapter)
        clause = None
        subclauses = None
        subclause = None
        sub_subclauses = None
        sub_subclause = None
        # print('>>>>', chapter['title'])

    elif 'law-subsection' in cls:
        subsection = el.get_text(strip=True)
        if not chapters:
            # No part yet, or the current part has no chapter to attach to.
            print('*** No chapter for subsection *** {}'.format(subsection))
            continue
        last_chapter = chapters[-1]
        if last_chapter.get('subtitle') is None:
            # First subsection of the chapter: record it on the chapter itself.
            last_chapter['subtitle'] = subsection
        else:
            # Further subsections become sibling chapters that repeat the
            # number/title of the previous one and carry their own subtitle.
            new_chapter = dict(
                number=last_chapter['number'],
                title=last_chapter['title'],
                clauses=[]
            )
            new_chapter['subtitle'] = subsection
            chapters.append(new_chapter)
            chapter = new_chapter
            clauses = new_chapter['clauses']
            clause = None
            subclauses = None
            subclause = None
            sub_subclauses = None
            sub_subclause = None

    elif 'law-number' in cls:
        if clauses is None:
            if chapters is None:
                # Clause before any part: there is nowhere to attach it.
                print('*** No part for clause *** {}'.format(el.get_text(strip=True),))
                continue
            # Clause directly under a part: open an unnamed chapter for it.
            print('*** No chapter for clause *** {}'.format(el.get_text(strip=True),))
            chapter = dict(
                number='',
                title='',
                clauses=[]
            )
            clauses = chapter['clauses']
            chapters.append(chapter)

        clause = dict(
            number=clean_el(el).rstrip('.'),
            title=None,
            text=None,
            subclauses=[]
        )
        clauses.append(clause)
        subclauses = clause['subclauses']
        subclause = None
        sub_subclauses = None
        sub_subclause = None
        # print('>>>>>>', clause['number'])

    elif 'law-desc' in cls:
        if clause is None:
            print('*** No clause for title *** {}'.format(el.get_text(strip=True),))
            continue
        clause['title'] = clean_el(el)

    elif 'law-content' in cls:
        # Content outside any clause is skipped silently, as before.
        if clause is not None:
            clause['text'] = clean_el(el)

    elif 'law-number1' in cls:
        if subclauses is None:
            print('*** No clause for subclause *** {}'.format(el.get_text(strip=True),))
            continue
        subclause = dict(
            number=clean_el(el),
            text=None,
            sub_subclauses=[]
        )
        subclauses.append(subclause)
        sub_subclauses = subclause['sub_subclauses']
        sub_subclause = None
        # print('>>>>>>>>', subclause['number'])

    elif 'law-content1' in cls:
        if subclause is None:
            print('*** No subclause for text *** {}'.format(el.get_text(strip=True),))
            continue
        subclause['text'] = clean_el(el)

    elif 'law-number2' in cls:
        if sub_subclauses is None:
            print('*** No subclause for sub-subclause *** {}'.format(el.get_text(strip=True),))
            continue
        sub_subclause = dict(
            number=clean_el(el),
            text=None
        )
        sub_subclauses.append(sub_subclause)
        # print('>>>>>>>>>>', sub_subclause['number'])

    # NOTE(review): the text class here is 'law-content3' while the matching
    # number class is 'law-number2' -- confirm against the HTML that this is
    # the class actually used (and not 'law-content2').
    elif 'law-content3' in cls:
        if sub_subclause is None:
            print('*** No sub-subclause for text *** {}'.format(el.get_text(strip=True),))
            continue
        sub_subclause['text'] = clean_el(el)

# Save as JSON. ensure_ascii=False writes the Hebrew text as-is instead of
# \uXXXX escapes; indent=4 pretty-prints the file.
with open('output/document_structure.json', 'w', encoding='utf-8') as json_file:
    json.dump(document_structure, json_file, ensure_ascii=False, indent=4)

# Save as YAML. allow_unicode=True writes non-ASCII characters unescaped;
# sort_keys=False keeps each mapping's keys in insertion order.
with open('output/document_structure.yaml', 'w', encoding='utf-8') as yaml_file:
    yaml.dump(document_structure, yaml_file, allow_unicode=True, sort_keys=False)


*** No part for chapter *** תוכן עניינים
*** No chapter for clause *** 1.
*** No chapter for clause *** 138.


### Split to specific clauses markdown files
Load the JSON file produced in the previous step and split its content into Markdown files. Each file contains a single clause with all of its relevant details.

In [21]:
import json

# Upper bound, in characters, on the length of one generated markdown chunk;
# a clause whose rendering exceeds it is split into several chunks.
MAX_CHUNK_LENGTH = 100000

# Load the JSON data
with open('output/document_structure.json', 'r', encoding='utf-8') as json_file:
    document_structure = json.load(json_file)

def format_path_part(part):
    """Render one path element (a part or chapter dict) as a display label.

    Returns ``"<number> - <title>"``, followed by ``" (<subtitle>)"`` when a
    non-empty subtitle is present. Returns ``None`` when either the number
    or the title is missing or empty.
    """
    number = part.get('number')
    title = part.get('title')
    if not (number and title):
        return None

    label = f'{number} - {title}'
    subtitle = part.get('subtitle')
    return f'{label} ({subtitle})' if subtitle else label

def all_clauses(structure):
    """Yield ``(path, clause)`` for every clause in *structure*, in document order.

    *path* is the two-element list ``[part, chapter]`` that contains the
    clause; the same list object is yielded for all clauses of one chapter.
    Parts without a ``"chapters"`` key and chapters without a ``"clauses"``
    key contribute nothing.
    """
    for part in structure:
        chapters = part.get("chapters", [])
        for chapter in chapters:
            path = [part, chapter]
            yield from ((path, clause) for clause in chapter.get("clauses", []))

def _clause_header(path, clause, continued):
    """Return the lines that open every markdown chunk of *clause*.

    The lines are: the bold title (suffixed with ' (המשך)' when *continued*),
    an empty line, the '(מקור: ...)' source line built from *path* plus the
    clause label, and the clause's own text when it has any.
    """
    label = f'סעיף {clause["number"]}'
    path_labels = [p for p in map(format_path_part, path) if p]
    # Joined outside the f-string: nesting the same quote type inside an
    # f-string replacement field is only valid from Python 3.12 on.
    source = ' / '.join(path_labels + [label])
    # A clause without a title falls back to its label instead of rendering
    # the literal text "None".
    title = clause.get('title') or label
    suffix = ' (המשך)' if continued else ''
    lines = [f'**{title}{suffix}**', '', f'(מקור: {source})']
    if clause.get('text'):
        lines.append(clause['text'])
    return lines


def _clause_chunks(path, clause, limit):
    """Render *clause* as a list of markdown strings, each at most *limit* chars where possible.

    Subclauses are packed greedily. A subclause (with its sub-subclauses)
    that does not fit after others starts a new, "continued" chunk. A
    subclause that does not fit even on its own is split between its
    sub-subclauses, repeating the header and the subclause line in every
    piece. A single line that alone exceeds *limit* is emitted as-is rather
    than aborting.
    """
    chunks = []
    current = _clause_header(path, clause, continued=False)
    has_items = False  # True once `current` holds at least one subclause

    def fits(lines):
        return len('\n'.join(lines)) <= limit

    for sc in clause.get('subclauses') or []:
        sc_line = f' * {sc["number"]} {sc.get("text") or ""}'
        ssc_lines = [
            f'   * {ssc["number"]} {ssc.get("text") or ""}'
            for ssc in sc.get('sub_subclauses') or []
        ]
        block = [sc_line] + ssc_lines

        if has_items and not fits(current + block):
            # Close the chunk built so far and retry this subclause in a
            # fresh continuation chunk.
            chunks.append('\n'.join(current))
            current = _clause_header(path, clause, continued=True)
            has_items = False

        if fits(current + block):
            current += block
            has_items = True
            continue

        # The subclause is too large even alone: split it between its
        # sub-subclauses, always keeping at least one per piece.
        current.append(sc_line)
        piece_has_ssc = False
        for line in ssc_lines:
            if piece_has_ssc and not fits(current + [line]):
                chunks.append('\n'.join(current))
                current = _clause_header(path, clause, continued=True) + [sc_line]
                piece_has_ssc = False
            current.append(line)
            piece_has_ssc = True
        chunks.append('\n'.join(current))
        current = _clause_header(path, clause, continued=True)
        has_items = False

    # Emit the trailing chunk unless it is a bare header that follows
    # already-emitted pieces.
    if has_items or not chunks:
        chunks.append('\n'.join(current))
    return chunks


# Build the markdown chunks of every clause, write each chunk to its own
# file, and return all chunks joined into one string.
def generate_full_markdown_content(structure, *, max_chunk_length=None, output_dir='output_markdown'):
    """Render every clause of *structure* to markdown and write per-clause files.

    Args:
        structure: list of part dicts as stored in document_structure.json.
        max_chunk_length: maximum chunk length in characters; defaults to
            the module-level MAX_CHUNK_LENGTH.
        output_dir: directory for the per-clause files; created if missing.

    Clauses with neither text nor subclauses are skipped. A clause rendered
    as one chunk is written to '<number>.md'; a clause split into several
    chunks is written to '<number>_<i>.md' with i counting from 0.

    Returns:
        All chunks, in document order, joined with '\\n\\nPART\\n\\n'.
    """
    import os  # local import: the surrounding cell only imports json

    limit = MAX_CHUNK_LENGTH if max_chunk_length is None else max_chunk_length
    os.makedirs(output_dir, exist_ok=True)

    markdown_content = []
    for path, clause in all_clauses(structure):
        if not clause.get('text') and not clause.get('subclauses'):
            continue
        chunks = _clause_chunks(path, clause, limit)
        markdown_content.extend(chunks)
        for i, chunk in enumerate(chunks):
            if len(chunks) > 1:
                name = f'{clause["number"]}_{i}'
            else:
                name = clause["number"]
            with open(f'{output_dir}/{name}.md', 'w', encoding='utf-8') as f:
                f.write(chunk)

    return '\n\nPART\n\n'.join(markdown_content)

# Generate the full markdown content (this call also writes the per-clause
# files under output_markdown/).
full_markdown_content = generate_full_markdown_content(document_structure)

# Write the combined content to a single markdown file
with open('output/document_structure.md', 'w', encoding='utf-8') as file:
    file.write(full_markdown_content)

# Report the location of the combined file
print("A single structured Markdown file has been generated: 'output/document_structure.md'")


A single structured Markdown file has been generated: 'output/document_structure.md'
