In [8]:
# Convert Sujato paragraph headers: '#### 3' -> '3\.'
import sys
import re
import os

def process_markdown_headers(input_filepath, output_filepath):
    r"""
    Convert '#### <number/range>' headers of a markdown file into numbered paragraphs.

    A line of the form '#### 12' or '#### 12--15', optionally followed by text,
    is replaced with '12\. <text>' (respectively '12--15\. <text>'). When the
    header line carries no text, the blank lines after it are dropped and the
    first non-blank line that follows supplies the text. Every other line is
    copied unchanged.

    The whole input is read before the output file is opened, so the same path
    may be passed for both arguments to convert a file in place.

    NOTE: the first non-blank line after a text-less header is used as the
    paragraph text whatever it contains (even another header), and a text-less
    header followed only by blank lines up to the end of the file is dropped.

    Errors are not propagated: any exception is reported on stderr and the
    function returns normally.

    Args:
        input_filepath (str): The path to the input markdown file.
        output_filepath (str): The path where the modified content will be saved.
    """
    # Regex explanation:
    # ^####\x20       - '####' at the start of the line followed by one space.
    # (\d+(--\d+)?)  - Group 1: a number, optionally followed by '--' and a
    #                  second number (group 2 is that optional '--<digits>' part).
    # \s*            - Optional whitespace after the number/range.
    # (.*)           - Group 3: the rest of the line (may be empty).
    # $              - End of the line.
    header_pattern = re.compile(r'^#### (\d+(--\d+)?)\s*(.*)$')

    try:
        # Ensure the directory for the output file exists.
        output_dir = os.path.dirname(output_filepath)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Read everything first: opening the output with 'w' truncates it, which
        # would wipe the input when both paths point at the same file.
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()

        # Number/range of a text-less header that still waits for its paragraph.
        pending_number = None

        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            for line in lines:
                if pending_number is not None:
                    stripped_line = line.strip()
                    if stripped_line:
                        # First non-blank line after the header: emit
                        # '<number>\. <text>' ('\\.' yields a literal backslash
                        # so markdown does not start an ordered list).
                        outfile.write(f"{pending_number}\\. {stripped_line}\n")
                        pending_number = None
                    # Blank lines between the header and its text are dropped.
                    continue

                match = header_pattern.match(line)
                if not match:
                    # Ordinary line: copy as is.
                    outfile.write(line)
                    continue

                number_or_range = match.group(1)
                rest_of_line = match.group(3).strip()
                if rest_of_line:
                    # Header and text share a line: convert immediately.
                    outfile.write(f"{number_or_range}\\. {rest_of_line}\n")
                else:
                    # Header only: remember the number, write nothing yet.
                    pending_number = number_or_range

        print(f"Successfully processed '{input_filepath}' and saved to '{output_filepath}'")

    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)




# Process the file
input_file = "../.docsource/kinh-tang-chi/sujato/1.md"  # Source markdown file to read
output_file = "../.docsource/kinh-tang-chi/sujato/1-1.md"  # Destination for the converted copy
process_markdown_headers(input_file, output_file)

Successfully processed '../.docsource/kinh-tang-chi/sujato/1.md' and saved to '../.docsource/kinh-tang-chi/sujato/1-1.md'


In [None]:
# Convert '#### N' headers to 'N\.' paragraphs and add a <!--pg--> marker after each block
import sys
import re
import os

def process_markdown_headers(input_filepath, output_filepath):
    r"""
    Convert '#### <number/range>' blocks into numbered paragraphs followed by '<!--pg-->'.

    For every line of the form '#### 12' or '#### 12--15' (optionally followed
    by text) the function collects the content block that follows it:

    * blank lines directly after the header line are skipped (also when the
      header line itself carried text);
    * the block ends at the first blank line after content, at any line that
      starts with '#' (optionally indented), or at the end of the file. The
      terminating line is not part of the block and is processed normally.

    The block is written as '<number/range>\. <first line>' followed by the
    remaining lines unchanged, then a blank line and the '<!--pg-->' marker.
    The marker always ends up on a line of its own: when the block is ended by
    a header or by the end of the file a newline is added after it. A header
    with no text and no content block is dropped. All other lines are copied
    unchanged.

    The whole input is read before the output file is opened, so the same path
    may be passed for both arguments to convert a file in place.

    Args:
        input_filepath (str): The path to the input markdown file.
        output_filepath (str): The path where the modified content will be saved.

    Raises:
        SystemExit: with status 1 after printing a message to stderr when the
            input is missing, permissions are insufficient, or any other
            error occurs.
    """
    # Target H4 header: group 1 is the number/range, group 3 the optional text.
    header_pattern = re.compile(r'^#### (\d+(--\d+)?)\s*(.*)$')
    # Any markdown header (H1, H2, ...), optionally indented.
    any_header_pattern = re.compile(r'^\s*#+')

    try:
        # Ensure the directory for the output file exists.
        output_dir = os.path.dirname(output_filepath)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        # Read everything first: lookahead needs random access, and opening the
        # output with 'w' would wipe the input if both paths are the same file.
        with open(input_filepath, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()

        with open(output_filepath, 'w', encoding='utf-8') as outfile:
            i = 0
            while i < len(lines):
                line = lines[i]
                is_h4 = header_pattern.match(line)

                if not is_h4:
                    # Not a target header: copy the line as is.
                    outfile.write(line)
                    i += 1
                    continue

                number_or_range = is_h4.group(1)
                rest_of_h4_line = is_h4.group(3).strip()

                # Text on the header line, if any, is the first content line.
                content_lines = [rest_of_h4_line + '\n'] if rest_of_h4_line else []
                first_content_found = False  # seen a non-blank line after the header?

                # Look ahead for the rest of the content block.
                j = i + 1
                while j < len(lines):
                    next_line = lines[j]
                    if any_header_pattern.match(next_line):
                        break  # any header terminates the block
                    if next_line.strip() == '':
                        if first_content_found:
                            break  # blank line after content terminates the block
                        j += 1  # leading blank line: skip it
                        continue
                    content_lines.append(next_line)
                    first_content_found = True
                    j += 1

                if content_lines:
                    # '\\.' yields a literal backslash so markdown does not
                    # turn the paragraph into an ordered-list item.
                    first_line_content = content_lines[0].strip()
                    outfile.write(f"{number_or_range}\\. {first_line_content}\n")

                    for subsequent_line in content_lines[1:]:
                        outfile.write(subsequent_line)
                    # The last input line may lack its newline at end of file;
                    # terminate it so the blank separator below is really blank.
                    if len(content_lines) > 1 and not content_lines[-1].endswith('\n'):
                        outfile.write("\n")

                    # Blank line, then the marker.
                    outfile.write("\n")
                    outfile.write("<!--pg-->")
                    # A blank terminator line supplies the marker's newline when
                    # it is copied on the next iteration. A header or the end of
                    # the file does not, so end the marker line explicitly.
                    if j >= len(lines) or lines[j].strip() != '':
                        outfile.write("\n")

                # Continue with the terminator line (or stop at end of file).
                i = j

        print(f"Successfully processed '{input_filepath}' and saved to '{output_filepath}'")

    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_filepath}'", file=sys.stderr)
        sys.exit(1)
    except PermissionError:
        print(f"Error: Permission denied to access files. Check permissions for '{input_filepath}' or '{output_filepath}'", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)

input_file = "../.docsource/kinh-tang-chi/sujato/1.md"  # Source markdown file to read
output_file = "../.docsource/kinh-tang-chi/sujato/1-1.md"  # Destination for the converted copy
process_markdown_headers(input_file, output_file)

Successfully processed '../.docsource/kinh-tang-chi/sujato/1.md' and saved to '../.docsource/kinh-tang-chi/sujato/1-1.md'


  """


In [10]:
# add paragraph for thichminhchau

import argparse
import re
import sys
import os

def modify_markdown(input_file_path: str, output_file_path: str) -> None:
    r"""
    Insert a '<!--pg-->' line before every numbered paragraph of a Markdown file.

    A numbered paragraph is a line that starts with a number or a hyphenated
    number range (e.g. '7' or '7-10'), followed by either an escaped period
    ('\.') or a plain period ('.'), followed by a space. The marker is written
    on its own line directly before such a line; every input line is copied
    unchanged.

    The whole input is read before the output file is opened, so the same path
    may be passed for both arguments to modify a file in place.

    Args:
        input_file_path: Path to the input Markdown file.
        output_file_path: Path to the output Markdown file. Its directory is
            created if it does not exist.

    Raises:
        SystemExit: with status 1 after printing a message to stderr when the
            input file is missing, on any other I/O error, or on any
            unexpected exception.
    """
    # Regex for the start of a numbered paragraph:
    # ^              - anchor at the start of the line
    # \d+            - one or more digits
    # (?:-\d+)?      - optionally a hyphen and more digits (ranges like 7-10)
    # (?:\\\.|\.)    - a backslash-escaped period '\.' or a plain period '.'
    # ' '            - a single space
    pattern = re.compile(r"^\d+(?:-\d+)?(?:\\\.|\.) ")

    print(f"Processing '{input_file_path}'...")

    try:
        # Ensure the output directory exists if specified as part of the path.
        output_dir = os.path.dirname(output_file_path)
        if output_dir and not os.path.exists(output_dir):
            os.makedirs(output_dir)
            print(f"Created output directory: '{output_dir}'")

        # Read first: opening the output with 'w' truncates it, which would
        # wipe the input when both paths point at the same file.
        with open(input_file_path, 'r', encoding='utf-8') as infile:
            lines = infile.readlines()

        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            for line in lines:
                if pattern.match(line):
                    # Marker goes on its own line before the paragraph.
                    outfile.write("<!--pg-->\n")
                # The original line already carries its own newline.
                outfile.write(line)

        print(f"Successfully modified content written to '{output_file_path}'.")

    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_file_path}'", file=sys.stderr)
        sys.exit(1)
    except IOError as e:
        print(f"Error reading or writing file: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
        sys.exit(1)

input_file = "../.docsource/kinh-tang-chi/thichminhchau/01-002-pham-doan-trien-cai.md"  # Source markdown file to read
output_file = "../.docsource/kinh-tang-chi/thichminhchau/01-002-pham-doan-trien-cai-1.md"  # Destination for the modified copy
modify_markdown(input_file, output_file)

Processing '../.docsource/kinh-tang-chi/thichminhchau/01-002-pham-doan-trien-cai.md'...
Successfully modified content written to '../.docsource/kinh-tang-chi/thichminhchau/01-002-pham-doan-trien-cai-1.md'.


  """


In [None]:
import os
# Collect the path of every '.md' file directly inside the thichminhchau directory.
file_list = [os.path.join('../.docsource/kinh-tang-chi/thichminhchau/', f) for f in os.listdir('../.docsource/kinh-tang-chi/thichminhchau/') if f.endswith('.md')]

# Convert each file, passing the same path as input and output (in-place rewrite).
# NOTE(review): convert_markdown is not defined in the cells visible here —
# confirm it exists elsewhere in the notebook, and that it tolerates the input
# and output being the same file, before running this cell.
for f in file_list:
    convert_markdown(f, f)
