In [6]:
"""
export const OCR_SYSTEM_PROMPT = `
Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like '''markdown or '''html.

RULES:
  - You must include all information on the page. Do not exclude headers, footers, charts, infographics, or subtext.
  - Return tables in an HTML format.
  - Logos should be wrapped in brackets. Ex: <logo>Coca-Cola<logo>
  - Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY<watermark>
  - Page numbers should be wrapped in brackets. Ex: <page_number>14<page_number> or <page_number>9/22<page_number>
  - Prefer using ☐ and ☑ for check boxes.
`;

export const JSON_EXTRACTION_SYSTEM_PROMPT = `
  Extract the following JSON schema from the text and images if provided.
  Return only the JSON with no explanation text.
`;

export const IMAGE_EXTRACTION_SYSTEM_PROMPT = `
  Extract the following JSON schema from the image.
  Return only the JSON with no explanation text.
`;
"""

from PyPDF2 import PdfReader, PdfWriter
import os

def extract_pdf_pages(input_pdf, output_pdf, start_page: int, end_page: int) -> None:
    """
    Extracts a range of pages from a PDF file and saves them as a new PDF.

    An ``end_page`` past the last page is clamped to the last page instead of
    being rejected, so a caller may pass a large value to mean "through the
    end of the document".

    Args:
        input_pdf: Path to the input PDF file
        output_pdf: Path to save the extracted PDF file
        start_page: Starting page number (0-based index)
        end_page: Ending page number (0-based index, inclusive)

    Raises:
        ValueError: If ``start_page`` is negative, or if it is greater than
            ``end_page`` after clamping (this covers a ``start_page`` beyond
            the last page as well as an input PDF with no pages).
    """
    reader = PdfReader(input_pdf)
    total_pages = len(reader.pages)

    # Clamp before validating. Once clamped, end_page can never exceed the last
    # index, so the only remaining ways for the range to be invalid are a
    # negative start or a start that lies after the (clamped) end.
    end_page = min(end_page, total_pages - 1)
    if start_page < 0 or start_page > end_page:
        raise ValueError(f"Invalid page range. PDF has {total_pages} pages.")

    # Copy the selected pages, in order, into the new document.
    writer = PdfWriter()
    for page_num in range(start_page, end_page + 1):
        writer.add_page(reader.pages[page_num])

    # Nothing is written to disk unless validation above succeeded.
    with open(output_pdf, "wb") as output_file:
        writer.write(output_file)

    # Report 1-based page numbers, which is how PDF viewers label pages.
    print(f"Created PDF with pages {start_page+1} to {end_page+1} at {output_pdf}")

# NOTE(review): `filename` is not passed to the extract_pdf_pages call below — confirm whether it is still needed.
filename = '../../docsource/trungbo-eng-nanamoli-bodhi2.out1.pdf'
# Page ranges from earlier runs — presumably "part, start_page, end_page" (TODO confirm):
# 1, 4, 58
# 2, 59, 73
# 3, 435, 814

# Keyword arguments for this run. end_page=5000 is deliberately larger than the
# document: extract_pdf_pages clamps it to the last page (the recorded output
# below reports pages 502 to 1092).
args = {"output_pdf": '../.docsource/kinhtuongung/tuongungbokinh-2.pdf', "start_page": 501, "end_page": 5000}
# p0 / p1: download URLs; neither is referenced by the code visible in this notebook.
p0="https://drive.usercontent.google.com/u/0/uc?id=1dOGYtQABuCOlVdY8WmLlZJShG1jCYKkm&export=download"
# Alternative argument set from an earlier run (kept for reference):
# args = {"output_pdf": f'../../docsource/trungbo-eng-nanamoli-bodhi.part{2}.pdf', "start_page": 435, "end_page": 814}
p1 = "https://drive.usercontent.google.com/u/0/uc?id=1OQyCzluVkULUSNFrhMcEm1dSrc_TFbt0&export=download"
# Alternative argument set from an earlier run (kept for reference):
# args = {"output_pdf": f'../../docsource/trungbo-eng-nanamoli-bodhi.part{3}.pdf', "start_page": 819, "end_page": 1143}
# NOTE(review): these paths start with '../.docsource' while `filename` above uses '../../docsource' — confirm both locations are intended.
extract_pdf_pages('../.docsource/kinhtuongung/tuongungbokinh.pdf', **args)



Created PDF with pages 502 to 1092 at ../.docsource/kinhtuongung/tuongungbokinh-2.pdf


In [15]:
# step 2 : mistral for ocr
import os
from mistralai import Mistral
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the .env file located by find_dotenv(); override=True
# asks dotenv to replace values that are already present in the environment.
load_dotenv(find_dotenv(), override=True)

# os.getenv returns None when MISTRALAI_KEY is not defined.
api_key = os.getenv("MISTRALAI_KEY")
client = Mistral(api_key=api_key)

# Send the remotely hosted document to the OCR model. The cells that follow
# read the per-page markdown from `ocr_response.pages`.
ocr_response = client.ocr.process(
    document=dict(
        type="document_url",
        document_url="https://drive.usercontent.google.com/uc?id=1IEQ6bIZTBhqfR6LodrjcBZhDyiQIIsdn&export=download",
    ),
    model="mistral-ocr-latest",
)

In [16]:
# Show the markdown of the first OCR page as a quick sanity check of the result.
print(ocr_response.pages[0].markdown)


5 (5) Wealth in Brief
"Bhikkhus, there are these seven kinds of wealth. What seven? The wealth of faith, the wealth of virtuous behavior, the wealth of moral shame, the wealth of moral dread, the wealth of learning, the wealth of generosity, and the wealth of wisdom. [5] These are the seven kinds of wealth."

The wealth of faith, the wealth of virtuous behavior, the wealth of moral shame and moral dread, the wealth of learning and generosity, with wisdom, the seventh kind of wealth:
when one has these seven kinds of wealth, whether a woman or a man, they say that one is not poor, that one's life is not lived in vain.

Therefore an intelligent person, remembering the Buddhas' teaching, should be intent on faith and virtuous behawic confidence and vision of the Dhamma.

# 6 (6) Wealth in Detail 

"Bhikkhus, there are these seven kinds of wealth. What seven? The wealth of faith, the wealth of virtuous behavior, the wealth of moral shame, the wealth of moral dread, the wealth of learning, 

In [17]:
# step 3: write to markdown
import re
def is_new_paragraph_start(first_line):
    """Guess whether ``first_line`` looks like the beginning of a new paragraph.

    Only the first character is inspected:

    * an empty line (blank page) counts as a new paragraph;
    * an uppercase letter, a digit, or ``#`` (markdown heading) counts as a
      new paragraph;
    * anything else is treated as the continuation of the previous text.

    NOTE(review): this is a simple heuristic. A line that opens with a
    quotation mark, for example, is classified as a continuation — refine if
    that turns out to matter.

    Returns:
        bool: True when the line is taken to start a new paragraph.
    """
    if first_line:
        lead = first_line[0]
        return lead == '#' or lead.isdigit() or lead.isupper()
    # Nothing on the page: treat whatever follows as a fresh paragraph.
    return True

# Markdown file that the page-writing loop further down in this cell creates (overwriting any existing file).
output_file="The_Numerical_Discourses_of_the_BuddhaAnguttara_NikayaBodhi2012_p2.md"

def replace_newline_split_join_no_generator(text):
    """
    Turn every single line break inside a paragraph into a blank-line break.

    The text is cut at existing blank-line separators ("\\n\\n"), each piece
    has its remaining single newlines doubled, and the pieces are glued back
    together with the same separator. Existing paragraph breaks are therefore
    left as they are, while lone newlines become paragraph breaks.

    Args:
        text: The markdown text to transform.

    Returns:
        The transformed text.
    """
    paragraph_break = "\n\n"
    widened = [
        chunk.replace("\n", paragraph_break)
        for chunk in text.split(paragraph_break)
    ]
    return paragraph_break.join(widened)

ocr_pages = ocr_response.pages
with open(output_file, "w", encoding="utf-8") as f:
    for i, page in enumerate(ocr_pages):
        # Write the page with its single newlines expanded to blank lines.
        txt = replace_newline_split_join_no_generator(page.markdown)
        f.write(txt)

        if i + 1 == len(ocr_pages):
            break  # Nothing is appended after the final page.

        # Look at the first line of the following page to choose the joiner.
        upcoming_lines = ocr_pages[i + 1].markdown.strip().splitlines()
        next_page_first_line = upcoming_lines[0] if upcoming_lines else ""

        # New paragraph -> blank line between pages (a "\n---\n" rule would be
        # an alternative); otherwise a single newline (or nothing at all —
        # subject to experimentation).
        f.write("\n\n" if is_new_paragraph_start(next_page_first_line) else "\n")

In [None]:
# step 4. split file for chapter (part 1,2,3)

import os
import re
from slugify import slugify

def split_markdown_file(input_file, output_folder):
    """
    Splits a Markdown file into smaller files based on numbered top-level headers.

    A section starts at every line of the form ``# <number> <title>`` and runs
    until the next such line. Each section is written to
    ``<output_folder>/<NNN>-<slug>.md`` where ``NNN`` is the zero-padded number.

    The title is split at the first occurrence of the word "Sutta":

    * ``# 1 Foo Sutta Some English Title`` produces the heading
      ``# 1 Some English Title`` followed by ``***(Foo Sutta)***``; the slug
      is built from the English title.
    * When there is no text after "Sutta", or the word does not occur at all
      (a warning is printed for the latter), the whole title is used for both
      the heading and the slug, so the file never ends up with an empty
      heading or a name like ``006-.md``.

    Text that precedes the first numbered header is not written anywhere; a
    "Could not parse header" warning is printed for it. Sections that resolve
    to the same file name overwrite each other.

    Read and write problems are reported with ``print`` rather than raised.

    Args:
        input_file: Path to the input Markdown file.
        output_folder: Path to the folder where the output files will be saved.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # One pattern both detects a header line and extracts its number and title.
    header_re = re.compile(r'^#\s+(\d+)\s+(.*)$')

    # Group the lines into sections, each beginning with its header line. The
    # first group may instead hold whatever came before the first header.
    sections = []
    current_section = []
    for line in content.splitlines():
        if header_re.match(line):
            if current_section:
                sections.append(current_section)
            current_section = [line]
        else:
            current_section.append(line)
    sections.append(current_section)  # Add the last section

    for section in sections:
        if not section:  # Empty input file
            continue

        header_line = section[0]
        match = header_re.match(header_line)
        if not match:
            # Only the leading, header-less group can end up here.
            print(f"Warning: Could not parse header: {header_line}")
            continue
        number = match.group(1)
        title_text = match.group(2).strip()
        body = "\n".join(section[1:]).strip()

        # Everything up to and including the first "Sutta" is the Pali title;
        # the remainder (which may itself contain "Sutta") is the English one.
        pali_prefix, separator, remainder = title_text.partition("Sutta")
        if separator:
            pali_title = f"{pali_prefix.strip()} Sutta".strip()
            english_title = remainder.strip()
        else:
            print(f"Warning: 'Sutta' not found in header: {header_line}")
            pali_title = title_text  # Use the full title as a fallback
            english_title = ""

        if english_title:
            new_header = f"# {number} {english_title}\n***({pali_title})***"
            slug = slugify(english_title)
        else:
            # No separate English title: use the only title we have for both
            # the heading and the file name.
            new_header = f"# {number} {pali_title}"
            slug = slugify(pali_title)

        # Create file name; drop the slug segment if slugify returned nothing.
        file_number = f"{int(number):03}"
        file_name = f"{file_number}-{slug}.md" if slug else f"{file_number}.md"
        file_path = os.path.join(output_folder, file_name)

        # Write the content to the new file
        try:
            with open(file_path, 'w', encoding='utf-8') as outfile:
                outfile.write(new_header + "\n\n")  # blank line between heading and body
                outfile.write(body)
            print(f"Created file: {file_path}")
        except Exception as e:
            print(f"Error writing to file '{file_path}': {e}")


# Markdown file to split. NOTE: this rebinds `filename`, which the first cell assigns to a PDF path.
filename="trungbo-eng-nanamoli-bodhi.part2.md"

# One output file per numbered header is written under "majjhima_output".
split_markdown_file(filename, "majjhima_output")

In [66]:
# step 4. split file for preface (part 0)

import os
import re
from slugify import slugify  # Install with: pip install python-slugify

def split_markdown(input_file, output_folder):
    """
    Splits a Markdown file into multiple files based on H1 headings (# Title).

    The text before the first heading is written to ``0.introduction.md``.
    Every following section is written to ``<n>.<slug>.md``, where ``n``
    counts the headings from 1 and ``slug`` is the slugified heading text
    (the slug segment is omitted when slugify returns an empty string).

    A heading is a line consisting of optional indentation, ``#``, at least
    one space or tab, and a non-empty title on the same line. A bare ``#`` on
    its own line is not a heading and stays part of the surrounding section.

    Read problems are reported with ``print`` rather than raised. The count in
    the final message is the number of heading files; it does not include
    ``0.introduction.md``.

    Args:
        input_file: The path to the input Markdown file.
        output_folder: The directory to save the output files.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # [^\S\n] means "whitespace except newline". Using plain \s here would let
    # the gap after '#' run onto the next line, so a lone '#' would swallow the
    # following line as its title. (\S.*) requires a visible title character.
    pattern = r"^[^\S\n]*#[^\S\n]+(\S.*)$"
    # With one capture group, re.split returns
    # [text_before, title_1, body_1, title_2, body_2, ...].
    parts = re.split(pattern, content, flags=re.MULTILINE)

    if len(parts) < 2:
        print("No H1 headings found.  No files created.")
        return

    file_counter = 0
    first_file_name = os.path.join(output_folder, f"{file_counter}.introduction.md")
    with open(first_file_name, "w", encoding='utf-8') as outfile:
        outfile.write(parts[0].strip())

    # Odd positions hold titles, the even position after each holds its body.
    for raw_title, raw_body in zip(parts[1::2], parts[2::2]):
        title = raw_title.strip()
        content_after_title = raw_body.strip()
        file_counter += 1
        slugified_title = slugify(title)  # Slugify the title
        if slugified_title:
            base_name = f"{file_counter}.{slugified_title}.md"
        else:
            # Title made only of characters the slugifier drops.
            base_name = f"{file_counter}.md"
        file_name = os.path.join(output_folder, base_name)
        with open(file_name, 'w', encoding='utf-8') as outfile:
            outfile.write(f"# {title}\n\n{content_after_title}")

    print(f"Successfully split '{input_file}' into {file_counter} files in '{output_folder}'.")



# Input markdown and destination folder for this run of split_markdown.
input_filename = "nm.p0.md"
output_directory = "nm-p0"
# The count printed by split_markdown covers the heading files only; 0.introduction.md is written in addition.
split_markdown(input_filename, output_directory)




Successfully split 'nm.p0.md' into 23 files in 'nm-p0'.
