In [None]:
"""
export const OCR_SYSTEM_PROMPT = `
Convert the following document to markdown.
Return only the markdown with no explanation text. Do not include delimiters like '''markdown or '''html.

RULES:
  - You must include all information on the page. Do not exclude headers, footers, charts, infographics, or subtext.
  - Return tables in an HTML format.
  - Logos should be wrapped in brackets. Ex: <logo>Coca-Cola<logo>
  - Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY<watermark>
  - Page numbers should be wrapped in brackets. Ex: <page_number>14<page_number> or <page_number>9/22<page_number>
  - Prefer using ☐ and ☑ for check boxes.
`;

export const JSON_EXTRACTION_SYSTEM_PROMPT = `
  Extract the following JSON schema from the text and images if provided.
  Return only the JSON with no explanation text.
`;

export const IMAGE_EXTRACTION_SYSTEM_PROMPT = `
  Extract the following JSON schema from the image.
  Return only the JSON with no explanation text.
`;
"""

from PyPDF2 import PdfReader, PdfWriter
import os

def extract_pdf_pages(input_pdf, output_pdf, start_page, end_page):
    """
    Copy an inclusive range of pages out of one PDF into a new PDF file.

    Args:
        input_pdf: Path of the PDF to read from.
        output_pdf: Path the extracted pages are written to.
        start_page: Index of the first page to copy (0-based).
        end_page: Index of the last page to copy (0-based, inclusive).

    Raises:
        ValueError: If the range starts below 0, is reversed, or runs past
            the last page of the input PDF.
    """
    source = PdfReader(input_pdf)
    target = PdfWriter()

    # Reject an unusable range before any page is copied.
    page_count = len(source.pages)
    if not (0 <= start_page <= end_page < page_count):
        raise ValueError(f"Invalid page range. PDF has {page_count} pages.")

    # range() excludes its stop value, hence +1 to keep end_page inclusive.
    for index in range(start_page, end_page + 1):
        target.add_page(source.pages[index])

    with open(output_pdf, "wb") as sink:
        target.write(sink)

    # The message reports the range with 1-based page numbers.
    print(f"Created PDF with pages {start_page+1} to {end_page+1} at {output_pdf}")

# Step 1: cut one part out of the source PDF.
# NOTE(review): `filename` is not used by the call below, which reads
# '...bodhi2.pdf' rather than '...bodhi2.out1.pdf' - confirm which is intended.
filename = '../../docsource/trungbo-eng-nanamoli-bodhi2.out1.pdf'
# 1, 4, 58
# 2, 59, 73
# 3, 435, 814

# Page range of every part as (start_page, end_page); both are 0-based and
# end_page is inclusive, matching extract_pdf_pages().
PART_PAGE_RANGES = {
    0: (11, 73),
    1: (77, 430),
    2: (435, 814),
    3: (819, 1143),
    4: (1147, 1366),
}

# The single part extracted by this run; change it to extract another part.
PART = 3

args = {
    "output_pdf": f'../../docsource/trungbo-eng-nanamoli-bodhi.part{PART}.pdf',
    "start_page": PART_PAGE_RANGES[PART][0],
    "end_page": PART_PAGE_RANGES[PART][1],
}

# Download links recorded next to the parts.
# p0="https://drive.usercontent.google.com/u/0/uc?id=1dOGYtQABuCOlVdY8WmLlZJShG1jCYKkm&export=download"
p1 = "https://drive.usercontent.google.com/u/0/uc?id=1ak_jgAYW7S1iD-fgqB63-AYwngVcuFrO&export=download"
# NOTE(review): the next link was listed beside part 2 but is named p1 - presumably it is p2; confirm.
# p1 = "https://drive.usercontent.google.com/u/0/uc?id=1OQyCzluVkULUSNFrhMcEm1dSrc_TFbt0&export=download"
p3="https://drive.usercontent.google.com/u/0/uc?id=1MgGtGPfM86NewlhT2P94PMQwbvU18-UZ&export=download"
# p4 = "https://drive.usercontent.google.com/u/0/uc?id=1B7MzZsamRaO31v2hWjpJTEKdrMWeM1Z5&export=download"

extract_pdf_pages('../../docsource/trungbo-eng-nanamoli-bodhi2.pdf', **args)




In [2]:
# step 2 : mistral for ocr
from mistralai import Mistral
import os
from dotenv import load_dotenv, find_dotenv

# Load variables from the nearest .env file; override=True lets values from
# the file replace variables already present in the environment.
load_dotenv(find_dotenv(), override=True)

api_key = os.environ.get("MISTRALAI_KEY")
if not api_key:
    # Stop here with a clear message instead of building a client without a
    # key and failing later inside the OCR request.
    raise RuntimeError(
        "MISTRALAI_KEY is not set; define it in the environment or in a .env file."
    )
client = Mistral(api_key=api_key)

# Run OCR on the document at the given URL; the per-page results are read
# from ocr_response.pages by the following cells.
ocr_response = client.ocr.process(
    model="mistral-ocr-latest",
    document={
        "type": "document_url",
        "document_url": "https://drive.usercontent.google.com/u/0/uc?id=1MgGtGPfM86NewlhT2P94PMQwbvU18-UZ&export=download"
    },
)

In [None]:
# Sanity check: print the Markdown text of the first page in the OCR response.
print(ocr_response.pages[0].markdown)


In [3]:
# step 3: write to markdown
import re
def is_new_paragraph_start(first_line):
    """
    Guess whether the first line of a page opens a new paragraph.

    Args:
        first_line: The first line of the page; empty for a blank page.

    Returns:
        True for an empty line, or when the first character is '#', an
        uppercase letter or a digit; False otherwise. This is a simple
        heuristic and may need tuning.
    """
    if not first_line:
        # A blank page is treated as a paragraph boundary.
        return True
    lead = first_line[0]
    if lead == '#':
        return True
    return lead.isupper() or lead.isdigit()



def replace_newline_split_join_no_generator(text):
    """
    Turn the single line breaks inside each chunk of text into blank lines.

    The text is cut wherever two newlines occur in a row, every newline left
    inside a chunk is doubled, and the chunks are joined again with two
    newlines between them.

    Args:
        text: The string to convert.

    Returns:
        The converted string.
    """
    paragraph_break = "\n\n"
    widened = [chunk.replace("\n", paragraph_break) for chunk in text.split(paragraph_break)]
    return paragraph_break.join(widened)

# Write every OCR page to one Markdown file, choosing the separator between
# two pages from the first line of the page that follows.
output_file = "trungbo-eng-nanamoli-bodhi.part3.md"
with open(output_file, "w", encoding="utf-8") as f:
    last_index = len(ocr_response.pages) - 1
    for i, page in enumerate(ocr_response.pages):
        txt = replace_newline_split_join_no_generator(page.markdown)
        f.write(txt)
        if i >= last_index:
            # Nothing follows the final page, so no separator is written.
            continue
        # First line of the next page after trimming surrounding whitespace
        # ("" when that page has no text).
        upcoming_lines = ocr_response.pages[i + 1].markdown.strip().splitlines()
        next_page_first_line = upcoming_lines[0] if upcoming_lines else ""
        # A blank line when the next page looks like a new paragraph,
        # otherwise a single newline.
        separator = "\n\n" if is_new_paragraph_start(next_page_first_line) else "\n"
        f.write(separator)

In [5]:
# step 4. split file for chapter (part 1,2,3)

import os
import re
from slugify import slugify

def split_markdown_file(input_file, output_folder):
    """
    Splits a Markdown file into one file per numbered top-level header.

    A section starts at a line of the form "# <number> <title>". The title
    is cut at the first "Sutta": the text up to and including it is the Pali
    title, the rest is the English title. Each section is written to
    "<number, zero-padded to 3 digits>-<slug>.md" with the header rewritten
    as "# <number> <English title>" followed by "***(<Pali title>)***".

    When the title has no English part (no "Sutta" in it, or nothing after
    it), the available title is used for both the header and the slug, so
    the file never ends up named "NNN-.md".

    Lines before the first numbered header are not written anywhere; a
    warning showing the first of those lines is printed instead. Two
    sections that produce the same file name overwrite each other.

    Args:
        input_file: Path to the input Markdown file.
        output_folder: Path to the folder where the output files will be
            saved; created if it does not exist.

    Returns:
        None. Read and write errors are reported with print().
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"Error reading file: {e}")
        return

    # One pattern both detects a section header and extracts its parts.
    header_re = re.compile(r'^#\s+(\d+)\s+(.*)$')

    # Group the lines into sections, each starting at a header line. Lines
    # before the first header form a leading section without a header.
    sections = []
    current_section = []
    for line in content.splitlines():
        if header_re.match(line):
            if current_section:
                sections.append(current_section)
            current_section = [line]
        else:
            current_section.append(line)
    sections.append(current_section)  # Add the last section

    for section in sections:
        if not section:  # Empty input leaves one empty section
            continue

        header_line = section[0]
        body = "\n".join(section[1:]).strip()

        match = header_re.match(header_line)
        if not match:
            # Only the text before the first header can get here.
            print(f"Warning: Could not parse header: {header_line}")
            continue
        number = match.group(1).strip()
        title_text = match.group(2).strip()

        # Cut at the first "Sutta"; any later "Sutta" stays in the English title.
        pali_prefix, marker, remainder = title_text.partition("Sutta")
        if marker:
            pali_title = f"{pali_prefix.strip()} Sutta"
            english_title = remainder.strip()
        else:
            print(f"Warning: 'Sutta' not found in header: {header_line}")
            pali_title = title_text  # Use the full title as a fallback
            english_title = ""

        if english_title:
            new_header = f"# {number} {english_title}\n***({pali_title})***"
            slug = slugify(english_title)
        else:
            # No English title: fall back to the only title available.
            new_header = f"# {number} {pali_title}"
            slug = slugify(pali_title)

        file_number = f"{int(number):03}"
        # Drop the separator when there is nothing to put after it.
        file_name = f"{file_number}-{slug}.md" if slug else f"{file_number}.md"
        file_path = os.path.join(output_folder, file_name)

        try:
            with open(file_path, 'w', encoding='utf-8') as outfile:
                outfile.write(new_header + "\n\n")  # blank line between header and body
                outfile.write(body)
            print(f"Created file: {file_path}")
        except Exception as e:
            print(f"Error writing to file '{file_path}': {e}")


# Split the part-3 Markdown file (same file name as the one written in
# step 3) into per-section files under the "nm-p3" folder.
filename="trungbo-eng-nanamoli-bodhi.part3.md"

split_markdown_file(filename, "nm-p3")

Created file: nm-p3/101-at-devadaha.md
Created file: nm-p3/102-the-five-and-three.md
Created file: nm-p3/103-what-do-you-think-about-me.md
Created file: nm-p3/104-at-samagama.md
Created file: nm-p3/105-to-sunakkhatta.md
Created file: nm-p3/106-the-way-to-the-imperturbable.md
Created file: nm-p3/107-to-ganaka-moggallana.md
Created file: nm-p3/108-with-gopaka-moggallana.md
Created file: nm-p3/109-the-greater-discourse-on-the-full-moon-night.md
Created file: nm-p3/110-the-shorter-discourse-on-the-full-moon-night.md
Created file: nm-p3/111-one-by-one-as-they-occurred.md
Created file: nm-p3/112-the-sixfold-purity.md
Created file: nm-p3/113-the-true-man.md
Created file: nm-p3/114-to-be-cultivated-and-not-to-be-cultivated.md
Created file: nm-p3/115-the-many-kinds-of-elements.md
Created file: nm-p3/116-isigili-the-gullet-of-the-seers.md
Created file: nm-p3/117-the-great-forty.md
Created file: nm-p3/118-mindfulness-of-breathing.md
Created file: nm-p3/119-mindfulness-of-the-body.md
Created file:

In [None]:
# step 4. split file for preface (part 0)

import os
import re
from slugify import slugify  # Install with: pip install python-slugify

def split_markdown(input_file, output_folder):
    """
    Break a Markdown document into one file per H1 heading ("# Title").

    The text before the first heading is written to "0.introduction.md".
    Each heading and the text under it go to "<n>.<slugified title>.md",
    where n counts up from 1 in document order.

    Args:
        input_file: Path of the Markdown file to split.
        output_folder: Directory that receives the pieces; created when it
            does not exist.

    Returns:
        None. A read error or a document without H1 headings is reported
        with print() and ends the call early. The count in the final
        message does not include the introduction file.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    try:
        with open(input_file, 'r', encoding='utf-8') as source:
            text = source.read()
    except FileNotFoundError:
        print(f"Error: Input file '{input_file}' not found.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # With one capture group, re.split returns
    # [text before first heading, title 1, text 1, title 2, text 2, ...].
    pieces = re.split(r"^\s*#\s+(.+)\s*$", text, flags=re.MULTILINE)
    if len(pieces) < 2:
        print("No H1 headings found.  No files created.")
        return

    file_counter = 0
    intro_path = os.path.join(output_folder, f"{file_counter}.introduction.md")
    with open(intro_path, "w", encoding='utf-8') as outfile:
        outfile.write(pieces[0].strip())

    # Odd positions hold the titles, the even positions after them the bodies.
    for raw_title, raw_body in zip(pieces[1::2], pieces[2::2]):
        file_counter += 1
        title = raw_title.strip()
        section_path = os.path.join(output_folder, f"{file_counter}.{slugify(title)}.md")
        with open(section_path, 'w', encoding='utf-8') as outfile:
            outfile.write(f"# {title}\n\n{raw_body.strip()}")

    print(f"Successfully split '{input_file}' into {file_counter} files in '{output_folder}'.")



# Split "nm.p4.md" into one file per H1 heading under the "nm-p4" folder.
# NOTE(review): the cell comment says "part 0", but these paths point at p4 - confirm.
input_filename = "nm.p4.md"
output_directory = "nm-p4"
split_markdown(input_filename, output_directory)




In [6]:
import re
import os
import argparse


def split_note(filename, output_folder):
    """
    Splits a Markdown file by SUTTA titles (e.g., "SUTTA 15") and saves each
    Sutta into a separate file.

    A title is a line consisting only of "SUTTA <number>" (case-insensitive).
    Each segment runs from its title up to the next title (or the end of the
    file) and is written unchanged to "<number, zero-padded to 3 digits>.md".
    Two segments with the same number overwrite each other.

    Text before the first title is saved only if the file itself begins with
    "SUTTA <number>" followed by more text on that line; otherwise a warning
    is printed and it is skipped. Leading text that is empty or only
    whitespace is skipped without a warning.

    Args:
        filename: The path to the input Markdown file.
        output_folder: The path to the directory where the split files
            will be saved; created if it does not exist.

    Returns:
        None. Read and write errors are reported with print().
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
    except FileNotFoundError:
        print(f"Error: File '{filename}' not found.")
        return
    except Exception as e:
        print(f"Error reading file '{filename}': {e}")
        return

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_folder, exist_ok=True)

    # \s+ tolerates several spaces between "SUTTA" and the number.
    sutta_pattern = re.compile(r"^SUTTA\s+(\d+)$", re.MULTILINE | re.IGNORECASE)
    headers = list(sutta_pattern.finditer(content))

    # (sutta number, segment text) pairs in document order.
    segments = []

    # Text in front of the first title line, or the whole file if there is none.
    leading_end = headers[0].start() if headers else len(content)
    leading_text = content[:leading_end]
    if leading_text.strip():
        # Looser check, anchored at the very start of the file and allowing
        # more text after the number on the same line.
        match_first = re.search(r"^SUTTA\s+(\d+)", leading_text, re.IGNORECASE)
        if match_first:
            segments.append((int(match_first.group(1)), leading_text))
        else:
            print("Warning: Could not determine number for first segment. skipping it")

    # Each title match already carries its number, so the segment does not
    # have to be searched again.
    for index, header in enumerate(headers):
        if index + 1 < len(headers):
            end = headers[index + 1].start()
        else:
            end = len(content)
        segments.append((int(header.group(1)), content[header.start():end]))

    for sutta_number, sutta_text in segments:
        output_filename = os.path.join(output_folder, f"{sutta_number:03}.md")
        try:
            with open(output_filename, 'w', encoding='utf-8') as outfile:
                outfile.write(sutta_text)
            print(f"Saved Sutta {sutta_number} to {output_filename}")
        except Exception as e:
            print(f"Error writing to file '{output_filename}': {e}")



# Split the notes file into one file per "SUTTA <number>" title under
# ./nm-p4/notes.
# NOTE(review): '3.notes.md' looks like a file produced by split_markdown()
# in the "nm-p4" folder - confirm that cell has been run first.
filename = './nm-p4/3.notes.md'
output_folder = './nm-p4/notes'
split_note(filename, output_folder)



Saved Sutta 1 to ./nm-p4/notes/001.md
Saved Sutta 2 to ./nm-p4/notes/002.md
Saved Sutta 3 to ./nm-p4/notes/003.md
Saved Sutta 4 to ./nm-p4/notes/004.md
Saved Sutta 5 to ./nm-p4/notes/005.md
Saved Sutta 6 to ./nm-p4/notes/006.md
Saved Sutta 7 to ./nm-p4/notes/007.md
Saved Sutta 8 to ./nm-p4/notes/008.md
Saved Sutta 9 to ./nm-p4/notes/009.md
Saved Sutta 10 to ./nm-p4/notes/010.md
Saved Sutta 11 to ./nm-p4/notes/011.md
Saved Sutta 12 to ./nm-p4/notes/012.md
Saved Sutta 13 to ./nm-p4/notes/013.md
Saved Sutta 14 to ./nm-p4/notes/014.md
Saved Sutta 15 to ./nm-p4/notes/015.md
Saved Sutta 16 to ./nm-p4/notes/016.md
Saved Sutta 17 to ./nm-p4/notes/017.md
Saved Sutta 18 to ./nm-p4/notes/018.md
Saved Sutta 19 to ./nm-p4/notes/019.md
Saved Sutta 20 to ./nm-p4/notes/020.md
Saved Sutta 21 to ./nm-p4/notes/021.md
Saved Sutta 22 to ./nm-p4/notes/022.md
Saved Sutta 23 to ./nm-p4/notes/023.md
Saved Sutta 24 to ./nm-p4/notes/024.md
Saved Sutta 25 to ./nm-p4/notes/025.md
Saved Sutta 26 to ./nm-p4/notes/02