In [1]:

%pip install -qU pymupdf

Note: you may need to restart the kernel to use updated packages.


In [None]:
books_info = {
    "exclude_sentences": [
        'BASES PARA SUA CONDUTA',
    ],
    "001.pdf": {
        "title": "Bases para Sua Conduta",
        "table_name": "ptbr_bases_para_sua_conduta",
        "edition": 22,
        "exclude_pages": [1, 3, 4, 5, 6, 7, 9, 10, 58, 60]
    },
    "002.pdf": {
        "title": "A Herança de Si Mesmo",
        "table_name": "ptbr_heranca_si_mesmo",
        "edition": 22,
        "exclude_pages": [1, 3, 4, 5, 6, 8, 10, 29, 30, 31, 32, 33, 34, 35, 36]
    },
    "009.pdf": {
        "title": "Logosofia, Ciência e Método",
        "table_name": "ptbr_logosofia_ciencia_metodo",
        "edition": 12,
        "exclude_pages": [1, 3, 4, 5, 6, 9, 10, 11, 12, 148, 149, 150, 151, 152]
    }
}

# First step of text processing.

In [3]:
import pymupdf

def is_endline_next(text: str) -> bool:
    """Returns if the first character found, excluding empty spaces, is the `\\n` token.

    Args:
        text (str): The text to search.

    Returns:
        bool: True if the end line is found first, False otherwise.
    """

    for ch in text:
        if ch == ' ':
            continue

        if ch == '\n':
            return True
        else:
            return False
        
    return False

def is_ellipsis(text: str) -> bool:
    """Given a text with lenght 3, if checks if it is a ellipsis `...`.

    Args:
        text (str): Text, with a dit in the first position.

    Returns:
        bool: Returns True if there is a ellipsis. False otherwise.
    """    

    return text == "..."

def process_text(text: str, end_line_tokens: list[str] = ['.', '!', '?',]) -> list[str]:
    """Do the first step of text processing for a given page. This needs to happens before 
    all other processing. It tries to keep paragraphs organized and
    handle some edge cases. It does not remove end of line characters like '\n'.

    Args:
        text (str): The text to process.
        end_line_tokens (list[str], optional): The list of tokens that indicate the end of a line. Defaults to ['.', '!', '?']

    Returns:
        list[str]: The processed list of paragraphs.
    """    

    length = len(text)
    buffer = ''
    paragraphs = []
    ignore_next_new_line = False
    skip_next_char = False
    i = 0

    for ch in text:
        # If we reached the end of the text, append the last buffer and return
        if i == length - 1:
            paragraphs.append(buffer)
            return paragraphs
        
        if skip_next_char:
            skip_next_char = False
            i += 1
            continue

        # Accounts for that huge graphical letter that starts a paragraph.
        # This processing needs to happen before all the others.
        if ch == '\n' and ignore_next_new_line:
            ignore_next_new_line = False
            i += 1
            continue

        buffer += ch

        # That must be the huge letter, that starts a new paragraph
        if len(buffer) == 1 and buffer.isupper() and is_endline_next(text[i + 1:]):
            ignore_next_new_line = True

        # Accounts for break of line with '-'
        if ch == '-' and is_endline_next(text[i + 1:]):
            skip_next_char = True
            buffer = buffer[:-1]
            i += 1
            continue

        # We're not stopping until we see a real end of line terminator.
        if ch in end_line_tokens and is_endline_next(text[i + 1:]):
            paragraphs.append(buffer)
            buffer = ''
            i += 1
            continue

        i += 1

def get_text_and_page(pdf_path: str) -> dict:
    doc = pymupdf.open(pdf_path)

    book = {}
    index = 0
    page_count = 1

    for page in doc:
        text = page.get_text()

        if text:
            current_lines = process_text(text)
            book[page_count] = current_lines

        page_count += 1
        index += 1

    return book

# Second step of text processing.

In [4]:
import re

def delete_short_paragraphs(book: dict, split_by: str = '\n') -> None:
    """Deletes paragraphs shorter than a specified length. It removes the
    `split_by` characters from the text.

    Args:
        book (dict): The book.
        split_by (str, optional): The delimiter used to split paragraphs. Defaults to '\n'.

    Returns:
        str: The modified text with short paragraphs removed.
    """

    empty_pages = []

    for page_index, paragraphs in book.items():
        filtered = []

        for paragraph in paragraphs:
            if split_by not in paragraph and paragraph.strip() != '':
                filtered.append(paragraph)
                continue

            text_splitted = paragraph.split(split_by)
            parts = []
            for part in text_splitted:
                if not part.strip().isdigit():
                    parts.extend(part)

            joined_parts = ''.join(parts)
            if joined_parts.strip() != '':
                filtered.extend([joined_parts])

        if filtered:
            book[page_index] = filtered
        else:
            empty_pages.append(page_index)

    # Remove all keys in the book for pages that are empty
    for page_index in empty_pages:
        del book[page_index]

def find_next_page(book: dict, current_page: int) -> int | None:
        """Finds the next page in the book.

        Args:
            book (dict): The book dictionary.
            current_page (int): The current page number.

        Returns:
            int | None: The next page number or None if not found.
        """
        pages_indexes = list(book.keys())
        for i, page_index in enumerate(pages_indexes):
            if page_index == current_page:
                # Return the next page index if it exists
                if i + 1 < len(pages_indexes):
                    return pages_indexes[i + 1]
                break
        return None
    
def concatenate_paragraphs(book: dict, end_line_tokens: list[str] = ['.', '!', '?']) -> None:
    """If the last page does not finish it's paragraph with ['.', '!', '?'], then it concatenates
    the last paragraph of the last page with the first paragraph of the current page. Both lists can
    be modified in place. Returns True if so.

    Args:
        book (dict): The book dictionary.
        end_line_tokens (list[str]): The list of tokens that defines an end of line.
    """    

    for current_page in book.keys():
        try:
            last_paragraph = book[current_page][-1].rstrip()
        except:
            last_paragraph = None

        if not last_paragraph or last_paragraph == '':
            continue

        if not last_paragraph.endswith(tuple(end_line_tokens)):
            next_page = find_next_page(book, current_page)
            if not next_page:
                break
            
            new_current = last_paragraph + ' ' + book[next_page][0]
            new_next = book[next_page][1:]

            book[current_page][-1] = new_current
            book[next_page] = new_next

def eliminate_pages(book: dict, exclude_pages: list[int]) -> None:
    """Eliminates pages from the book dictionary based on the exclude_pages list.

    Args:
        book (dict): The dictionary containing page numbers as keys and lists of paragraphs as values.
        exclude_pages (list[int]): The list of page numbers to be excluded.
    """    

    for page in exclude_pages:
        if page in book:
            del book[page]

def remove_excluded_sentences(book: dict, excluded_sentences: list[str]) -> None:
    """Removes a paragraph if it is a prohibited word. It's case sensitive.

    Args:
        book (dict): The book.
        excluded_sentences (list[str]): The prohibited words.
    """    

    for page_count, paragraphs in book.items():
        out = []
        for paragraph in paragraphs:
            if paragraph.strip() not in excluded_sentences:
                out.append(paragraph)
                
        book[page_count] = out

def post_clean_up(book: dict) -> None:
    """Removes all end of line breaks and extra whitespaces.

    Args:
        book (dict): The book.
    """

    for page_index, paragraphs in book.items():
        # If paragraphs is a string, convert to list for uniform processing
        if isinstance(paragraphs, str):
            paragraphs = [paragraphs]
        cleaned_paragraphs = []
        for paragraph in paragraphs:
            # Remove end of line breaks and extra whitespaces
            cleaned = re.sub(r'\s+', ' ', paragraph.replace('\n', ' ')).strip()
            if cleaned:
                cleaned_paragraphs.append(cleaned)
        book[page_index] = cleaned_paragraphs


In [14]:
import json

book = get_text_and_page("books/002.pdf")
eliminate_pages(book, books_info['002.pdf']['exclude_pages'])
delete_short_paragraphs(book)
remove_excluded_sentences(book, books_info['exclude_sentences'])
concatenate_paragraphs(book)
post_clean_up(book)

with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(book, f, ensure_ascii=False, indent=2)


# Extracts the cover of .PDF files.

In [2]:
#!/usr/bin/env python3
"""
PDF Cover Image Extractor

This script extracts cover images from all PDF books in the 'books' folder.
It saves the first page of each PDF as a PNG image in a 'covers' folder.

Requirements:
    pip install PyMuPDF Pillow

Usage:
    python extract_covers.py
"""

import os
import fitz  # PyMuPDF
from PIL import Image
import io
from pathlib import Path


def ensure_covers_folder():
    """Create the covers folder if it doesn't exist."""
    covers_path = Path("covers")
    covers_path.mkdir(exist_ok=True)
    return covers_path


def extract_cover_image(pdf_path, output_path):
    """
    Extract the first page of a PDF as a cover image.
    
    Args:
        pdf_path (str): Path to the PDF file
        output_path (str): Path where the cover image will be saved
    
    Returns:
        bool: True if successful, False otherwise
    """
    try:
        # Open the PDF
        pdf_document = fitz.open(pdf_path)
        
        # Get the first page
        first_page = pdf_document[0]
        
        # Convert page to image (pixmap)
        # Higher matrix values = higher resolution
        mat = fitz.Matrix(2.0, 2.0)  # 2x zoom for better quality
        pix = first_page.get_pixmap(matrix=mat)
        
        # Convert pixmap to PIL Image
        img_data = pix.tobytes("png")
        img = Image.open(io.BytesIO(img_data))
        
        # Save as PNG
        img.save(output_path, "PNG", optimize=True)
        
        # Close the PDF
        pdf_document.close()
        
        print(f"✓ Extracted cover for: {Path(pdf_path).name}")
        return True
        
    except Exception as e:
        print(f"✗ Failed to extract cover for {Path(pdf_path).name}: {str(e)}")
        return False


def get_pdf_files(books_folder):
    """
    Get all PDF files from the books folder.
    
    Args:
        books_folder (str): Path to the books folder
        
    Returns:
        list: List of PDF file paths
    """
    books_path = Path(books_folder)
    
    if not books_path.exists():
        print(f"Error: Books folder '{books_folder}' does not exist!")
        return []
    
    pdf_files = list(books_path.glob("*.pdf"))
    pdf_files.extend(list(books_path.glob("*.PDF")))  # Case insensitive
    
    return pdf_files


def sanitize_filename(filename):
    """
    Sanitize filename by removing or replacing problematic characters.
    
    Args:
        filename (str): Original filename
        
    Returns:
        str: Sanitized filename
    """
    # Remove extension and replace problematic characters
    name = Path(filename).stem
    sanitized = "".join(c if c.isalnum() or c in (' ', '-', '_') else '_' for c in name)
    return sanitized.strip()


def main():
    """Main function to process all PDF files in the books folder."""
    books_folder = "books"
    
    print("PDF Cover Image Extractor")
    print("=" * 40)
    
    # Ensure covers folder exists
    covers_path = ensure_covers_folder()
    print(f"Cover images will be saved to: {covers_path.absolute()}")
    
    # Get all PDF files
    pdf_files = get_pdf_files(books_folder)
    
    if not pdf_files:
        print(f"No PDF files found in '{books_folder}' folder.")
        return
    
    print(f"Found {len(pdf_files)} PDF file(s) to process.")
    print("-" * 40)
    
    successful = 0
    failed = 0
    
    # Process each PDF file
    for pdf_file in pdf_files:
        # Create output filename
        sanitized_name = sanitize_filename(pdf_file.name)
        cover_filename = f"{sanitized_name}_cover.png"
        cover_path = covers_path / cover_filename
        
        # Extract cover image
        if extract_cover_image(str(pdf_file), str(cover_path)):
            successful += 1
        else:
            failed += 1
    
    # Summary
    print("-" * 40)
    print(f"Processing complete!")
    print(f"✓ Successful: {successful}")
    print(f"✗ Failed: {failed}")
    print(f"📁 Cover images saved in: {covers_path.absolute()}")


if __name__ == "__main__":
    main()

PDF Cover Image Extractor
Cover images will be saved to: c:\Users\leandro\Projects\logosophy\covers
Found 34 PDF file(s) to process.
----------------------------------------
✓ Extracted cover for: 001.pdf
✓ Extracted cover for: 002.pdf
✓ Extracted cover for: 003.pdf
✓ Extracted cover for: 004.pdf
✓ Extracted cover for: 005.pdf
✓ Extracted cover for: 006.pdf
✓ Extracted cover for: 007.pdf
✓ Extracted cover for: 008.pdf
✓ Extracted cover for: 009.pdf
✓ Extracted cover for: 010.pdf
✓ Extracted cover for: 011.pdf
✓ Extracted cover for: 012.pdf
✓ Extracted cover for: 013.pdf
✓ Extracted cover for: 014.pdf
✓ Extracted cover for: 015.pdf
✓ Extracted cover for: 016.pdf
✓ Extracted cover for: 017.pdf
✓ Extracted cover for: 001.pdf
✓ Extracted cover for: 002.pdf
✓ Extracted cover for: 003.pdf
✓ Extracted cover for: 004.pdf
✓ Extracted cover for: 005.pdf
✓ Extracted cover for: 006.pdf
✓ Extracted cover for: 007.pdf
✓ Extracted cover for: 008.pdf
✓ Extracted cover for: 009.pdf
✓ Extracted cover fo