# Convert EIS Documents to Markdown using Marker

This notebook converts PDF documents to Markdown using `marker`, which provides high-quality extraction with better structure preservation than plain text extraction.

**Key Features:**
- Mirrors directory structure from `documents/` to `marker_conversions/`
- Split/apply/combine strategy for large PDFs
- Excludes images/maps/figures from output
- Each PDF is converted in its own worker process (one file at a time), with hard timeouts that kill hung conversions
- Tracks conversion progress to allow resuming

**Output format:** Markdown files (`.md`) with preserved document structure.

In [2]:
# Install required packages if needed
# !pip install marker-pdf pandas pyarrow tqdm pymupdf

In [3]:
import os
import re
import tempfile
import shutil
import pandas as pd
import fitz  # pymupdf - for splitting PDFs
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime
import multiprocessing as mp
import subprocess

# Setup logging
# INFO-level logging with timestamps; `logger` is the module-level logger
# used by run_conversions() to report failed and crashed conversions.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Force fork context for multiprocessing (required for Jupyter on macOS)
# convert_with_timeout() creates its Queue and worker Process from this context.
# NOTE(review): presumably 'fork' is needed so that worker functions defined in
# notebook cells are inherited by the child instead of being re-imported - confirm.
_mp_ctx = mp.get_context('fork')

In [4]:
# Configuration
# Every location is derived from the repository root, which is the parent of
# the directory this notebook runs in.
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")
DOCUMENTS_DIR = REPO_ROOT.joinpath("documents")  # Source PDFs
OUTPUT_DIR = REPO_ROOT.joinpath("marker_conversions")  # Output markdown files

# Input metadata file
DOC_RECORD_FILE = METADATA_DIR.joinpath("eis_document_record_api.parquet")

# Failed files tracking
FAILED_FILES_LOG = METADATA_DIR.joinpath("marker_conversion_failures.csv")

# Echo the resolved locations so a wrong working directory is obvious early.
for _label, _location in (
    ("Repository root", REPO_ROOT),
    ("Documents directory", DOCUMENTS_DIR),
    ("Output directory", OUTPUT_DIR),
    ("Failed files log", FAILED_FILES_LOG),
):
    print(f"{_label}: {_location}")

Repository root: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository
Documents directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents
Output directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/marker_conversions
Failed files log: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata/marker_conversion_failures.csv


## Helper Functions

In [6]:
def sanitize_filename(filename: str) -> str:
    """
    Normalise a document name to the on-disk naming convention.

    The substitutions run in this order:
    1. Drop the characters ( ) & , ~ and /
    2. Collapse every run of whitespace and/or underscores into one underscore
    3. Lower-case a trailing ".pdf" extension, whatever its original case
    4. Collapse a doubled trailing ".pdf.pdf" into a single ".pdf"

    Leading and trailing underscores are stripped from the result.
    """
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
        (r'\.pdf\.pdf$', '.pdf', re.IGNORECASE),
    )
    result = filename
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip('_')


def build_local_filename(ceq_number, original_filename: str) -> str:
    """
    Compose the local filename used for a downloaded document.

    The result is the CEQ number and the sanitized original filename joined
    by an underscore: {CEQ_NUMBER}_{sanitized_filename}
    """
    return "_".join((str(ceq_number), sanitize_filename(original_filename)))


def get_year_from_ceq(ceq_number) -> str:
    """Return the first four characters of the CEQ number's string form (the year)."""
    ceq_text = str(ceq_number)
    year_part = ceq_text[0:4]
    return year_part

In [7]:
def split_pdf(pdf_path: Path, output_dir: Path, pages_per_chunk: int = 50) -> list:
    """
    Split a PDF into smaller chunks for processing.

    Each chunk is written to ``output_dir`` as
    ``chunk_{start:05d}_{end:05d}.pdf``, where ``start`` is the 0-based index
    of the chunk's first page and ``end`` is one past its last page.

    Args:
        pdf_path: Path to source PDF
        output_dir: Existing directory to write chunk files into
        pages_per_chunk: Number of pages per chunk (must be at least 1)

    Returns:
        List of paths to chunk PDFs, in page order

    Raises:
        ValueError: If ``pages_per_chunk`` is less than 1.
    """
    # Validate up front: a negative step would otherwise yield an empty range
    # and silently return no chunks at all.
    if pages_per_chunk < 1:
        raise ValueError(f"pages_per_chunk must be >= 1, got {pages_per_chunk}")

    chunk_paths = []

    # Context managers close both documents even if insert_pdf() or save()
    # raises part-way through a large file.
    with fitz.open(pdf_path) as doc:
        num_pages = len(doc)

        for start_page in range(0, num_pages, pages_per_chunk):
            end_page = min(start_page + pages_per_chunk, num_pages)
            chunk_path = output_dir / f"chunk_{start_page:05d}_{end_page:05d}.pdf"

            with fitz.open() as chunk_doc:  # New empty PDF
                # to_page is inclusive, hence end_page - 1
                chunk_doc.insert_pdf(doc, from_page=start_page, to_page=end_page - 1)
                chunk_doc.save(str(chunk_path))

            chunk_paths.append(chunk_path)

    return chunk_paths


def get_pdf_page_count(pdf_path: Path) -> int:
    """
    Get the number of pages in a PDF.

    Returns 0 when the file cannot be opened or read, so callers can treat
    "empty" and "unreadable" the same way.
    """
    try:
        # The context manager closes the document even if len() fails.
        with fitz.open(pdf_path) as doc:
            return len(doc)
    except Exception:
        # Deliberately best-effort, but no longer a bare except:
        # KeyboardInterrupt and SystemExit must propagate so a long batch
        # run can still be interrupted.
        return 0

In [8]:
def run_marker_on_pdf(pdf_path: Path, output_dir: Path, timeout_seconds: int = 300) -> tuple:
    """
    Convert one PDF by invoking the ``marker_single`` command-line tool.

    Args:
        pdf_path: PDF to convert
        output_dir: Directory passed to marker as its output location
        timeout_seconds: Seconds to wait before giving up on marker

    Returns:
        Tuple of (success, markdown_content, error). On success the error
        element is None; on any failure the content is "" and the error is a
        short message (non-zero exit, no markdown produced, timeout, or the
        text of an unexpected exception truncated to 500 characters).
    """
    try:
        # Run marker via command line
        # --disable_image_extraction: skip images (faster, smaller output)
        # --output_format markdown: output as markdown
        command = [
            "marker_single",
            str(pdf_path),
            "--output_dir", str(output_dir),
            "--output_format", "markdown",
            "--disable_image_extraction",
        ]
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )

        if completed.returncode != 0:
            return False, "", f"Marker failed: {completed.stderr[:500]}"

        # Marker is expected to create: output_dir/pdf_stem/pdf_stem.md
        stem = pdf_path.stem
        markdown_file = output_dir / stem / f"{stem}.md"

        if not markdown_file.exists():
            # Fall back to the first .md file found anywhere under output_dir
            candidates = list(output_dir.rglob("*.md"))
            if not candidates:
                return False, "", "No markdown output found"
            markdown_file = candidates[0]

        return True, markdown_file.read_text(encoding='utf-8'), None

    except subprocess.TimeoutExpired:
        return False, "", f"Timeout after {timeout_seconds}s"
    except Exception as exc:
        return False, "", str(exc)[:500]

In [9]:
def remove_images_from_markdown(markdown_text: str) -> str:
    """
    Strip image references and figure-style caption lines from markdown.

    Four passes run in order:
    1. ``![alt](path)`` markdown image syntax is deleted
    2. ``<img ...>`` HTML tags are deleted (any letter case)
    3. Lines beginning with Figure/Map/Image/Photo/Photograph/Exhibit followed
       by a number-like token are deleted in full, together with any blank
       lines directly above them (the leading ``\\s*`` can span newlines)
    4. Runs of three or more newlines collapse to exactly two

    The result has leading and trailing whitespace removed.
    """
    md_image = re.compile(r'!\[[^\]]*\]\([^)]+\)')
    html_image = re.compile(r'<img[^>]*>', re.IGNORECASE)
    caption_line = re.compile(
        r'^\s*(Figure|Map|Image|Photo|Photograph|Exhibit)\s+[\d.\-]+[^\n]*$',
        re.MULTILINE | re.IGNORECASE,
    )
    blank_run = re.compile(r'\n{3,}')

    without_md_images = md_image.sub('', markdown_text)
    without_html_images = html_image.sub('', without_md_images)
    without_captions = caption_line.sub('', without_html_images)
    collapsed = blank_run.sub('\n\n', without_captions)

    return collapsed.strip()

In [10]:
def convert_pdf_with_marker(pdf_path: Path, output_path: Path, 
                            pages_per_chunk: int = 50,
                            chunk_timeout: int = 300) -> dict:
    """
    Convert a PDF to markdown using marker with split/apply/combine for large files.

    PDFs with at most ``pages_per_chunk`` pages are passed to marker as-is.
    Larger PDFs are split into chunk files, each chunk is converted
    separately, and the chunk outputs are joined with a ``---`` rule. Image
    references and figure captions are then stripped before writing.

    The markdown is written to a sibling ``<name>.part`` file and renamed into
    place, so ``output_path`` only ever exists with complete content. This
    matters because later runs treat an existing output file as "already
    converted", and the worker running this function can be killed on timeout.

    Args:
        pdf_path: Source PDF path
        output_path: Output markdown path (parent directories are created)
        pages_per_chunk: Pages per chunk for splitting
        chunk_timeout: Timeout per chunk in seconds

    Returns:
        Dict with keys ``converted`` (bool), ``num_pages`` (int) and ``error``
        (None on success, otherwise a message). Failures never raise.
    """
    if not pdf_path.exists():
        return {"converted": False, "num_pages": 0, "error": "Source file not found"}

    # Tracked outside the try so an unexpected failure can still report the
    # page count once it is known.
    num_pages = 0

    try:
        num_pages = get_pdf_page_count(pdf_path)
        if num_pages == 0:
            return {"converted": False, "num_pages": 0, "error": "Empty or unreadable PDF"}

        # All intermediate files live in a temp directory that is removed on exit
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Decide whether to split
            if num_pages <= pages_per_chunk:
                # Small file - process directly
                chunks = [pdf_path]
                chunk_dirs = [temp_path / "output"]
            else:
                # Large file - split into chunks
                split_dir = temp_path / "splits"
                split_dir.mkdir()
                chunks = split_pdf(pdf_path, split_dir, pages_per_chunk)
                chunk_dirs = [temp_path / f"output_{i}" for i in range(len(chunks))]

            # One private output directory per chunk, so run_marker_on_pdf's
            # "any .md file" fallback cannot pick up another chunk's output.
            for chunk_dir in chunk_dirs:
                chunk_dir.mkdir()

            # Process each chunk; the first failure aborts the whole document
            markdown_parts = []
            for i, (chunk_path, chunk_output_dir) in enumerate(zip(chunks, chunk_dirs), start=1):
                success, content, error = run_marker_on_pdf(chunk_path, chunk_output_dir, chunk_timeout)

                if not success:
                    return {
                        "converted": False, 
                        "num_pages": num_pages, 
                        "error": f"Chunk {i}/{len(chunks)} failed: {error}"
                    }

                markdown_parts.append(content)

        # Combine chunks (a single part is returned unchanged by join)
        combined_markdown = "\n\n---\n\n".join(markdown_parts)

        # Remove images/figures
        clean_markdown = remove_images_from_markdown(combined_markdown)

        # Write output atomically: write a sibling file, then rename it into place
        output_path.parent.mkdir(parents=True, exist_ok=True)
        partial_path = output_path.with_name(output_path.name + ".part")
        try:
            with open(partial_path, 'w', encoding='utf-8') as f:
                f.write(clean_markdown)
            os.replace(partial_path, output_path)
        finally:
            # No-op after a successful rename; removes the leftover otherwise
            partial_path.unlink(missing_ok=True)

        return {"converted": True, "num_pages": num_pages, "error": None}

    except Exception as e:
        return {"converted": False, "num_pages": num_pages, "error": str(e)[:500]}

## Load Document Records

In [12]:
def load_document_records():
    """
    Read the document-record parquet file produced from the API metadata.

    Raises:
        FileNotFoundError: If DOC_RECORD_FILE does not exist; the message
            names the notebook that creates it.
    """
    if not DOC_RECORD_FILE.exists():
        raise FileNotFoundError(
            f"Document records not found at {DOC_RECORD_FILE}.\n"
            f"Run fetch_eis_records_api.ipynb first."
        )
    return pd.read_parquet(DOC_RECORD_FILE)


def load_failed_files() -> set:
    """
    Return the attachment IDs recorded in the failure log, as strings.

    An empty set is returned when the log file does not exist yet.
    """
    if not FAILED_FILES_LOG.exists():
        return set()

    failure_log = pd.read_csv(FAILED_FILES_LOG)
    ids_as_text = failure_log['attachmentId'].astype(str)
    return set(ids_as_text)


def append_failed_file(attachment_id, ceq_number, source_file, error):
    """
    Append one failed conversion to the failure log (CSV).

    Rows are written with the csv module so that commas, quotes or other
    special characters in any field are escaped correctly and the file stays
    readable by load_failed_files(). The header row is written when the log
    does not exist yet or is empty.

    Args:
        attachment_id: Attachment ID of the document that failed
        ceq_number: CEQ number of the document
        source_file: Path of the source PDF
        error: Error description; double quotes become single quotes,
            newlines become spaces, and it is truncated to 200 characters
    """
    import csv

    error_clean = str(error).replace('"', "'").replace('\n', ' ')[:200]

    needs_header = (not FAILED_FILES_LOG.exists()) or FAILED_FILES_LOG.stat().st_size == 0

    # newline='' lets the csv writer control line endings; '\n' matches the
    # line endings of rows written to this log previously.
    with open(FAILED_FILES_LOG, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator='\n')
        if needs_header:
            writer.writerow(["attachmentId", "ceqNumber", "source_file", "error", "timestamp"])
        writer.writerow([
            attachment_id,
            ceq_number,
            source_file,
            error_clean,
            datetime.now().isoformat(),
        ])

In [13]:
# Load document records
doc_df = load_document_records()
print(f"Loaded {len(doc_df)} document records")


def _local_filename_for(row):
    """Pick the first non-empty name for a record and build its local filename."""
    preferred_name = row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
    return build_local_filename(row['ceqNumber'], preferred_name)


def _source_path_for(row):
    """Location of the downloaded PDF: documents/<year>/<localFilename>."""
    return DOCUMENTS_DIR / row['year'] / row['localFilename']


def _output_path_for(row):
    """Location of the markdown output: marker_conversions/<year>/<name>.md."""
    markdown_name = row['localFilename'].replace('.pdf', '.md').replace('.PDF', '.md')
    return OUTPUT_DIR / row['year'] / markdown_name


# Add helper columns (year is the first four characters of the CEQ number)
doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
doc_df['localFilename'] = doc_df.apply(_local_filename_for, axis=1)

# Build source and output paths
doc_df['sourcePath'] = doc_df.apply(_source_path_for, axis=1)
doc_df['outputPath'] = doc_df.apply(_output_path_for, axis=1)

display(doc_df[['ceqNumber', 'year', 'localFilename', 'sourcePath', 'outputPath']].head())

Loaded 45704 document records


Unnamed: 0,ceqNumber,year,localFilename,sourcePath,outputPath
0,20250186,2025,20250186_LoMo_FRR_Comprehensive_Study_Draft_Re...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
1,20250186,2025,20250186_LoMo_System_Plan_-_Basis_of_Estimate.pdf,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
2,20250186,2025,20250186_Appendix_A.1_LoMo_FRM_Past_Performanc...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
3,20250186,2025,20250186_Appendix_A.2.1_LoMo_RAS_Calibration_O...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
4,20250186,2025,20250186_Appendix_A.2.2_LoMo_RAS_Calibration_K...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...


In [14]:
# Flag every record whose source PDF is present on disk
doc_df['sourceExists'] = doc_df['sourcePath'].apply(lambda pdf: pdf.exists())
found_total = doc_df['sourceExists'].sum()
print(f"Source files found: {found_total} / {len(doc_df)}")

# Only records with a source file can be converted; copy so later column
# additions do not touch doc_df
has_source = doc_df['sourceExists']
to_convert = doc_df[has_source].copy()
print(f"Documents available for conversion: {len(to_convert)}")

# Per-year breakdown, oldest year first
print("\nBy year:")
per_year = to_convert['year'].value_counts()
print(per_year.sort_index())

Source files found: 45559 / 45704
Documents available for conversion: 45559

By year:
year
1987       2
1988       1
1990       2
1991       4
1992       3
1993       3
1994      10
1995      20
1996      59
1997      82
1998      58
1999     202
2000     369
2001     481
2002     596
2003     562
2004     631
2005     755
2006     712
2007     912
2008     911
2009     894
2010     661
2011     406
2012     991
2013    3007
2014    2918
2015    3991
2016    3651
2017    2697
2018    3239
2019    2865
2020    3280
2021    2309
2022    2206
2023    1633
2024    2498
2025    1938
Name: count, dtype: int64


## Conversion Settings

In [16]:
# ============================================
# CONVERSION SETTINGS - MODIFY AS NEEDED
# ============================================

# Filter by year (set to None to convert all years)
# The second assignment overrides the first; comment it out to process every year.
YEAR_FILTER = None
YEAR_FILTER = [2019]  # Example: specific year

# Maximum number of files to convert (set to None for all)
# As above, the second assignment wins; comment it out to lift the cap.
MAX_CONVERSIONS = None
MAX_CONVERSIONS = 10  # Start with a small test batch

# Set to True to retry previously failed files
RETRY_FAILURES = False

# Pages per chunk for splitting large PDFs
PAGES_PER_CHUNK = 10

# Timeout per chunk in seconds (marker is slower than pymupdf)
CHUNK_TIMEOUT_SECONDS = 300  # 5 minutes per chunk of PAGES_PER_CHUNK pages

# Overall timeout per PDF (for the multiprocessing wrapper)
# With the values above, a PDF whose chunks each run close to the chunk
# timeout reaches this cap after about 6 chunks (1800 / 300).
PDF_TIMEOUT_SECONDS = 1800  # 30 minutes max per PDF

print(f"Settings:")
print(f"  YEAR_FILTER: {YEAR_FILTER}")
print(f"  MAX_CONVERSIONS: {MAX_CONVERSIONS}")
print(f"  RETRY_FAILURES: {RETRY_FAILURES}")
print(f"  PAGES_PER_CHUNK: {PAGES_PER_CHUNK}")
print(f"  CHUNK_TIMEOUT_SECONDS: {CHUNK_TIMEOUT_SECONDS}")
print(f"  PDF_TIMEOUT_SECONDS: {PDF_TIMEOUT_SECONDS}")

Settings:
  YEAR_FILTER: [2019]
  MAX_CONVERSIONS: 10
  RETRY_FAILURES: False
  PAGES_PER_CHUNK: 10
  CHUNK_TIMEOUT_SECONDS: 300
  PDF_TIMEOUT_SECONDS: 1800


In [17]:
# Build conversion queue
# Every filter below is followed by .copy() so that the column assignments
# that come after it operate on an independent DataFrame rather than on a
# slice of the previous one (avoids pandas chained-assignment warnings).
conversion_queue = to_convert.copy()

# Apply year filter
if YEAR_FILTER:
    year_filter_str = [str(y) for y in YEAR_FILTER]
    conversion_queue = conversion_queue[conversion_queue['year'].isin(year_filter_str)].copy()
    print(f"Filtered to years {YEAR_FILTER}: {len(conversion_queue)} documents")

# Skip files that already have .md output
# astype(bool) keeps the column a boolean mask even when the queue is empty,
# so the ~ negation and the row selection below stay well-defined.
conversion_queue['outputExists'] = (
    conversion_queue['outputPath'].apply(lambda p: p.exists()).astype(bool)
)
existing_count = conversion_queue['outputExists'].sum()
print(f"Already converted (md exists): {existing_count}")
conversion_queue = conversion_queue[~conversion_queue['outputExists']].copy()

# Skip previously failed files (unless RETRY_FAILURES is True)
if not RETRY_FAILURES:
    failed_ids = load_failed_files()
    if failed_ids:
        # IDs in the failure log are strings, so compare as strings
        conversion_queue['previouslyFailed'] = conversion_queue['attachmentId'].astype(str).isin(failed_ids)
        failed_count = conversion_queue['previouslyFailed'].sum()
        print(f"Previously failed (skipping): {failed_count}")
        conversion_queue = conversion_queue[~conversion_queue['previouslyFailed']].copy()

# Apply max conversions limit
if MAX_CONVERSIONS and len(conversion_queue) > MAX_CONVERSIONS:
    conversion_queue = conversion_queue.head(MAX_CONVERSIONS)
    print(f"Limited to {MAX_CONVERSIONS} conversions")

print(f"\nFinal conversion queue: {len(conversion_queue)} files")

Filtered to years [2019]: 2865 documents
Already converted (md exists): 0
Previously failed (skipping): 1
Limited to 10 conversions

Final conversion queue: 10 files


## Create Output Directory Structure

In [19]:
# Make sure the output root exists before mirroring documents/ into it
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Year directories are the sub-directories of documents/ with all-digit names
year_dirs = list(
    filter(lambda entry: entry.is_dir() and entry.name.isdigit(), DOCUMENTS_DIR.iterdir())
)
print(f"Found {len(year_dirs)} year directories in documents/")

# Create a same-named directory under marker_conversions/ for each year
for year_dir in year_dirs:
    output_year_dir = OUTPUT_DIR.joinpath(year_dir.name)
    output_year_dir.mkdir(exist_ok=True)

print(f"Created directory structure in {OUTPUT_DIR}")

Found 38 year directories in documents/
Created directory structure in /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/marker_conversions


## Run Conversions

In [21]:
def _convert_in_process(args: tuple, result_queue, pages_per_chunk: int, chunk_timeout: int):
    """Worker function that runs marker conversion in a separate process."""
    pdf_path, output_path, ceq_number, attachment_id = args
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)
    
    try:
        result = convert_pdf_with_marker(
            pdf_path, output_path,
            pages_per_chunk=pages_per_chunk,
            chunk_timeout=chunk_timeout
        )
        result_queue.put(result)
    except Exception as e:
        result_queue.put({"converted": False, "num_pages": 0, "error": str(e)[:500]})


def convert_with_timeout(args: tuple, timeout_seconds: int, 
                         pages_per_chunk: int, chunk_timeout: int) -> dict:
    """
    Run conversion in a subprocess that can be killed on timeout.

    Args:
        args: (pdf_path, output_path, ceq_number, attachment_id) handed to
            the worker
        timeout_seconds: Maximum time to wait for the worker process
        pages_per_chunk: Forwarded to the worker
        chunk_timeout: Forwarded to the worker

    Returns:
        The status dict produced by the worker, or a failure dict when the
        worker timed out or exited without reporting a result.
    """
    from queue import Empty  # raised by multiprocessing queues when no item arrives

    result_queue = _mp_ctx.Queue()
    proc = _mp_ctx.Process(
        target=_convert_in_process, 
        args=(args, result_queue, pages_per_chunk, chunk_timeout)
    )
    proc.start()

    try:
        proc.join(timeout=timeout_seconds)

        if proc.is_alive():
            # The worker is stopped in the finally block below
            return {"converted": False, "num_pages": 0, "error": f"TIMEOUT after {timeout_seconds}s"}

        try:
            # A short blocking get is more forgiving than get_nowait() if the
            # result has not quite reached this end of the pipe yet.
            return result_queue.get(timeout=1)
        except (Empty, EOFError, OSError):
            return {"converted": False, "num_pages": 0, "error": "Process ended without result"}
    finally:
        # Runs on timeout AND when the parent is interrupted while waiting
        # (e.g. KeyboardInterrupt), so a worker is never left behind.
        if proc.is_alive():
            proc.terminate()
            proc.join(timeout=5)
            if proc.is_alive():
                proc.kill()
                proc.join()
        # Release the queue's pipe now instead of waiting for garbage collection
        result_queue.close()


def run_conversions(queue: pd.DataFrame, timeout_seconds: int = 1800,
                    pages_per_chunk: int = 50, chunk_timeout: int = 300) -> dict:
    """
    Convert every row of ``queue`` one after another, each under a hard timeout.

    Each row must provide sourcePath, outputPath, ceqNumber and attachmentId.
    Failures (including unexpected exceptions) are appended to the failure
    log and logged; they never stop the batch.

    Returns:
        Dict with the counts ``success`` and ``failed`` and the
        ``total_pages`` of the successfully converted documents.
    """
    tally = {'success': 0, 'failed': 0, 'total_pages': 0}

    progress = tqdm(queue.iterrows(), total=len(queue), desc="Converting")

    for _, record in progress:
        # Show a shortened filename next to the progress bar
        filename = Path(record['sourcePath']).name
        short_name = filename if len(filename) <= 40 else filename[:40] + "..."
        progress.set_postfix_str(short_name)

        job = (
            str(record['sourcePath']),
            str(record['outputPath']),
            record['ceqNumber'],
            record['attachmentId'],
        )

        try:
            outcome = convert_with_timeout(job, timeout_seconds, pages_per_chunk, chunk_timeout)

            if outcome.get('converted'):
                tally['success'] += 1
                tally['total_pages'] += outcome.get('num_pages', 0)
                continue

            # Reported failure: record it so later runs can skip this file
            tally['failed'] += 1
            append_failed_file(
                record['attachmentId'],
                record['ceqNumber'],
                str(record['sourcePath']),
                outcome.get('error', 'Unknown error'),
            )
            logger.warning(f"Failed: {short_name} - {outcome.get('error', '')[:60]}")

        except Exception as exc:
            # Anything unexpected is also counted and logged as a failure
            tally['failed'] += 1
            append_failed_file(
                record['attachmentId'],
                record['ceqNumber'],
                str(record['sourcePath']),
                f"EXCEPTION: {str(exc)[:200]}",
            )
            logger.error(f"Exception: {short_name} - {str(exc)[:60]}")

    return tally

In [22]:
# Run the conversions and report a summary
if len(conversion_queue) == 0:
    print("No files to convert.")
else:
    # Echo the effective settings before the (potentially long) run
    print(f"Starting marker conversion of {len(conversion_queue)} files...")
    print(f"Timeout per file: {PDF_TIMEOUT_SECONDS}s, per chunk: {CHUNK_TIMEOUT_SECONDS}s")
    print(f"Pages per chunk: {PAGES_PER_CHUNK}")
    print(f"Failures logged to: {FAILED_FILES_LOG}")
    print()

    import time
    start_time = time.time()

    results = run_conversions(
        conversion_queue,
        timeout_seconds=PDF_TIMEOUT_SECONDS,
        pages_per_chunk=PAGES_PER_CHUNK,
        chunk_timeout=CHUNK_TIMEOUT_SECONDS,
    )

    elapsed = time.time() - start_time

    print("\n=== Conversion Summary ===")
    print(f"Time elapsed: {elapsed/60:.1f} minutes")
    print(f"Successful: {results['success']}")
    print(f"Failed: {results['failed']}")
    print(f"Total pages: {results['total_pages']:,}")

    # Throughput figures only make sense for a non-zero duration
    if elapsed > 0:
        docs_processed = results['success'] + results['failed']
        print(f"Rate: {docs_processed/elapsed:.2f} docs/sec")
        print(f"Rate: {results['total_pages']/elapsed:.1f} pages/sec")

    if results['failed'] > 0:
        print(f"\nSee {FAILED_FILES_LOG} for failure details")

Starting marker conversion of 10 files...
Timeout per file: 1800s, per chunk: 300s
Pages per chunk: 10
Failures logged to: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata/marker_conversion_failures.csv



Converting:   0%|          | 0/10 [00:00<?, ?it/s]

Process ForkProcess-5:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/_0/grm8p2890sj6kq7p40wjd3pw0000gn/T/ipykernel_22826/2461315604.py", line 8, in _convert_in_process
    result = convert_pdf_with_marker(
             ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/_0/grm8p2890sj6kq7p40wjd3pw0000gn/T/ipykernel_22826/673105759.py", line 46, in convert_pdf_with_marker
    success, content, error = run_marker_on_pdf(chunk_path, chunk_output_dir, chunk_timeout)
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


KeyboardInterrupt: 

  File "/var/folders/_0/grm8p2890sj6kq7p40wjd3pw0000gn/T/ipykernel_22826/1794098468.py", line 17, in run_marker_on_pdf
    result = subprocess.run(
             ^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/subprocess.py", line 550, in run
    stdout, stderr = process.communicate(input, timeout=timeout)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/subprocess.py", line 1209, in communicate
    stdout, stderr = self._communicate(input, endtime, timeout)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/subprocess.py", line 2133, in _communicate
    data = os.read(key.fd, 32768)
           ^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


## Verify Conversions

In [None]:
def verify_conversions():
    """
    Print a quick summary of conversion status.

    Paths are rebuilt from the document records exactly as in the main
    workflow, then checked on disk. Only records whose source PDF exists are
    counted.

    "Failed" counts source PDFs that appear in the failure log AND still have
    no markdown output, so a file that failed once and was converted on a
    later run is not counted twice and "Remaining" cannot go negative.
    """
    doc_df = load_document_records()

    # Build paths
    doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
    doc_df['localFilename'] = doc_df.apply(
        lambda row: build_local_filename(
            row['ceqNumber'], 
            row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
        ),
        axis=1
    )
    doc_df['sourcePath'] = doc_df.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
        axis=1
    )
    doc_df['outputPath'] = doc_df.apply(
        lambda row: OUTPUT_DIR / row['year'] / (row['localFilename'].replace('.pdf', '.md').replace('.PDF', '.md')),
        axis=1
    )

    # Check status (astype(bool) keeps the masks boolean even for empty frames)
    doc_df['sourceExists'] = doc_df['sourcePath'].apply(lambda p: p.exists()).astype(bool)
    doc_df['outputExists'] = doc_df['outputPath'].apply(lambda p: p.exists()).astype(bool)

    with_source = doc_df[doc_df['sourceExists']]
    total = len(with_source)
    converted = int(with_source['outputExists'].sum())

    # Count failures that are still unresolved (logged as failed, no output yet)
    failed_ids = load_failed_files()
    logged_as_failed = with_source['attachmentId'].astype(str).isin(failed_ids)
    unresolved_failures = int((logged_as_failed & ~with_source['outputExists']).sum())

    # Avoid dividing by zero when no source PDFs are present
    percent_converted = 100 * converted / total if total else 0.0

    print(f"=== Marker Conversion Status ===")
    print(f"Source PDFs: {total}")
    print(f"Converted: {converted} ({percent_converted:.1f}%)")
    print(f"Failed: {unresolved_failures}")
    print(f"Remaining: {total - converted - unresolved_failures}")

verify_conversions()

## Sample Output Inspection

In [None]:
# Inspect a sample converted file
def show_sample_output(n_chars: int = 3000):
    """
    Print the name, size and first ``n_chars`` characters of one randomly
    chosen markdown file found anywhere under OUTPUT_DIR.
    """
    import random

    candidates = list(OUTPUT_DIR.glob("**/*.md"))
    if len(candidates) == 0:
        print("No converted files found.")
        return

    chosen = random.choice(candidates)
    size_in_bytes = chosen.stat().st_size

    print(f"File: {chosen.name}")
    print(f"Size: {size_in_bytes:,} bytes")
    print("=" * 60)

    with chosen.open('r', encoding='utf-8') as handle:
        preview = handle.read(n_chars)
    print(preview)

# Uncomment to see a sample:
# show_sample_output()