# Convert EIS Documents to Markdown using Marker

This notebook converts PDF documents to markdown text using the `marker` library, which provides high-quality extraction with proper handling of:
- Document structure (headings, lists)
- Tables
- Equations
- Multi-column layouts

**Key Features:**
- Mirrors directory structure from `documents/` to `marker_conversions/`
- Disables image extraction (excludes maps, figures, photos)
- Automatically detects and applies OCR for scanned documents
- Tracks conversion progress to allow resuming

This replaces the text extraction previously done in `make_filter_text_tables.R`.

In [None]:
# Install required packages if needed
# !pip install marker-pdf pandas pyarrow tqdm pymupdf

In [None]:
import os
import re
import pandas as pd
import fitz  # pymupdf - for OCR detection
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime
import subprocess
import json

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [None]:
# Configuration
# All locations hang off the repository root, which is the resolved parent
# of the directory the notebook is executed from.
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT.joinpath("metadata")
DOCUMENTS_DIR = REPO_ROOT.joinpath("documents")  # Source PDFs (may be symlink to Box)
OUTPUT_DIR = REPO_ROOT.joinpath("marker_conversions")  # Output markdown files

# Input metadata file
DOC_RECORD_FILE = METADATA_DIR.joinpath("eis_document_record_api.parquet")

# Conversion tracking file
CONVERSION_STATUS_FILE = METADATA_DIR.joinpath("marker_conversion_status.pkl")

# OCR detection threshold: a PDF whose sampled pages average fewer extracted
# characters than this is treated as scanned
OCR_THRESHOLD_CHARS_PER_PAGE = 100

# Echo the resolved locations so a wrong working directory is obvious early
documents_is_symlink = DOCUMENTS_DIR.is_symlink()

print(f"Repository root: {REPO_ROOT}")
print(f"Documents directory: {DOCUMENTS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Is documents symlink: {documents_is_symlink}")
if documents_is_symlink:
    print(f"Symlink target: {DOCUMENTS_DIR.resolve()}")

## Helper Functions

In [None]:
def sanitize_filename(filename: str) -> str:
    """
    Normalize a document filename to the existing on-disk convention.

    The following steps are applied in order:
    - Drop the characters ( ) & , ~ and /
    - Collapse every run of whitespace and/or underscores into one underscore
    - Rewrite a trailing ".pdf" extension (any casing) as lower-case ".pdf"
    - Collapse a doubled trailing ".pdf.pdf" into a single ".pdf"
    - Trim leading and trailing underscores
    """
    # (pattern, replacement, regex flags), applied sequentially
    substitutions = (
        (r'[()&,~\/]', '', 0),
        (r'[\s_]+', '_', 0),
        (r'\.PDF$', '.pdf', re.IGNORECASE),
        (r'\.pdf\.pdf$', '.pdf', re.IGNORECASE),
    )
    result = filename
    for pattern, replacement, flags in substitutions:
        result = re.sub(pattern, replacement, result, flags=flags)
    return result.strip('_')


def build_local_filename(ceq_number, original_filename: str) -> str:
    """
    Compose the local filename used for a downloaded document.

    The result is the CEQ number, an underscore, and the original filename
    after it has been passed through sanitize_filename:
    {CEQ_NUMBER}_{sanitized_filename}
    """
    return f"{ceq_number}_{sanitize_filename(original_filename)}"


def get_year_from_ceq(ceq_number) -> str:
    """Return the year part of a CEQ Number: the first 4 characters of its string form."""
    ceq_text = str(ceq_number)
    return ceq_text[0:4]

In [None]:
def needs_ocr(pdf_path: Path) -> bool:
    """
    Check if a PDF needs OCR by measuring how much text can be extracted.

    Opens the PDF with PyMuPDF, extracts text from up to the first 10 pages
    and compares the average extracted length per sampled page against
    OCR_THRESHOLD_CHARS_PER_PAGE.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        True if the sampled pages average fewer characters than the
        threshold, if the document has no pages, or if the PDF cannot be
        opened or read (the error is logged as a warning). False otherwise.
    """
    try:
        doc = fitz.open(pdf_path)
        try:
            num_pages = len(doc)
            if num_pages == 0:
                return True

            # Sample up to 10 pages to check for text
            pages_to_check = min(num_pages, 10)

            # Each page contributes the length of its text with leading and
            # trailing whitespace trimmed (interior whitespace still counts).
            total_chars = sum(
                len(doc[i].get_text().strip()) for i in range(pages_to_check)
            )
        finally:
            # Always release the document, even if extraction raised midway
            doc.close()
    except Exception as e:
        logger.warning(f"Error checking OCR need for {pdf_path}: {e}")
        # If we can't check, assume OCR might be needed
        return True

    # If very little text per page, likely needs OCR
    avg_chars_per_page = total_chars / pages_to_check
    return avg_chars_per_page < OCR_THRESHOLD_CHARS_PER_PAGE

In [None]:
def convert_pdf_with_marker(pdf_path: Path, output_path: Path, force_ocr: bool = False) -> dict:
    """
    Convert a PDF to markdown using the marker_single command-line tool.

    marker writes its output into a directory, so the conversion targets a
    scratch directory next to the final file (``_temp_<output stem>``), the
    generated markdown is moved to ``output_path``, and the scratch
    directory is removed again whether or not the conversion succeeded.

    Args:
        pdf_path: Path to input PDF
        output_path: Path for output markdown file
        force_ocr: If True, force OCR processing

    Returns:
        Dict with conversion status: {success, ocr_used, error, output_file}.
        This function does not raise; every failure (non-zero exit code,
        no markdown produced, timeout, or any other exception) is reported
        through ``success=False`` and a message in ``error``.
    """
    import shutil

    def _status(success: bool, error=None, output_file=None) -> dict:
        """Build the result dict shared by all exit paths."""
        return {
            "success": success,
            "ocr_used": force_ocr,
            "error": error,
            "output_file": output_file
        }

    temp_output_dir = None
    try:
        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        temp_output_dir = output_path.parent / f"_temp_{output_path.stem}"

        # A previously interrupted run may have left this scratch directory
        # behind; clear it so stale markdown cannot be mistaken for output.
        if temp_output_dir.exists():
            shutil.rmtree(temp_output_dir)

        # Build marker command. The --disable_image_extraction/--force_ocr
        # flags belong to the marker CLI that takes the output location via
        # --output_dir rather than as a second positional argument.
        cmd = [
            "marker_single",
            str(pdf_path),
            "--output_dir",
            str(temp_output_dir),
            "--disable_image_extraction"
        ]

        if force_ocr:
            cmd.append("--force_ocr")

        # Run marker
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=600  # 10 minute timeout per file
        )

        if result.returncode != 0:
            return _status(
                False,
                error=result.stderr[:500] if result.stderr else "Unknown error"
            )

        # Find the output markdown file; sort so the pick is deterministic
        md_files = sorted(temp_output_dir.glob("**/*.md"))
        if not md_files:
            return _status(False, error="No markdown file generated")

        # Move the markdown file to its final location (replace overwrites
        # an existing target on every platform)
        md_files[0].replace(output_path)

        return _status(True, output_file=str(output_path))

    except subprocess.TimeoutExpired:
        return _status(False, error="Timeout (>10 minutes)")
    except Exception as e:
        return _status(False, error=str(e)[:500])
    finally:
        # Clean up the scratch directory on success and on every failure path
        if temp_output_dir is not None:
            shutil.rmtree(temp_output_dir, ignore_errors=True)

## Load Document Records

In [None]:
def load_document_records():
    """
    Read the document records parquet file (DOC_RECORD_FILE) into a DataFrame.

    Raises:
        FileNotFoundError: if the parquet file does not exist.
    """
    if not DOC_RECORD_FILE.exists():
        raise FileNotFoundError(
            f"Document records not found at {DOC_RECORD_FILE}.\n"
            f"Run fetch_eis_records_api.ipynb first."
        )
    return pd.read_parquet(DOC_RECORD_FILE)


def load_conversion_status():
    """
    Return the saved conversion-status table.

    Reads CONVERSION_STATUS_FILE when it exists; otherwise returns an empty
    DataFrame that already carries the tracking columns.
    """
    status_columns = [
        'ceqNumber', 'attachmentId', 'source_file', 'output_file',
        'converted', 'ocr_used', 'error', 'timestamp'
    ]
    if not CONVERSION_STATUS_FILE.exists():
        return pd.DataFrame(columns=status_columns)
    # NOTE: pickle files should only be loaded from a trusted location
    return pd.read_pickle(CONVERSION_STATUS_FILE)


def save_conversion_status(status_df: pd.DataFrame):
    """
    Persist the conversion-status table.

    Writes the pickle at CONVERSION_STATUS_FILE first, then a CSV copy
    (without the index) in METADATA_DIR for easy inspection.
    """
    csv_copy = METADATA_DIR / "marker_conversion_status.csv"
    status_df.to_pickle(CONVERSION_STATUS_FILE)
    status_df.to_csv(csv_copy, index=False)

In [None]:
# Load document records
doc_df = load_document_records()
print(f"Loaded {len(doc_df)} document records")


def _row_local_filename(row):
    """Local filename for one record: first non-empty of name, fileNameForDownload, '<attachmentId>.pdf'."""
    original_name = row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
    return build_local_filename(row['ceqNumber'], original_name)


def _row_source_path(row):
    """Expected location of the downloaded PDF: documents/<year>/<localFilename>."""
    return DOCUMENTS_DIR / row['year'] / row['localFilename']


def _row_output_path(row):
    """Expected location of the markdown: marker_conversions/<year>/<name with .pdf/.PDF replaced by .md>."""
    markdown_name = row['localFilename'].replace('.pdf', '.md').replace('.PDF', '.md')
    return OUTPUT_DIR / row['year'] / markdown_name


# Add helper columns
doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
doc_df['localFilename'] = doc_df.apply(_row_local_filename, axis=1)

# Build source and output paths
doc_df['sourcePath'] = doc_df.apply(_row_source_path, axis=1)
doc_df['outputPath'] = doc_df.apply(_row_output_path, axis=1)

display(doc_df[['ceqNumber', 'year', 'localFilename', 'sourcePath', 'outputPath']].head())

In [None]:
# Flag the records whose source PDF is present on disk
doc_df['sourceExists'] = doc_df['sourcePath'].apply(lambda p: p.exists())
print(f"Source files found: {doc_df['sourceExists'].sum()} / {len(doc_df)}")

# Flag the records whose markdown output is already present
doc_df['alreadyConverted'] = doc_df['outputPath'].apply(lambda p: p.exists())
print(f"Already converted: {doc_df['alreadyConverted'].sum()}")

# Pending work: the source is available and no output exists yet
pending_mask = doc_df['sourceExists'] & ~doc_df['alreadyConverted']
to_convert = doc_df.loc[pending_mask].copy()
print(f"\nDocuments to convert: {len(to_convert)}")

print(f"\nBy year:")
pending_by_year = to_convert['year'].value_counts().sort_index()
print(pending_by_year)

## Conversion Settings

In [None]:
# ============================================
# CONVERSION SETTINGS - MODIFY AS NEEDED
# ============================================
# These module-level values are read by the later cells that filter the
# conversion queue and run the conversions.

# Filter by year (set to None to convert all years).
# Each entry is converted with str() and matched against the 'year' column,
# so ints and strings both work.
YEAR_FILTER = None
# YEAR_FILTER = [2024, 2025]  # Example: only recent years

# Maximum number of files to convert (set to None for all).
# When set, only the first MAX_CONVERSIONS rows of the queue are kept.
# Useful for testing
MAX_CONVERSIONS = None
# MAX_CONVERSIONS = 10  # Example: test with 10 files

# Skip OCR detection and always/never force OCR
# None = auto-detect per file with needs_ocr(), True = always OCR, False = never OCR
FORCE_OCR_MODE = None

# Echo the active settings so they are recorded in the notebook output
print(f"Settings:")
print(f"  YEAR_FILTER: {YEAR_FILTER}")
print(f"  MAX_CONVERSIONS: {MAX_CONVERSIONS}")
print(f"  FORCE_OCR_MODE: {FORCE_OCR_MODE}")

In [None]:
# Apply filters to conversion queue
conversion_queue = to_convert.copy()

# Year filter: an unset (None) or empty filter keeps every year
if YEAR_FILTER:
    year_filter_str = [str(y) for y in YEAR_FILTER]
    conversion_queue = conversion_queue[conversion_queue['year'].isin(year_filter_str)]
    print(f"Filtered to years {YEAR_FILTER}: {len(conversion_queue)} documents")

# Size limit: only None means "no limit". Comparing against None explicitly
# keeps MAX_CONVERSIONS = 0 from being silently treated as "convert everything".
if MAX_CONVERSIONS is not None and len(conversion_queue) > MAX_CONVERSIONS:
    conversion_queue = conversion_queue.head(MAX_CONVERSIONS)
    print(f"Limited to {MAX_CONVERSIONS} conversions")

print(f"\nFinal conversion queue: {len(conversion_queue)} files")

## Create Output Directory Structure

In [None]:
# Make sure the root of the output tree exists
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Collect the year directories (all-digit directory names) under documents/
year_dirs = []
for entry in DOCUMENTS_DIR.iterdir():
    if entry.is_dir() and entry.name.isdigit():
        year_dirs.append(entry)
print(f"Found {len(year_dirs)} year directories in documents/")

# Mirror each year directory under marker_conversions/
for year_dir in year_dirs:
    output_year_dir = OUTPUT_DIR.joinpath(year_dir.name)
    output_year_dir.mkdir(exist_ok=True)

print(f"Created directory structure in {OUTPUT_DIR}")

## Run Conversions

In [None]:
def run_conversions(queue: pd.DataFrame) -> pd.DataFrame:
    """
    Run marker conversions on all files in the queue.

    For each row, OCR is decided by FORCE_OCR_MODE (None = auto-detect with
    needs_ocr, otherwise the configured value is used as-is), the PDF is
    converted, and one status record is collected. After every 50 processed
    files the records so far are merged into the saved conversion status.

    Args:
        queue: DataFrame with ceqNumber, attachmentId, sourcePath and
            outputPath columns

    Returns:
        DataFrame with one row per processed file and the columns
        ceqNumber, attachmentId, source_file, output_file, converted,
        ocr_used, error, timestamp. The columns are present even when the
        queue is empty.
    """
    result_columns = [
        'ceqNumber', 'attachmentId', 'source_file', 'output_file',
        'converted', 'ocr_used', 'error', 'timestamp'
    ]
    results = []

    for _, row in tqdm(queue.iterrows(), total=len(queue), desc="Converting"):
        source_path = row['sourcePath']
        output_path = row['outputPath']

        # Determine if OCR is needed
        if FORCE_OCR_MODE is None:
            use_ocr = needs_ocr(source_path)
            if use_ocr:
                logger.info(f"OCR needed for {source_path.name}")
        else:
            use_ocr = FORCE_OCR_MODE

        # Run conversion
        result = convert_pdf_with_marker(source_path, output_path, force_ocr=use_ocr)

        results.append({
            'ceqNumber': row['ceqNumber'],
            'attachmentId': row['attachmentId'],
            'source_file': str(source_path),
            'output_file': result['output_file'],
            'converted': result['success'],
            'ocr_used': result['ocr_used'],
            'error': result['error'],
            'timestamp': datetime.now().isoformat()
        })

        # Save progress periodically (every 50 files)
        if len(results) % 50 == 0:
            temp_results = pd.DataFrame(results, columns=result_columns)
            existing_status = load_conversion_status()
            # Leave empty frames out of the concat so they cannot influence
            # the resulting column dtypes (e.g. turn 'converted' into object).
            frames = [frame for frame in (existing_status, temp_results) if not frame.empty]
            combined = pd.concat(frames, ignore_index=True)
            # A re-processed attachment replaces its earlier record
            combined = combined.drop_duplicates(subset=['attachmentId'], keep='last')
            save_conversion_status(combined)
            logger.info(f"Progress saved: {len(results)} files processed")

    # Passing the columns keeps the schema stable for an empty result list
    return pd.DataFrame(results, columns=result_columns)

In [None]:
# Run the conversions
if len(conversion_queue) > 0:
    print(f"Starting conversion of {len(conversion_queue)} files...")
    print(f"This may take a while. Progress is saved every 50 files.")

    conversion_results = run_conversions(conversion_queue)

    # Merge with existing status. Empty frames are left out of the concat so
    # an empty status table cannot influence the merged column dtypes.
    existing_status = load_conversion_status()
    status_frames = [
        frame for frame in (existing_status, conversion_results) if not frame.empty
    ]
    combined_status = pd.concat(status_frames, ignore_index=True)
    # Keep only the most recent record for each attachment
    combined_status = combined_status.drop_duplicates(subset=['attachmentId'], keep='last')
    save_conversion_status(combined_status)

    # Summary
    success_count = conversion_results['converted'].sum()
    fail_count = len(conversion_results) - success_count
    ocr_count = conversion_results['ocr_used'].sum()

    print(f"\n=== Conversion Summary ===")
    print(f"Successful: {success_count}")
    print(f"Failed: {fail_count}")
    print(f"Used OCR: {ocr_count}")

    if fail_count > 0:
        print(f"\nFailed conversions:")
        display(conversion_results[~conversion_results['converted']][['ceqNumber', 'source_file', 'error']])
else:
    print("No files to convert. All files already converted or queue is empty.")

## Verify Conversions

In [None]:
def verify_conversions():
    """
    Compare expected conversions against existing output files.

    Reloads the document records, rebuilds each record's expected source and
    output path, and prints overall and per-year conversion counts for the
    records whose source file exists.

    Returns:
        The full document-record DataFrame with the added columns year,
        localFilename, sourcePath, outputPath, sourceExists and outputExists.
    """
    doc_df_full = load_document_records()

    # Build expected output paths
    doc_df_full['year'] = doc_df_full['ceqNumber'].astype(str).str[:4]
    doc_df_full['localFilename'] = doc_df_full.apply(
        lambda row: build_local_filename(
            row['ceqNumber'],
            row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
        ),
        axis=1
    )
    doc_df_full['sourcePath'] = doc_df_full.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
        axis=1
    )
    doc_df_full['outputPath'] = doc_df_full.apply(
        lambda row: OUTPUT_DIR / row['year'] / row['localFilename'].replace('.pdf', '.md').replace('.PDF', '.md'),
        axis=1
    )

    # Check existence
    doc_df_full['sourceExists'] = doc_df_full['sourcePath'].apply(lambda p: p.exists())
    doc_df_full['outputExists'] = doc_df_full['outputPath'].apply(lambda p: p.exists())

    # Only count documents where source exists
    with_source = doc_df_full[doc_df_full['sourceExists']]

    total = len(with_source)
    converted = with_source['outputExists'].sum()
    remaining = total - converted

    print(f"=== Conversion Verification ===")
    print(f"Total source documents: {total}")

    # Without any source documents the percentages below would divide by zero
    if total == 0:
        print("No source documents found; nothing to verify.")
        return doc_df_full

    print(f"Converted: {converted} ({100*converted/total:.1f}%)")
    print(f"Remaining: {remaining} ({100*remaining/total:.1f}%)")

    print(f"\nBy year:")
    summary = with_source.groupby('year').agg(
        total=('outputExists', 'count'),
        converted=('outputExists', 'sum')
    )
    summary['remaining'] = summary['total'] - summary['converted']
    summary['pct_complete'] = (100 * summary['converted'] / summary['total']).round(1)
    display(summary)

    return doc_df_full

verification_df = verify_conversions()

## Retry Failed Conversions

In [None]:
def retry_failed_conversions():
    """
    Retry any previously failed conversions.

    Loads the saved conversion status, selects the records not marked as
    converted, rebuilds their source/output paths from the document records,
    re-runs the conversions, and replaces the old status records with the
    new results. Prints a short summary; returns None.
    """
    status_df = load_conversion_status()

    # Compare against True explicitly: this yields a proper boolean mask even
    # if the 'converted' column was stored with object dtype or has missing
    # values, where a bare `~column` would not.
    failed = status_df[~status_df['converted'].eq(True)]

    if len(failed) == 0:
        print("No failed conversions to retry.")
        return

    print(f"Retrying {len(failed)} failed conversions...")

    # Rebuild queue from failed records
    doc_df_full = load_document_records()
    retry_queue = doc_df_full[doc_df_full['attachmentId'].isin(failed['attachmentId'])].copy()

    # The failed attachments may no longer be present in the document records
    if retry_queue.empty:
        print("No matching document records for failed conversions; nothing to retry.")
        return

    # Add required columns
    retry_queue['year'] = retry_queue['ceqNumber'].astype(str).str[:4]
    retry_queue['localFilename'] = retry_queue.apply(
        lambda row: build_local_filename(
            row['ceqNumber'],
            row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
        ),
        axis=1
    )
    retry_queue['sourcePath'] = retry_queue.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
        axis=1
    )
    retry_queue['outputPath'] = retry_queue.apply(
        lambda row: OUTPUT_DIR / row['year'] / row['localFilename'].replace('.pdf', '.md').replace('.PDF', '.md'),
        axis=1
    )

    # Run conversions
    retry_results = run_conversions(retry_queue)

    # Update status: drop the old records of the retried attachments, then
    # append the new ones. Empty frames are left out of the concat so they
    # cannot influence the resulting column dtypes.
    status_df = status_df[~status_df['attachmentId'].isin(retry_results['attachmentId'])]
    frames = [frame for frame in (status_df, retry_results) if not frame.empty]
    combined_status = pd.concat(frames, ignore_index=True)
    save_conversion_status(combined_status)

    success_count = retry_results['converted'].sum()
    print(f"Retry complete: {success_count}/{len(retry_results)} successful")

# Uncomment to retry failed conversions:
# retry_failed_conversions()

## Sample Output Inspection

In [None]:
# Inspect a sample converted file
def show_sample_output(n_chars: int = 2000):
    """
    Print the first n_chars characters of one randomly chosen markdown file
    found anywhere under OUTPUT_DIR, preceded by its path and size.
    """
    import random

    candidates = list(OUTPUT_DIR.glob("**/*.md"))
    if len(candidates) == 0:
        print("No converted files found yet.")
        return

    # Pick a random file
    sample_file = random.choice(candidates)
    size_in_bytes = sample_file.stat().st_size

    print(f"Sample file: {sample_file}")
    print(f"File size: {size_in_bytes:,} bytes")
    print("=" * 60)

    with sample_file.open('r', encoding='utf-8') as handle:
        content = handle.read(n_chars)

    print(content)
    # A full-length read is taken to mean the file continues past the excerpt
    if len(content) == n_chars:
        print(f"\n... [truncated at {n_chars} chars]")

# Uncomment to see a sample:
# show_sample_output()