# Convert EIS Documents to Text using PyMuPDF

This notebook converts PDF documents to text using `pymupdf` (fitz), which provides fast extraction suitable for large document corpora.

**Key Features:**
- Mirrors directory structure from `documents/` to `text_conversions/`
- Very fast: processes hundreds of pages per second
- Handles both digital and scanned PDFs (with OCR fallback if needed)
- Tracks conversion progress to allow resuming
- Parallel processing support

**Output format:** Plain text files (`.txt`) with page breaks indicated.

This replaces the text extraction previously done in `make_filter_text_tables.R`.

In [2]:
# Install required packages if needed
# !pip install pymupdf pandas pyarrow tqdm

In [3]:
import os
import re
import pandas as pd
import fitz  # pymupdf
from pathlib import Path
from tqdm.notebook import tqdm
import logging
from datetime import datetime

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [4]:
# Configuration
REPO_ROOT = Path("../").resolve()
METADATA_DIR = REPO_ROOT / "metadata"
DOCUMENTS_DIR = REPO_ROOT / "documents"  # Source PDFs (may be symlink to Box)
OUTPUT_DIR = REPO_ROOT / "text_conversions"  # Output text files

# Input metadata file
DOC_RECORD_FILE = METADATA_DIR / "eis_document_record_api.parquet"

# Failed files tracking (simple CSV - only tracks failures)
FAILED_FILES_LOG = METADATA_DIR / "text_conversion_failures.csv"

print(f"Repository root: {REPO_ROOT}")
print(f"Documents directory: {DOCUMENTS_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Failed files log: {FAILED_FILES_LOG}")

Repository root: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository
Documents directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/documents
Output directory: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/text_conversions
Failed files log: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata/text_conversion_failures.csv


## Helper Functions

In [6]:
def sanitize_filename(filename: str) -> str:
    """
    Sanitize filename to match existing convention.
    - Remove special characters: ( ) & , ~
    - Replace spaces with underscores
    - Normalize PDF extension
    """
    clean = re.sub(r'[()&,~\/]', '', filename)
    clean = re.sub(r'[\s_]+', '_', clean)
    clean = re.sub(r'\.PDF$', '.pdf', clean, flags=re.IGNORECASE)
    clean = re.sub(r'\.pdf\.pdf$', '.pdf', clean, flags=re.IGNORECASE)
    clean = clean.strip('_')
    return clean


def build_local_filename(ceq_number, original_filename: str) -> str:
    """
    Build the local filename following existing convention.
    Format: {CEQ_NUMBER}_{sanitized_filename}
    """
    sanitized = sanitize_filename(original_filename)
    return f"{ceq_number}_{sanitized}"


def get_year_from_ceq(ceq_number) -> str:
    """Extract year from CEQ Number (first 4 digits)."""
    return str(ceq_number)[:4]

In [7]:
import signal
import multiprocessing as mp
from functools import partial

class TimeoutError(Exception):
    """Custom timeout error for PDF processing."""
    pass


def _timeout_handler(signum, frame):
    raise TimeoutError("PDF processing timed out")


def extract_text_from_pdf(pdf_path: Path, max_pages: int = 5000, timeout_seconds: int = 120) -> tuple[str, int, bool]:
    """
    Extract text from a PDF using pymupdf with defensive error handling.
    
    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum pages to process (skip very large docs)
        timeout_seconds: Maximum time allowed for processing a single PDF
    
    Returns:
        Tuple of (extracted_text, num_pages, has_text)
    """
    doc = None
    
    # Set up timeout (Unix only - on Windows this is a no-op)
    old_handler = None
    if hasattr(signal, 'SIGALRM'):
        old_handler = signal.signal(signal.SIGALRM, _timeout_handler)
        signal.alarm(timeout_seconds)
    
    try:
        # Check file size first (skip files > 500MB)
        file_size = pdf_path.stat().st_size
        if file_size > 500 * 1024 * 1024:
            return f"SKIPPED: File too large ({file_size / 1024 / 1024:.0f} MB)", 0, False
        
        doc = fitz.open(pdf_path)
        num_pages = len(doc)
        
        if num_pages == 0:
            doc.close()
            return "", 0, False
        
        if num_pages > max_pages:
            doc.close()
            return f"SKIPPED: Too many pages ({num_pages})", num_pages, False
        
        text_parts = []
        total_chars = 0
        
        for page_num in range(num_pages):
            try:
                page = doc[page_num]
                page_text = page.get_text()
                total_chars += len(page_text.strip())
                
                # Add page marker and text
                text_parts.append(f"\n\n--- PAGE {page_num + 1} ---\n\n")
                text_parts.append(page_text)
            except Exception as page_error:
                # Skip problematic pages but continue
                text_parts.append(f"\n\n--- PAGE {page_num + 1} (ERROR: {str(page_error)[:100]}) ---\n\n")
        
        doc.close()
        doc = None
        
        full_text = "".join(text_parts)
        has_text = total_chars > (num_pages * 50)  # At least 50 chars per page average
        
        return full_text, num_pages, has_text
    
    except TimeoutError:
        if doc is not None:
            try:
                doc.close()
            except:
                pass
        return f"ERROR: Timeout after {timeout_seconds} seconds", 0, False
        
    except Exception as e:
        if doc is not None:
            try:
                doc.close()
            except:
                pass
        return f"ERROR: {str(e)[:200]}", 0, False
    
    finally:
        # Cancel timeout and restore old handler
        if hasattr(signal, 'SIGALRM'):
            signal.alarm(0)
            if old_handler is not None:
                signal.signal(signal.SIGALRM, old_handler)

In [8]:
def convert_single_pdf(args: tuple, timeout_seconds: int = 120) -> dict:
    """
    Convert a single PDF to text with robust error handling.
    
    Args:
        args: Tuple of (pdf_path, output_path, ceq_number, attachment_id)
        timeout_seconds: Maximum time allowed for processing
    
    Returns:
        Dict with conversion status
    """
    pdf_path, output_path, ceq_number, attachment_id = args
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)
    
    try:
        # Check if source exists
        if not pdf_path.exists():
            return {
                "ceqNumber": ceq_number,
                "attachmentId": attachment_id,
                "source_file": str(pdf_path),
                "output_file": None,
                "converted": False,
                "num_pages": 0,
                "has_text": False,
                "error": "Source file not found",
                "timestamp": datetime.now().isoformat()
            }
        
        # Extract text with timeout
        text, num_pages, has_text = extract_text_from_pdf(pdf_path, timeout_seconds=timeout_seconds)
        
        # Check for errors or skipped files
        if text.startswith("ERROR:") or text.startswith("SKIPPED:"):
            return {
                "ceqNumber": ceq_number,
                "attachmentId": attachment_id,
                "source_file": str(pdf_path),
                "output_file": None,
                "converted": False,
                "num_pages": num_pages,
                "has_text": False,
                "error": text,
                "timestamp": datetime.now().isoformat()
            }
        
        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Write text file
        with open(output_path, 'w', encoding='utf-8', errors='replace') as f:
            f.write(text)
        
        return {
            "ceqNumber": ceq_number,
            "attachmentId": attachment_id,
            "source_file": str(pdf_path),
            "output_file": str(output_path),
            "converted": True,
            "num_pages": num_pages,
            "has_text": has_text,
            "error": None,
            "timestamp": datetime.now().isoformat()
        }
        
    except Exception as e:
        return {
            "ceqNumber": ceq_number,
            "attachmentId": attachment_id,
            "source_file": str(pdf_path),
            "output_file": None,
            "converted": False,
            "num_pages": 0,
            "has_text": False,
            "error": f"CONVERT_ERROR: {str(e)[:400]}",
            "timestamp": datetime.now().isoformat()
        }

## Load Document Records

In [10]:
def load_document_records():
    """Load document records from the API metadata."""
    if DOC_RECORD_FILE.exists():
        return pd.read_parquet(DOC_RECORD_FILE)
    else:
        raise FileNotFoundError(
            f"Document records not found at {DOC_RECORD_FILE}.\n"
            f"Run fetch_eis_records_api.ipynb first."
        )


def load_failed_files() -> set:
    """Load set of attachment IDs that previously failed."""
    if FAILED_FILES_LOG.exists():
        df = pd.read_csv(FAILED_FILES_LOG)
        return set(df['attachmentId'].astype(str))
    return set()


def append_failed_file(attachment_id, ceq_number, source_file, error):
    """Append a failed file to the log."""
    file_exists = FAILED_FILES_LOG.exists()
    with open(FAILED_FILES_LOG, 'a', encoding='utf-8') as f:
        if not file_exists:
            f.write("attachmentId,ceqNumber,source_file,error,timestamp\n")
        # Escape commas and quotes in error message
        error_clean = str(error).replace('"', "'").replace('\n', ' ')[:200]
        f.write(f'{attachment_id},{ceq_number},"{source_file}","{error_clean}",{datetime.now().isoformat()}\n')

In [11]:
# Load document records
doc_df = load_document_records()
print(f"Loaded {len(doc_df)} document records")

# Add helper columns
doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
doc_df['localFilename'] = doc_df.apply(
    lambda row: build_local_filename(
        row['ceqNumber'], 
        row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
    ),
    axis=1
)

# Build source and output paths
doc_df['sourcePath'] = doc_df.apply(
    lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
    axis=1
)
doc_df['outputPath'] = doc_df.apply(
    lambda row: OUTPUT_DIR / row['year'] / (row['localFilename'].replace('.pdf', '.txt').replace('.PDF', '.txt')),
    axis=1
)

display(doc_df[['ceqNumber', 'year', 'localFilename', 'sourcePath', 'outputPath']].head())

Loaded 45704 document records


Unnamed: 0,ceqNumber,year,localFilename,sourcePath,outputPath
0,20250186,2025,20250186_LoMo_FRR_Comprehensive_Study_Draft_Re...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
1,20250186,2025,20250186_LoMo_System_Plan_-_Basis_of_Estimate.pdf,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
2,20250186,2025,20250186_Appendix_A.1_LoMo_FRM_Past_Performanc...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
3,20250186,2025,20250186_Appendix_A.2.1_LoMo_RAS_Calibration_O...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...
4,20250186,2025,20250186_Appendix_A.2.2_LoMo_RAS_Calibration_K...,/Users/admin-tascott/Documents/GitHub/eis_docu...,/Users/admin-tascott/Documents/GitHub/eis_docu...


In [12]:
# Check which source files exist
doc_df['sourceExists'] = doc_df['sourcePath'].apply(lambda p: p.exists())
print(f"Source files found: {doc_df['sourceExists'].sum()} / {len(doc_df)}")

# Documents with source files available for conversion
to_convert = doc_df[doc_df['sourceExists']].copy()
print(f"Documents available for conversion: {len(to_convert)}")

print(f"\nBy year:")
print(to_convert['year'].value_counts().sort_index())

Source files found: 45559 / 45704
Documents available for conversion: 45559

By year:
year
1987       2
1988       1
1990       2
1991       4
1992       3
1993       3
1994      10
1995      20
1996      59
1997      82
1998      58
1999     202
2000     369
2001     481
2002     596
2003     562
2004     631
2005     755
2006     712
2007     912
2008     911
2009     894
2010     661
2011     406
2012     991
2013    3007
2014    2918
2015    3991
2016    3651
2017    2697
2018    3239
2019    2865
2020    3280
2021    2309
2022    2206
2023    1633
2024    2498
2025    1938
Name: count, dtype: int64


## Conversion Settings

In [14]:
# ============================================
# CONVERSION SETTINGS - MODIFY AS NEEDED
# ============================================

# Filter by year (set to None to convert all years)
YEAR_FILTER = None
YEAR_FILTER = [2019]  # Example: only recent years

# Maximum number of files to convert (set to None for all)
MAX_CONVERSIONS = None
# MAX_CONVERSIONS = 100  # Example: test with 100 files

# Set to True to retry previously failed files
RETRY_FAILURES = True

# Timeout per PDF in seconds
# Files hanging longer than this are killed (likely Box sync issues)
# 30 seconds is plenty for normal PDFs - increase only for very large local files
PDF_TIMEOUT_SECONDS = 30

print(f"Settings:")
print(f"  YEAR_FILTER: {YEAR_FILTER}")
print(f"  MAX_CONVERSIONS: {MAX_CONVERSIONS}")
print(f"  RETRY_FAILURES: {RETRY_FAILURES}")
print(f"  PDF_TIMEOUT_SECONDS: {PDF_TIMEOUT_SECONDS}")

Settings:
  YEAR_FILTER: [2019]
  MAX_CONVERSIONS: None
  RETRY_FAILURES: True
  PDF_TIMEOUT_SECONDS: 30


In [15]:
# Build conversion queue
conversion_queue = to_convert.copy()

# Apply year filter
if YEAR_FILTER:
    year_filter_str = [str(y) for y in YEAR_FILTER]
    conversion_queue = conversion_queue[conversion_queue['year'].isin(year_filter_str)]
    print(f"Filtered to years {YEAR_FILTER}: {len(conversion_queue)} documents")

# Skip files that already have .txt output
conversion_queue['outputExists'] = conversion_queue['outputPath'].apply(lambda p: p.exists())
existing_count = conversion_queue['outputExists'].sum()
print(f"Already converted (txt exists): {existing_count}")
conversion_queue = conversion_queue[~conversion_queue['outputExists']]

# Skip previously failed files (unless RETRY_FAILURES is True)
if not RETRY_FAILURES:
    failed_ids = load_failed_files()
    if failed_ids:
        conversion_queue['previouslyFailed'] = conversion_queue['attachmentId'].astype(str).isin(failed_ids)
        failed_count = conversion_queue['previouslyFailed'].sum()
        print(f"Previously failed (skipping): {failed_count}")
        conversion_queue = conversion_queue[~conversion_queue['previouslyFailed']]

# Apply max conversions limit
if MAX_CONVERSIONS and len(conversion_queue) > MAX_CONVERSIONS:
    conversion_queue = conversion_queue.head(MAX_CONVERSIONS)
    print(f"Limited to {MAX_CONVERSIONS} conversions")

print(f"\nFinal conversion queue: {len(conversion_queue)} files")

Filtered to years [2019]: 2865 documents
Already converted (txt exists): 0

Final conversion queue: 2865 files


## Create Output Directory Structure

In [17]:
# Create output directory structure mirroring documents/
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Get all year directories from documents
year_dirs = [d for d in DOCUMENTS_DIR.iterdir() if d.is_dir() and d.name.isdigit()]
print(f"Found {len(year_dirs)} year directories in documents/")

# Create corresponding directories in marker_conversions/
for year_dir in year_dirs:
    output_year_dir = OUTPUT_DIR / year_dir.name
    output_year_dir.mkdir(exist_ok=True)

print(f"Created directory structure in {OUTPUT_DIR}")

Found 38 year directories in documents/
Created directory structure in /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/text_conversions


## Run Conversions

In [19]:
import multiprocessing as mp

# Force fork context on macOS (default is spawn which can't pickle notebook functions)
_mp_ctx = mp.get_context('fork')

def _convert_in_process(args: tuple, result_queue):
    """Worker function that runs in a separate process and puts result in queue."""
    pdf_path, output_path, ceq_number, attachment_id = args
    pdf_path = Path(pdf_path)
    output_path = Path(output_path)
    
    try:
        if not pdf_path.exists():
            result_queue.put({"converted": False, "num_pages": 0, "error": "Source file not found"})
            return
        
        file_size = pdf_path.stat().st_size
        if file_size > 500 * 1024 * 1024:
            result_queue.put({"converted": False, "num_pages": 0, "error": f"SKIPPED: File too large ({file_size // 1024 // 1024} MB)"})
            return
        
        doc = fitz.open(pdf_path)
        num_pages = len(doc)
        
        if num_pages == 0:
            doc.close()
            result_queue.put({"converted": False, "num_pages": 0, "error": "Empty PDF"})
            return
        
        if num_pages > 5000:
            doc.close()
            result_queue.put({"converted": False, "num_pages": num_pages, "error": f"SKIPPED: Too many pages ({num_pages})"})
            return
        
        text_parts = []
        for page_num in range(num_pages):
            try:
                page = doc[page_num]
                page_text = page.get_text()
                text_parts.append(f"\n\n--- PAGE {page_num + 1} ---\n\n")
                text_parts.append(page_text)
            except:
                text_parts.append(f"\n\n--- PAGE {page_num + 1} (ERROR) ---\n\n")
        
        doc.close()
        
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8', errors='replace') as f:
            f.write("".join(text_parts))
        
        result_queue.put({"converted": True, "num_pages": num_pages, "error": None})
        
    except Exception as e:
        result_queue.put({"converted": False, "num_pages": 0, "error": str(e)[:200]})


def convert_with_timeout(args: tuple, timeout_seconds: int) -> dict:
    """Run conversion in a subprocess that can be killed on timeout."""
    result_queue = _mp_ctx.Queue()
    proc = _mp_ctx.Process(target=_convert_in_process, args=(args, result_queue))
    proc.start()
    proc.join(timeout=timeout_seconds)
    
    if proc.is_alive():
        proc.terminate()
        proc.join(timeout=2)
        if proc.is_alive():
            proc.kill()
            proc.join()
        return {"converted": False, "num_pages": 0, "error": f"TIMEOUT after {timeout_seconds}s (Box sync issue?)"}
    
    try:
        return result_queue.get_nowait()
    except:
        return {"converted": False, "num_pages": 0, "error": "Process ended without result"}


def run_conversions(queue: pd.DataFrame, timeout_seconds: int = 30) -> dict:
    """Run text extraction with hard timeouts that kill hung processes."""
    success_count = 0
    fail_count = 0
    total_pages = 0
    
    pbar = tqdm(queue.iterrows(), total=len(queue), desc="Converting")
    
    for idx, row in pbar:
        filename = Path(row['sourcePath']).name
        short_name = filename[:40] + "..." if len(filename) > 40 else filename
        pbar.set_postfix_str(short_name)
        
        args = (str(row['sourcePath']), str(row['outputPath']), row['ceqNumber'], row['attachmentId'])
        
        try:
            result = convert_with_timeout(args, timeout_seconds)
            
            if result.get('converted'):
                success_count += 1
                total_pages += result.get('num_pages', 0)
            else:
                fail_count += 1
                append_failed_file(
                    row['attachmentId'],
                    row['ceqNumber'],
                    str(row['sourcePath']),
                    result.get('error', 'Unknown error')
                )
                logger.warning(f"Failed: {short_name} - {result.get('error', '')[:60]}")
                
        except Exception as e:
            fail_count += 1
            append_failed_file(
                row['attachmentId'],
                row['ceqNumber'],
                str(row['sourcePath']),
                f"EXCEPTION: {str(e)[:200]}"
            )
            logger.error(f"Exception: {short_name} - {str(e)[:60]}")
    
    return {
        'success': success_count,
        'failed': fail_count,
        'total_pages': total_pages
    }

In [20]:
# Run the conversions
if len(conversion_queue) > 0:
    print(f"Starting conversion of {len(conversion_queue)} files...")
    print(f"Timeout per file: {PDF_TIMEOUT_SECONDS} seconds")
    print(f"Failures logged to: {FAILED_FILES_LOG}")
    print()
    
    import time
    start_time = time.time()
    
    results = run_conversions(conversion_queue, timeout_seconds=PDF_TIMEOUT_SECONDS)
    
    elapsed = time.time() - start_time
    
    print(f"\n=== Conversion Summary ===")
    print(f"Time elapsed: {elapsed/60:.1f} minutes")
    print(f"Successful: {results['success']}")
    print(f"Failed: {results['failed']}")
    print(f"Total pages: {results['total_pages']:,}")
    if elapsed > 0:
        print(f"Rate: {(results['success'] + results['failed'])/elapsed:.1f} docs/sec")
    
    if results['failed'] > 0:
        print(f"\nSee {FAILED_FILES_LOG} for failure details")
else:
    print("No files to convert.")

Starting conversion of 2865 files...
Timeout per file: 30 seconds
Failures logged to: /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata/text_conversion_failures.csv



Converting:   0%|          | 0/2865 [00:00<?, ?it/s]

MuPDF error: format error: cannot find page 1333 in page tree

MuPDF error: format error: cannot find page 1344 in page tree

MuPDF error: format error: cannot find page 1350 in page tree

MuPDF error: format error: cannot find page 1350 in page tree

MuPDF error: format error: cannot find page 1356 in page tree

MuPDF error: format error: cannot find page 1360 in page tree

MuPDF error: format error: cannot find page 1379 in page tree

MuPDF error: format error: cannot find page 1381 in page tree

MuPDF error: format error: cannot find page 1389 in page tree

MuPDF error: format error: cannot find page 1392 in page tree

MuPDF error: format error: cannot find page 1394 in page tree

MuPDF error: format error: cannot find page 1394 in page tree

MuPDF error: format error: cannot find page 1406 in page tree

MuPDF error: format error: cannot find page 1510 in page tree

MuPDF error: format error: cannot find page 1510 in page tree

MuPDF error: format error: cannot find page 1511 in pag



MuPDF error: syntax error: unknown keyword: 'findresource'

MuPDF error: syntax error: unknown keyword: 'begin'

MuPDF error: syntax error: unknown keyword: 'dict'

MuPDF error: syntax error: unknown keyword: 'begin'

MuPDF error: syntax error: unknown keyword: 'begincmap'

MuPDF error: syntax error: unknown keyword: 'def'

MuPDF error: syntax error: unknown keyword: 'def'

MuPDF error: syntax error: unknown keyword: 'def'

MuPDF error: syntax error: unknown keyword: 'begincodespacerange'

MuPDF error: syntax error: unknown keyword: 'endcodespacerange'

MuPDF error: syntax error: unknown keyword: 'beginbfchar'

MuPDF error: syntax error: unknown keyword: 'endbfchar'

MuPDF error: syntax error: unknown keyword: 'endcmap'

MuPDF error: syntax error: unknown keyword: 'CMapName'

MuPDF error: syntax error: unknown keyword: 'currentdict'

MuPDF error: syntax error: unknown keyword: 'defineresource'

MuPDF error: syntax error: unknown keyword: 'pop'

MuPDF error: syntax error: unknown keywor



MuPDF error: format error: object is not a stream

MuPDF error: library error: zlib error: invalid literal/lengths set

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax er

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)




MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF error: syntax error: cannot find ExtGState resource 'XGS1'

MuPDF err



MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'

MuPDF error: syntax error: cannot find ExtGState resource 'GS0'


=== Conversion Summary ===
Time elapsed: 403.4 minutes
Successful: 2856
Failed: 9
Total pages: 434,840
Rate: 0.1 docs/sec

See /Users/admin-tascott/Documents/GitHub/eis_documents/enepa_repository/metadata/text_conversion_failures.csv for failure details


In [21]:
conversion_results['source_file'][0]

NameError: name 'conversion_results' is not defined

## Verify Conversions

In [None]:
def verify_conversions():
    """Quick verification of conversion status."""
    doc_df = load_document_records()
    
    # Build paths
    doc_df['year'] = doc_df['ceqNumber'].astype(str).str[:4]
    doc_df['localFilename'] = doc_df.apply(
        lambda row: build_local_filename(
            row['ceqNumber'], 
            row['name'] or row['fileNameForDownload'] or f"{row['attachmentId']}.pdf"
        ),
        axis=1
    )
    doc_df['sourcePath'] = doc_df.apply(
        lambda row: DOCUMENTS_DIR / row['year'] / row['localFilename'],
        axis=1
    )
    doc_df['outputPath'] = doc_df.apply(
        lambda row: OUTPUT_DIR / row['year'] / (row['localFilename'].replace('.pdf', '.txt').replace('.PDF', '.txt')),
        axis=1
    )
    
    # Check status
    doc_df['sourceExists'] = doc_df['sourcePath'].apply(lambda p: p.exists())
    doc_df['outputExists'] = doc_df['outputPath'].apply(lambda p: p.exists())
    
    with_source = doc_df[doc_df['sourceExists']]
    total = len(with_source)
    converted = with_source['outputExists'].sum()
    
    # Count failures
    failed_ids = load_failed_files()
    
    print(f"=== Conversion Status ===")
    print(f"Source PDFs: {total}")
    print(f"Converted: {converted} ({100*converted/total:.1f}%)")
    print(f"Failed: {len(failed_ids)}")
    print(f"Remaining: {total - converted - len(failed_ids)}")

verify_conversions()

## Retry Failed Conversions

In [None]:
# View failed files
def show_failures():
    if FAILED_FILES_LOG.exists():
        df = pd.read_csv(FAILED_FILES_LOG)
        print(f"Total failures: {len(df)}")
        display(df)
    else:
        print("No failures logged.")

# Uncomment to view failures:
# show_failures()

# To retry failures, set RETRY_FAILURES = True in cell 13 and re-run
# To clear failure log and start fresh: 
# FAILED_FILES_LOG.unlink()

## Sample Output Inspection

In [None]:
# Inspect a sample converted file
def show_sample_output(n_chars: int = 2000):
    """Display the beginning of a random converted text file."""
    txt_files = list(OUTPUT_DIR.glob("**/*.txt"))
    
    if not txt_files:
        print("No converted files found.")
        return
    
    import random
    sample_file = random.choice(txt_files)
    
    print(f"File: {sample_file.name}")
    print(f"Size: {sample_file.stat().st_size:,} bytes")
    print("=" * 60)
    
    with open(sample_file, 'r', encoding='utf-8') as f:
        print(f.read(n_chars))

# Uncomment to see a sample:
# show_sample_output()