# Minimal OCR Parser with GPU and Multiprocessing

This notebook provides a minimal OCR parser built on Docling, with GPU acceleration and multiprocessing support. It runs completely offline, provided the Docling models have already been downloaded to `~/.cache/docling/models`.


In [None]:
# Setup offline mode and imports
import os
import time
import multiprocessing as mp
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Offline mode: point Docling at the local model cache and set the
# Hugging Face / Transformers offline flags in a single update.
os.environ.update(
    DOCLING_ARTIFACTS_PATH=os.path.expanduser("~/.cache/docling/models"),
    HF_HUB_OFFLINE="1",
    TRANSFORMERS_OFFLINE="1",
)

print("✅ Environment setup complete - Running in offline mode")


In [None]:
def process_pdf(pdf_path):
    """Run the Docling OCR pipeline on one PDF and save its text/markdown exports.

    The exports are written to ``output/<stem>_ocr.txt`` and
    ``output/<stem>_ocr.md`` relative to the current working directory.

    Args:
        pdf_path: Path to the PDF file (``str`` or ``Path``).

    Returns:
        dict with the keys ``time`` (elapsed seconds, including the file
        writes), ``text`` (character count of the plain-text export) and
        ``tables`` / ``images`` / ``pages`` (element counts; 0 when the
        document has no such attribute).
    """
    print(f"🔍 OCR processing {Path(pdf_path).name}...")
    start = time.time()

    # OCR pipeline with GPU.
    # NOTE(review): `do_chunking` does not appear to be a documented
    # PdfPipelineOptions field - confirm it has any effect.
    pipeline = PdfPipelineOptions(
        artifacts_path=os.environ["DOCLING_ARTIFACTS_PATH"],
        enable_remote_services=False,
        do_table_structure=True,
        do_ocr=True,
        do_chunking=True,
    )
    pipeline.ocr_options = EasyOcrOptions(use_gpu=True, lang=['en'])

    # The options mapping must be passed as `format_options`: the first
    # positional parameter of DocumentConverter is `allowed_formats`, so a
    # positional dict would drop the pipeline options configured above.
    converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline)}
    )
    result = converter.convert(pdf_path)

    doc = result.document
    text = doc.export_to_text()
    markdown = doc.export_to_markdown()

    # Save files. Output names depend only on the PDF stem, so concurrent
    # calls for the same PDF write to the same two paths.
    out_dir = Path("output")
    out_dir.mkdir(parents=True, exist_ok=True)
    base = Path(pdf_path).stem
    # Explicit UTF-8: OCR output may contain characters that the platform's
    # default encoding cannot represent.
    (out_dir / f"{base}_ocr.txt").write_text(text, encoding="utf-8")
    (out_dir / f"{base}_ocr.md").write_text(markdown, encoding="utf-8")

    # Stats
    stats = {
        "time": time.time() - start,
        "text": len(text),
        "tables": len(getattr(doc, "tables", ())),
        "images": len(getattr(doc, "pictures", ())),
        "pages": len(getattr(doc, "pages", ())),
    }

    print(f"✅ OCR: {stats['time']:.1f}s, {stats['text']} chars, {stats['tables']} tables, {stats['images']} images")
    return stats


In [None]:
# Configuration
pdf_file = "companies_house_document.pdf"  # Input PDF; change this to your own file
processes = 2  # Number of worker processes; the run cell uses a pool only when this is > 1

# Banner: one print call, one line per argument.
print(
    "🚀 MINIMAL OCR PARSER (GPU + MULTIPROCESSING)",
    f"📄 File: {pdf_file} | ⚡ Processes: {processes}",
    "=" * 50,
    sep="\n",
)


In [None]:
# Run OCR processing
# Wall-clock timestamp taken before any work starts; also the basis for TOTAL TIME.
start_time = time.time()
print(f"⏰ START TIME: {time.strftime('%H:%M:%S', time.localtime(start_time))}")

if processes <= 1:
    # Single in-process run, wrapped in a list so `results` has the same
    # shape as the pool output below.
    results = [process_pdf(pdf_file)]
else:
    # The same PDF is submitted once per worker; map() returns one stats
    # dict per submission.
    with mp.Pool(processes=processes) as pool:
        results = pool.map(process_pdf, [pdf_file for _ in range(processes)])

end_time = time.time()
print(f"⏰ END TIME: {time.strftime('%H:%M:%S', time.localtime(end_time))}")
print(f"⏱️  TOTAL TIME: {end_time - start_time:.1f}s")
print("📁 Output: output/")
print(f"📊 Processed {len(results)} times")


In [None]:
# Display results summary: one line per run, numbered from 1.
print("\n📊 DETAILED RESULTS:")
for i, result in enumerate(results, start=1):
    print(
        f"Run {i}: {result['time']:.1f}s | Text: {result['text']} chars | "
        f"Tables: {result['tables']} | Images: {result['images']} | Pages: {result['pages']}"
    )

# An average is only reported when there were at least two runs.
if len(results) >= 2:
    avg_time = sum(entry['time'] for entry in results) / len(results)
    print(f"\n📈 Average processing time: {avg_time:.1f}s")


In [None]:
# Check output files: list everything under output/ with its size in bytes.
output_dir = Path("output")
if not output_dir.exists():
    print("❌ No output directory found")
else:
    print("📁 Generated files:")
    for file in output_dir.glob("*"):
        size = file.stat().st_size
        print(f"  - {file.name} ({size:,} bytes)")


## Usage Instructions

1. **Change the PDF file**: Modify the `pdf_file` variable in the configuration cell
2. **Adjust processes**: Change the `processes` variable to set how many worker processes run the OCR in parallel; with a value of 1 the PDF is processed once, without a process pool
3. **Run all cells**: Execute all cells in order
4. **Check output**: Results are saved in the `output/` directory

## Features

- ✅ **Completely offline** - No internet connection required
- ✅ **GPU acceleration** - Uses CUDA for faster OCR processing
- ✅ **Multiprocessing** - Runs the same PDF once per worker process in parallel (useful for comparing run times)
- ✅ **Table extraction** - Extracts tables from PDFs
- ✅ **Image detection** - Identifies images in documents
- ✅ **Multiple formats** - Outputs both text and markdown
- ✅ **Timing information** - Shows start/end times and duration
