Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,17 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

## [0.1.4] - 2026-02-27

### Added
- `src/docproc/ocr.py` — Async OCR extraction via DeepFellow easyOCR API
- Retry logic with exponential backoff on 5xx responses and transport errors (3 attempts, 1s initial delay, 2x factor)
- File validation for supported types (PDF, PNG, JPG, JPEG, TIFF/TIF)
- `ocr_endpoint` field in `DeepfellowConfig`
- `httpx` as explicit dependency for HTTP calls
- `pytest-asyncio` dev dependency with `asyncio_mode = "auto"`
- Test suite for OCR module (~20 tests)

## [0.1.3] - 2026-02-27

### Added
Expand Down
1 change: 1 addition & 0 deletions config-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ directories:
deepfellow:
base_url: "http://localhost:8000"
responses_endpoint: "/v1/responses"
ocr_endpoint: "/v1/ocr"
api_key: "${DEEPFELLOW_API_KEY}"
vision_model: "gpt-4-vision"
llm_model: "deepseek"
Expand Down
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "docproc"
version = "0.1.3"
version = "0.1.4"
requires-python = ">=3.14"
dependencies = [
"watchdog>=4.0.0",
Expand All @@ -9,6 +9,7 @@ dependencies = [
"pyyaml>=6.0",
"gradio>=4.0.0",
"python-dotenv>=1.0.0",
"httpx>=0.28.0",
]

[build-system]
Expand All @@ -27,11 +28,13 @@ dev = [
"pytest-cov>=6.0.0",
"ruff>=0.11.0",
"ty>=0.0.1a0",
"pytest-asyncio>=0.25.0",
]

[tool.pytest.ini_options]
testpaths = ["tests"]
addopts = "--cov=docproc --cov-report=term-missing --cov-fail-under=80"
asyncio_mode = "auto"

[tool.ruff]
target-version = "py314"
Expand Down
2 changes: 1 addition & 1 deletion src/docproc/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.3"
__version__ = "0.1.4"
1 change: 1 addition & 0 deletions src/docproc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class DeepfellowConfig(BaseModel):

base_url: str = Field(min_length=1)
responses_endpoint: str = Field(min_length=1)
ocr_endpoint: str = Field(min_length=1)
api_key: str
vision_model: str = Field(min_length=1)
llm_model: str = Field(min_length=1)
Expand Down
175 changes: 175 additions & 0 deletions src/docproc/ocr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
"""OCR extraction via DeepFellow easyOCR API.

Sends PDF/image files to the remote easyOCR endpoint and returns
structured text with page-level breakdown. Runs async for parallel
execution with Vision extraction.
"""

import asyncio
import logging
from pathlib import Path
from typing import Any

import httpx
from pydantic import ValidationError

from docproc.config import Config
from docproc.models import OCRResult, PageText

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# File suffixes accepted by _validate_file; matched against the lowercased suffix.
SUPPORTED_EXTENSIONS = frozenset({".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".tif"})

# Retry policy used by _send_with_retry.
_MAX_RETRIES = 3  # total number of POST attempts, including the first one
_INITIAL_DELAY = 1.0  # seconds slept after the first failed attempt
_BACKOFF_FACTOR = 2.0  # the delay is multiplied by this after every sleep
_TIMEOUT_SECONDS = 120.0  # timeout value passed to each client.post call


class OCRError(Exception):
    """Signal that an OCR extraction could not be completed.

    Raised by this module for rejected input files, unreadable files, HTTP
    client errors, exhausted retries, and malformed API responses.
    """


def _validate_file(file_path: Path) -> None:
    """Reject paths that cannot be sent to the OCR endpoint.

    Args:
        file_path: Candidate document path.

    Raises:
        OCRError: If the path is not an existing regular file, or its
            lowercased suffix is not in SUPPORTED_EXTENSIONS.
    """
    if file_path.is_file():
        suffix = file_path.suffix.lower()
        if suffix in SUPPORTED_EXTENSIONS:
            # Existing regular file with an accepted suffix: nothing to report.
            return
        msg = f"Unsupported file type: {suffix}"
    else:
        msg = f"File not found or not a regular file: {file_path}"
    raise OCRError(msg)


def _build_url(config: Config) -> str:
"""Join base_url and ocr_endpoint into a full URL."""
base = config.deepfellow.base_url.rstrip("/")
endpoint = config.deepfellow.ocr_endpoint
if not endpoint.startswith("/"):
endpoint = "/" + endpoint
return base + endpoint


def _parse_response(data: dict[str, Any]) -> OCRResult:
    """Convert the OCR API's decoded JSON payload into an OCRResult.

    Args:
        data: Decoded JSON body. Expected to be an object with a ``pages``
            list whose items carry ``page_number`` and ``text``, plus an
            optional ``confidence`` value.

    Returns:
        OCRResult whose ``text`` is the page texts joined by blank lines.

    Raises:
        OCRError: If the payload is not a JSON object, lacks ``pages``, or a
            page entry is missing a field or fails model validation.
    """
    # A JSON body can legally decode to a list, string, number or null. Reject
    # those up front so callers only ever see OCRError, never a stray
    # TypeError/AttributeError from the key checks below.
    if not isinstance(data, dict):
        msg = (
            "Malformed OCR response: expected a JSON object, "
            f"got {type(data).__name__}"
        )
        raise OCRError(msg)
    if "pages" not in data:
        keys = list(data)
        msg = f"Malformed OCR response: missing 'pages' key. Response keys: {keys}"
        raise OCRError(msg)
    try:
        pages = [
            PageText(page_number=p["page_number"], text=p["text"])
            for p in data["pages"]
        ]
        full_text = "\n\n".join(p.text for p in pages)
        confidence = data.get("confidence")
        return OCRResult(text=full_text, pages=pages, confidence=confidence)
    except (KeyError, TypeError, ValidationError) as exc:
        # Covers missing page fields, non-iterable/non-mapping page data, and
        # values rejected by the PageText/OCRResult models.
        msg = f"Malformed OCR response: {exc}"
        raise OCRError(msg) from exc


async def _send_with_retry(
    client: httpx.AsyncClient,
    url: str,
    file_path: Path,
    api_key: str,
) -> dict[str, Any]:
    """POST the file to the OCR endpoint, retrying on 5xx and transport errors.

    Up to ``_MAX_RETRIES`` attempts are made. Between attempts the coroutine
    sleeps, starting at ``_INITIAL_DELAY`` seconds and multiplying the delay
    by ``_BACKOFF_FACTOR`` each time. 4xx responses and non-JSON bodies are
    not retried.

    Args:
        client: HTTP client used for every attempt.
        url: Full OCR endpoint URL.
        file_path: File whose bytes are uploaded as the ``file`` form field.
        api_key: Token sent as ``Authorization: Bearer <api_key>``.

    Returns:
        The decoded JSON body of the first response with a status below 400.

    Raises:
        OCRError: If the file cannot be read, the server answers with a 4xx
            status, the body is not valid JSON, or every attempt fails.
    """
    try:
        # read_bytes() is blocking; run it in a worker thread so other
        # coroutines (e.g. a parallel Vision extraction) keep running.
        file_bytes = await asyncio.to_thread(file_path.read_bytes)
    except OSError as exc:
        msg = f"Failed to read file {file_path}: {exc}"
        raise OCRError(msg) from exc

    # Identical for every attempt, so build them once. The payload is plain
    # bytes, which can be re-sent without rewinding anything.
    files = {"file": (file_path.name, file_bytes)}
    headers = {"Authorization": f"Bearer {api_key}"}

    delay = _INITIAL_DELAY
    last_error: Exception | None = None

    for attempt in range(1, _MAX_RETRIES + 1):
        try:
            response = await client.post(
                url,
                files=files,
                headers=headers,
                timeout=_TIMEOUT_SECONDS,
            )
        except httpx.TransportError as exc:
            # Timeouts and connection-level failures are retryable.
            last_error = OCRError(f"Transport error: {exc}")
            logger.warning(
                "OCR attempt %d/%d for '%s' failed with transport error: %s",
                attempt,
                _MAX_RETRIES,
                file_path.name,
                exc,
            )
        else:
            if response.status_code < 500:
                # Anything below 500 is final: either a client error or the
                # result. Neither is retried.
                if response.status_code >= 400:
                    msg = f"Client error {response.status_code}: {response.text}"
                    raise OCRError(msg)
                try:
                    return response.json()
                except ValueError as exc:
                    msg = (
                        f"OCR API returned non-JSON response "
                        f"(status {response.status_code}): {response.text[:200]}"
                    )
                    raise OCRError(msg) from exc

            # 5xx: remember the failure and fall through to the back-off.
            last_error = OCRError(
                f"Server error {response.status_code}: {response.text}"
            )
            logger.warning(
                "OCR attempt %d/%d for '%s' failed (HTTP %d): %s",
                attempt,
                _MAX_RETRIES,
                file_path.name,
                response.status_code,
                response.text[:200],
            )

        # Shared back-off for both retryable failure kinds; no sleep after
        # the final attempt.
        if attempt < _MAX_RETRIES:
            await asyncio.sleep(delay)
            delay *= _BACKOFF_FACTOR

    msg = f"OCR failed after {_MAX_RETRIES} attempts"
    logger.error(
        "OCR extraction failed for '%s' after %d attempts: %s",
        file_path.name,
        _MAX_RETRIES,
        last_error,
    )
    raise OCRError(msg) from last_error


async def extract_text(file_path: Path, config: Config) -> OCRResult:
    """Run a document through the DeepFellow easyOCR endpoint.

    The file is validated, uploaded (with retries) through a short-lived
    HTTP client, and the JSON reply is converted into an OCRResult.

    Args:
        file_path: Path to the PDF or image file to process.
        config: Application configuration supplying the endpoint and API key.

    Returns:
        OCRResult holding the combined text and the per-page breakdown.

    Raises:
        OCRError: If validation, upload (after retries) or response parsing
            fails.
    """
    _validate_file(file_path)
    target_url = _build_url(config)

    logger.info("Starting OCR extraction: %s", file_path.name)

    # The client lives only for this upload; it is closed before parsing.
    async with httpx.AsyncClient() as http:
        api_key = config.deepfellow.api_key
        payload = await _send_with_retry(http, target_url, file_path, api_key)

    ocr_result = _parse_response(payload)
    page_count = len(ocr_result.pages)
    logger.info("OCR complete: %s (%d pages)", file_path.name, page_count)
    return ocr_result
1 change: 1 addition & 0 deletions tests/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"deepfellow": {
"base_url": "http://localhost:8000",
"responses_endpoint": "/v1/responses",
"ocr_endpoint": "/v1/ocr",
"api_key": "test-key",
"vision_model": "gpt-4-vision",
"llm_model": "deepseek",
Expand Down
2 changes: 1 addition & 1 deletion tests/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@


def test_version_matches_expected():
    """Check that the package ``__version__`` equals the expected release string."""
    # NOTE(review): this span comes from a rendered diff ("1 addition & 1
    # deletion"), so the two asserts are presumably the removed and the added
    # side of a single line rather than two checks meant to run together.
    # Confirm against the real file.
    assert __version__ == "0.1.3"
    assert __version__ == "0.1.4"
Loading