<a href="https://www.kaggle.com/code/trungcnguyn/ocr-run-testing?scriptVersionId=286084074" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
#!pip install -r /kaggle/input/venv-requirement/requirements.txt
!pip install paddleocr paddlepaddle-gpu paddlepaddle 

Collecting paddleocr
  Downloading paddleocr-3.3.2-py3-none-any.whl.metadata (55 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m55.2/55.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting paddlepaddle-gpu
  Downloading paddlepaddle_gpu-2.6.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.6 kB)
Collecting paddlepaddle
  Downloading paddlepaddle-3.2.2-cp311-cp311-manylinux1_x86_64.whl.metadata (8.8 kB)
Collecting paddlex<3.4.0,>=3.3.0 (from paddlex[ocr-core]<3.4.0,>=3.3.0->paddleocr)
  Downloading paddlex-3.3.11-py3-none-any.whl.metadata (79 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m79.9/79.9 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Collecting astor (from paddlepaddle-gpu)
  Downloading astor-0.8.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting opt-einsum==3.3.0 (f

In [11]:
import json
from pathlib import Path
from typing import Dict, List, Optional


class DocLayNetFinancialLoader:
    """
    Loader for DocLayNet (Kaggle) with financial_reports filtering.

    Dataset structure:
      DocLayNet_core/
        ‚îú‚îÄ‚îÄ COCO/train.json
        ‚îú‚îÄ‚îÄ PNG/*.png
    """

    def __init__(
        self,
        base_path: str = "/kaggle/input/doclaynet",
        split: str = "train",
        max_docs: int = 1000,
        cache_name: Optional[str] = None,
    ):
        self.base_path = Path(base_path)
        self.split = split
        self.max_docs = max_docs

        if cache_name is None:
            cache_name = f"doclaynet_financial_{max_docs}.json"

        self.cache_path = Path("/kaggle/working") / cache_name

        self.coco_json_path = (
            self.base_path / "DocLayNet_core" / "COCO" / f"{split}.json"
        )
        self.image_dir = self.base_path / "DocLayNet_core" / "PNG"

        self.data: Optional[Dict] = None

    # --------------------------------------------------
    # Public API
    # --------------------------------------------------
    def load(self) -> Dict:
        if self.cache_path.exists():
            print(f"[DocLayNet] Loading cached subset: {self.cache_path}")
            self.data = self._load_json(self.cache_path)
        else:
            print("[DocLayNet] Cache not found ‚Äî creating subset...")
            self.data = self._build_and_cache_subset()

        return self.data

    def get_image_paths(self) -> List[Path]:
        if self.data is None:
            raise RuntimeError("Call load() first.")

        return [
            self.image_dir / img["file_name"]
            for img in self.data["images"]
        ]

    # --------------------------------------------------
    # Internal
    # --------------------------------------------------
    def _build_and_cache_subset(self) -> Dict:
        coco = self._load_json(self.coco_json_path)

        # 1Ô∏è‚É£ Filter financial report pages
        financial_images = [
            img for img in coco["images"]
            if img.get("doc_category") == "financial_reports"
        ]

        if not financial_images:
            raise ValueError("No financial_reports images found!")

        # 2Ô∏è‚É£ Limit dataset size
        financial_images = financial_images[: self.max_docs]
        image_ids = {img["id"] for img in financial_images}

        # 3Ô∏è‚É£ Filter annotations
        financial_annotations = [
            ann for ann in coco["annotations"]
            if ann["image_id"] in image_ids
        ]

        subset = {
            "images": financial_images,
            "annotations": financial_annotations,
            "categories": coco["categories"],
        }

        # 4Ô∏è‚É£ Cache
        self._save_json(subset, self.cache_path)

        print(
            f"[DocLayNet] Saved {len(financial_images)} images and "
            f"{len(financial_annotations)} annotations"
        )

        return subset

    # --------------------------------------------------
    @staticmethod
    def _load_json(path: Path) -> Dict:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _save_json(data: Dict, path: Path):
        with open(path, "w", encoding="utf-8") as f:
            json.dump(data, f)


In [15]:
loader = DocLayNetFinancialLoader(
    base_path="/kaggle/input/doclaynet",
    split="train",
    max_docs=1000,
)

coco_subset = loader.load()


# CHECKING 
print("Images:", len(coco_subset["images"]))
print("Annotations:", len(coco_subset["annotations"]))

# Check category correctness
print(set(img["doc_category"] for img in coco_subset["images"]))

# Check image path exists
path = loader.get_image_paths()[0]
print(path, path.exists())

[DocLayNet] Loading cached subset: /kaggle/working/doclaynet_financial_1000.json
Images: 1000
Annotations: 12607
{'financial_reports'}
/kaggle/input/doclaynet/DocLayNet_core/PNG/c6effb847ae7e4a80431696984fa90c98bb08c266481b9a03842422459c43bdd.png True


# 1 Testing table recognition

In [20]:
# Find Table category id
table_cat_id = next(
    c["id"] for c in coco_subset["categories"]
    if c["name"] == "Table"
)
# Image IDs that have at least one table annotation
table_image_ids = {
    ann["image_id"]
    for ann in coco_subset["annotations"]
    if ann["category_id"] == table_cat_id
}

len(table_image_ids)
table_images = [
    img for img in coco_subset["images"]
    if img["id"] in table_image_ids
]

len(table_images)
from pathlib import Path

BASE_PATH = Path("/kaggle/input/doclaynet/DocLayNet_core")
IMG_DIR = BASE_PATH / "PNG"

sample_img = table_images[0]
img_path = IMG_DIR / sample_img["file_name"]

print(img_path)
print(img_path.exists())


/kaggle/input/doclaynet/DocLayNet_core/PNG/c6effb847ae7e4a80431696984fa90c98bb08c266481b9a03842422459c43bdd.png
True


In [23]:
!pip install "paddlex[ocr]==<PADDLEX_VERSION>"

[31mERROR: Invalid requirement: 'paddlex[ocr]==<PADDLEX_VERSION>': Expected end or semicolon (after name and no valid version specifier)
    paddlex[ocr]==<PADDLEX_VERSION>
                ^[0m[31m
[0m

In [21]:
from paddleocr import PPStructureV3

pipeline = PPStructureV3(
    lang="en",                     # DocLayNet is English
    use_region_detection=True,
    use_table_recognition=True,
    use_chart_recognition=False,
    use_formula_recognition=False,
    use_seal_recognition=False,
    use_doc_orientation_classify=False,
    use_doc_unwarping=False,
    use_textline_orientation=False,
    device="cpu",                  # switch to "gpu:0" later
)


[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


RuntimeError: A dependency error occurred during pipeline creation. Please refer to the installation documentation to ensure all required dependencies are installed.

# 2 previous code version

In [None]:
import os
from datasets import load_dataset, load_from_disk

SAVE_DIR = "doclaynet_finance_test"

def get_finance_test_dataset():
    # 1Ô∏è‚É£ If dataset already saved locally ‚Üí load from disk
    if os.path.exists(SAVE_DIR):
        print(f"üìÇ Found existing dataset at: {SAVE_DIR}")
        ds = load_from_disk(SAVE_DIR)
        print("Loaded finance subset:", len(ds))
        return ds

    # 2Ô∏è‚É£ Otherwise ‚Üí download full test split and filter
    print("‚¨áÔ∏è Local dataset not found ‚Äî downloading DocLayNet test split...")
    doclaynet_test = load_dataset(
        "docling-project/DocLayNet-v1.1",
        split="test"
    )
    print("Total test pages:", len(doclaynet_test))

    print("üîç Filtering finance documents...")
    finance_test = doclaynet_test.filter(
        lambda row: row["metadata"].get("doc_category") == "financial_reports"
    )

    print("Finance pages found:", len(finance_test))

    print(f"üíæ Saving subset to: {SAVE_DIR}")
    finance_test.save_to_disk(SAVE_DIR)

    return finance_test


# ---- Use it ----
finance_test = get_finance_test_dataset()

# Example access
sample = finance_test[0]
print("Sample image size:", sample["image"].size)


In [None]:
import paddle
print(paddle.__version__)
print("CUDA:", paddle.device.is_compiled_with_cuda())
print("Device:", paddle.device.get_device())


In [1]:
from paddleocr import PPStructureV3

def build_ppstructurev3_for_layout(device: str = None) -> PPStructureV3:
    """
    Create a PPStructureV3 pipeline focused on layout detection.
    We disable heavy modules (tables, formulas, charts, seals) to save time.
    """
    kwargs = dict(
        # turn off optional heavy stuff
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        use_seal_recognition=False,
        use_table_recognition=False,
        use_formula_recognition=False,
        use_chart_recognition=False,
        use_region_detection=True,   # keep region detection ON
    )
    if device is not None:
        kwargs["device"] = device   # e.g. "gpu:0" or "cpu"

    pipeline = PPStructureV3(**kwargs)
    return pipeline

import numpy as np

def xywh_to_xyxy(box_xywh):
    x, y, w, h = box_xywh
    return np.array([x, y, x + w, y + h], dtype=float)

def iou_xyxy(box_a, box_b):
    """
    box_a, box_b: np.array [x1, y1, x2, y2]
    """
    ax1, ay1, ax2, ay2 = box_a
    bx1, by1, bx2, by2 = box_b

    inter_x1 = max(ax1, bx1)
    inter_y1 = max(ay1, by1)
    inter_x2 = min(ax2, bx2)
    inter_y2 = min(ay2, by2)

    inter_w = max(0.0, inter_x2 - inter_x1)
    inter_h = max(0.0, inter_y2 - inter_y1)
    inter_area = inter_w * inter_h

    area_a = max(0.0, (ax2 - ax1)) * max(0.0, (ay2 - ay1))
    area_b = max(0.0, (bx2 - bx1)) * max(0.0, (by2 - by1))

    union = area_a + area_b - inter_area
    if union <= 0:
        return 0.0
    return inter_area / union

def get_layout_boxes_from_result(result_obj):
    """
    Given a single PPStructureV3 Result object, return:
      list of predicted boxes: np.array [x1, y1, x2, y2]
      list of labels: str
      list of scores: float
    """
    j = result_obj.json  # dict, see PaddleOCR docs

    # Some versions wrap everything under 'res', some directly in the root.
    if "layout_det_res" in j:
        layout_dict = j["layout_det_res"]
    elif "res" in j and "layout_det_res" in j["res"]:
        layout_dict = j["res"]["layout_det_res"]
    else:
        # no layout detection result
        return [], [], []

    preds_xyxy = []
    pred_labels = []
    pred_scores = []

    for box in layout_dict.get("boxes", []):
        coords = box["coordinate"]  # [x1, y1, x2, y2]
        label = box.get("label", "")
        score = float(box.get("score", 0.0))
        preds_xyxy.append(np.array(coords, dtype=float))
        pred_labels.append(label)
        pred_scores.append(score)

    return preds_xyxy, pred_labels, pred_scores

def match_predictions_to_targets(pred_boxes, gt_boxes, iou_thresh=0.5):
    """
    Class-agnostic greedy matching.
    pred_boxes, gt_boxes: list of np.array [x1,y1,x2,y2]

    Returns:
        tp, fp, fn  (for this image)
    """
    if len(pred_boxes) == 0 and len(gt_boxes) == 0:
        return 0, 0, 0
    if len(pred_boxes) == 0:
        return 0, 0, len(gt_boxes)
    if len(gt_boxes) == 0:
        return 0, len(pred_boxes), 0

    gt_used = [False] * len(gt_boxes)
    tp = 0
    fp = 0

    for pb in pred_boxes:
        best_iou = 0.0
        best_gt_idx = -1
        for gi, gb in enumerate(gt_boxes):
            if gt_used[gi]:
                continue
            iou = iou_xyxy(pb, gb)
            if iou > best_iou:
                best_iou = iou
                best_gt_idx = gi

        if best_iou >= iou_thresh and best_gt_idx >= 0:
            tp += 1
            gt_used[best_gt_idx] = True
        else:
            fp += 1

    fn = sum(not u for u in gt_used)
    return tp, fp, fn

import numpy as np
from tqdm import tqdm

def evaluate_ppstructurev3_on_doclaynet(
    test_ds,
    pipeline,
    max_samples: int = None,
    iou_thresh: float = 0.5,
):
    """
    Class-agnostic layout detection evaluation on a DocLayNet-style dataset.

    Expected fields in each row:
      - 'image': PIL.Image
      - 'bboxes_block': list of [x, y, w, h] (GT)
    """
    total_tp = total_fp = total_fn = 0
    n = len(test_ds)
    if max_samples is not None:
        n = min(n, max_samples)

    for i in tqdm(range(n), desc="Evaluating PPStructureV3"):
        sample = test_ds[i]
        pil_img = sample["image"]           # PIL.Image

        # üî¥ IMPORTANT: convert to numpy.ndarray
        # PPStructureV3 accepts np.ndarray (H, W, C), typically BGR.
        img_np = np.array(pil_img.convert("RGB"))  # RGB
        # If you want strict BGR like cv2: uncomment the next line
        # img_np = img_np[:, :, ::-1]

        gt_bboxes_xywh = sample["bboxes"]
        gt_boxes_xyxy = [xywh_to_xyxy(b) for b in gt_bboxes_xywh]

        # Run PPStructureV3 on numpy.ndarray (not PIL)
        res_list = pipeline.predict(img_np)

        if not res_list:
            pred_boxes_xyxy = []
        else:
            result = res_list[0]
            pred_boxes_xyxy, pred_labels, pred_scores = get_layout_boxes_from_result(result)

        tp, fp, fn = match_predictions_to_targets(pred_boxes_xyxy, gt_boxes_xyxy, iou_thresh)
        total_tp += tp
        total_fp += fp
        total_fn += fn

    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0.0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    metrics = {
        "iou_thresh": iou_thresh,
        "tp": total_tp,
        "fp": total_fp,
        "fn": total_fn,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    return metrics


RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

RuntimeError: module compiled against ABI version 0x1000009 but this version of numpy is 0x2000000

ImportError: numpy.core.multiarray failed to import

In [None]:
from datasets import load_dataset

# 2) Build PPStructureV3
pipeline = build_ppstructurev3_for_layout(device="cpu")  # or "cpu"


# 3) Evaluate (optionally limit samples for quick sanity check)
metrics = evaluate_ppstructurev3_on_doclaynet(
    finance_test,
    pipeline,
    max_samples=20,   # None to use all pages
    iou_thresh=0.5,
)

print("PPStructureV3 on DocLayNet finance test:")
for k, v in metrics.items():
    print(f"{k}: {v}")
