In [None]:
import numpy as np
import matplotlib.pyplot as plt
import cv2

from models import OCRModels
from document_process import DocumentProcessor

# ======================
# 1. OCR
# ======================

# Build the OCR model bundle on the GPU and hand it to the document processor.
models = OCRModels(device="gpu")
doc_processor = DocumentProcessor(models)

# NOTE(review): absolute, machine-specific path — this cell only runs where this file exists.
img_path = "/media/tom/Code/pcb_defect/ProdVision_Server/chats/outputs/image copy.png"
# A later cell indexes the result as document_paragraphs[0]["patch_position"] / ["text_dict"],
# so this is presumably a sequence of dicts with those keys — confirm against DocumentProcessor.
document_paragraphs = doc_processor.process_document(img_path)

# Bare last expression: display the processing result as the cell output.
document_paragraphs

In [None]:
import cv2
import numpy as np

def draw_text_patch_cv2(
    patch_position,
    text_dict,
    font=cv2.FONT_HERSHEY_SIMPLEX,
    font_scale=0.7,
    thickness=1,
    text_color=(0, 0, 0),
    padding=(10, 30),
    line_spacing=30
):
    """
    Draw text lines onto a white canvas sized from patch_position using OpenCV.

    Args:
        patch_position (list): [x_min, y_min, x_max, y_max]; values may be ints
            or floats (e.g. np.float32) and are rounded to whole pixels.
        text_dict (list): list of dicts; the value under key 'text' is drawn,
            one entry per line. Missing or None text yields an empty line.
        font: cv2 font face
        font_scale (float): font scale factor
        thickness (int): stroke thickness of the glyphs
        text_color (tuple): text color (B, G, R)
        padding (tuple): (x_padding, y_start) — x offset of every line and the
            baseline y of the first line
        line_spacing (int): vertical distance between consecutive baselines

    Returns:
        img (np.ndarray): uint8 array of shape (height, width, 3) with a white
            background and the text drawn on it.

    Note:
        cv2.putText only supports the limited Hershey glyph set; characters it
        cannot render (e.g. Vietnamese diacritics) are replaced by '?'.
    """

    # Detector boxes are often floats; array dimensions must be plain ints.
    x_min, y_min, x_max, y_max = (int(round(float(v))) for v in patch_position)
    width = x_max - x_min
    height = y_max - y_min

    # white background
    img = np.full((height, width, 3), 255, dtype=np.uint8)

    x_pad, y = padding

    for item in text_dict:
        text = item.get("text", "")
        # putText requires a str; tolerate None / numeric values from upstream.
        text = "" if text is None else str(text)
        cv2.putText(
            img,
            text,
            (x_pad, y),
            font,
            font_scale,
            text_color,
            thickness,
            cv2.LINE_AA
        )
        # Advance the baseline even for empty lines so spacing stays regular.
        y += line_spacing

    return img


In [None]:
import cv2
import numpy as np

def draw_table_patch_cv2(
    patch_position,
    text_dict,
    font=cv2.FONT_HERSHEY_SIMPLEX,
    font_scale=0.6,
    thickness=1,
    text_color=(0, 0, 0),
    line_color=(0, 0, 0),
    cell_padding=8
):
    """
    Draw a table (uniform grid plus cell text) onto a white canvas using OpenCV.

    Args:
        patch_position (list): [x_min, y_min, x_max, y_max]; values may be ints
            or floats (e.g. np.float32) and are rounded to whole pixels.
        text_dict (list): list of rows, each row a dict {col_idx: text} with
            integer column indices.
        font: cv2 font face
        font_scale (float): font scale factor
        thickness (int): stroke thickness of the glyphs
        text_color (tuple): text color (BGR)
        line_color (tuple): color of the grid lines
        cell_padding (int): horizontal offset of the text from the left edge
            of its cell

    Returns:
        img (np.ndarray): uint8 array of shape (height, width, 3). When the
            table has no rows or no columns, or the patch has zero area, the
            plain white canvas is returned.

    Note:
        cv2.putText only supports the limited Hershey glyph set; characters it
        cannot render are replaced by '?'.
    """

    # Detector boxes are often floats; array dimensions must be plain ints.
    x_min, y_min, x_max, y_max = (int(round(float(v))) for v in patch_position)
    width = x_max - x_min
    height = y_max - y_min

    # white background
    img = np.full((height, width, 3), 255, dtype=np.uint8)

    # number of rows / columns; default=-1 makes empty rows and an empty table
    # count as zero columns instead of raising inside max()
    rows = len(text_dict)
    cols = max((max(row.keys(), default=-1) for row in text_dict), default=-1) + 1

    # Nothing to lay out (and avoid dividing by zero below).
    if rows == 0 or cols == 0 or img.size == 0:
        return img

    row_h = height // rows
    col_w = width // cols

    # draw grid; the closing lines are clamped to the last pixel because
    # y == height / x == width lies outside the image and would be clipped away
    for r in range(rows + 1):
        y = min(r * row_h, height - 1)
        cv2.line(img, (0, y), (width - 1, y), line_color, 1)

    for c in range(cols + 1):
        x = min(c * col_w, width - 1)
        cv2.line(img, (x, 0), (x, height - 1), line_color, 1)

    # draw the text of each cell
    for r, row in enumerate(text_dict):
        for c in range(cols):
            text = row.get(c, "")
            if not text:
                continue
            # putText requires a str; tolerate numeric cell values.
            text = str(text)

            # putText's origin is the bottom-left corner of the text, so shift
            # the baseline down by half the glyph height to centre it vertically.
            (_, text_h), _ = cv2.getTextSize(text, font, font_scale, thickness)
            x_text = c * col_w + cell_padding
            y_text = r * row_h + (row_h + text_h) // 2

            cv2.putText(
                img,
                text,
                (x_text, y_text),
                font,
                font_scale,
                text_color,
                thickness,
                cv2.LINE_AA
            )

    return img


In [None]:
# Render the first processed patch as a table image for visual inspection.
# NOTE(review): assumes document_paragraphs[0] is a table patch whose
# "text_dict" is a list of {col_idx: text} rows — confirm against the
# DocumentProcessor output before relying on this.
img = draw_table_patch_cv2(
    patch_position=document_paragraphs[0]["patch_position"],
    text_dict=document_paragraphs[0]["text_dict"]
)

# cv2.imwrite reports failure by returning False rather than raising,
# so check it explicitly instead of silently producing no file.
if not cv2.imwrite("table_patch.png", img):
    raise IOError("cv2.imwrite failed to write 'table_patch.png'")


In [1]:
from paddleocr import TableRecognitionPipelineV2

# Build the table-recognition pipeline with no constructor arguments (library defaults).
pipeline = TableRecognitionPipelineV2()
# Alternative constructor options, kept for reference only (not executed):
# pipeline = TableRecognitionPipelineV2(use_doc_orientation_classify=True)  # use the document orientation classification model
# pipeline = TableRecognitionPipelineV2(use_doc_unwarping=True)  # use the text image unwarping module
# pipeline = TableRecognitionPipelineV2(device="gpu")  # run model inference on the GPU



  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/UVDoc`.[0m
[32mCreating model: ('PP-DocLayout-L', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/PP-DocLayout-L`.[0m
[32mCreating model: ('PP-LCNet_x1_0_table_cls', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.

In [4]:
# Run the table-recognition pipeline on a single test page.
# NOTE(review): absolute, machine-specific input path — adjust before running elsewhere.
output = pipeline.predict("/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0008.jpg")
# Each result is printed and then exported through its save_to_* methods,
# all pointed at ./output/ (image, xlsx, html and json going by the method names).
for res in output:
    res.print()  # Print the predicted structured output
    res.save_to_img("./output/")
    res.save_to_xlsx("./output/")
    res.save_to_html("./output/")
    res.save_to_json("./output/")

[32m{'res': {'input_path': '/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0008.jpg', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_layout_detection': True, 'use_ocr_model': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 0}, 'layout_det_res': {'input_path': None, 'page_index': None, 'boxes': [{'cls_id': 2, 'label': 'text', 'score': 0.9867656230926514, 'coordinate': [np.float32(1329.9775), np.float32(242.50218), np.float32(2400.7954), np.float32(1077.8878)]}, {'cls_id': 2, 'label': 'text', 'score': 0.9853912591934204, 'coordinate': [np.float32(1318.7043), np.float32(1632.1577), np.float32(2410.1753), np.float32(2314.5957)]}, {'cls_id': 2, 'label': 'text', 'score': 0.9853912591934204, 'coordinate': [np.float32(179.79784), np.float32(1752.3248), np.float32(1229.8732), np.float32(2483.551)]}, {'cls_id': 2, 'label