In [None]:
from paddleocr import LayoutDetection, TableCellsDetection, TextDetection, TextRecognition




In [None]:
def get_layout_model():
    """Create the page-layout detector (PaddleOCR `LayoutDetection`, model "PP-DocLayoutV2")."""
    layout_detector = LayoutDetection(model_name="PP-DocLayoutV2")
    return layout_detector

def get_table_model():
    """Create the table-cell detector (PaddleOCR `TableCellsDetection`, model "RT-DETR-L_wired_table_cell_det")."""
    cell_detector = TableCellsDetection(model_name="RT-DETR-L_wired_table_cell_det")
    return cell_detector

def get_text_det_model():
    """Create the text-line detector (PaddleOCR `TextDetection`, model "PP-OCRv5_server_det") on the GPU device."""
    text_detector = TextDetection(model_name="PP-OCRv5_server_det", device="gpu")
    return text_detector

def get_text_rec_model():
    """Create the text recognizer (PaddleOCR `TextRecognition`, model "PP-OCRv5_mobile_rec") on the GPU device."""
    text_recognizer = TextRecognition(model_name="PP-OCRv5_mobile_rec", device="gpu")
    return text_recognizer




In [None]:
# Environment check: print the Paddle build and the device it will run on.
# The text detection/recognition factories above request device="gpu",
# so these values are worth confirming before the models are created.
import paddle

print("Paddle version:", paddle.__version__)
# Whether this Paddle build was compiled with CUDA support.
print("Is compiled with CUDA:", paddle.is_compiled_with_cuda())
# Number of CUDA devices Paddle reports.
print("CUDA device count:", paddle.device.cuda.device_count())
# Device Paddle currently selects.
print("Current device:", paddle.get_device())


In [None]:
# Page image used as the test input.
# NOTE(review): absolute, machine-specific path — change it before running elsewhere.
img_path = r"/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0005.jpg"
# Instantiate each model once; later cells reuse these module-level objects.
layout_model = get_layout_model()
table_model = get_table_model()
text_det_model = get_text_det_model()
text_rec_model = get_text_rec_model()



# Layout analysis
- Extract paragraph objects
- Merge paragraphs by y position

In [None]:
import cv2

def extract_patches(layout, img_path):
    """
    Crop every detected layout region out of the page image and merge each run
    of consecutive "text" regions into a single patch.

    Args:
        layout: layout prediction result; only ``layout[0]["boxes"]`` is read.
            Each box must provide "label", "coordinate" (x1, y1, x2, y2) and "score".
        img_path: path of the page image to crop from.

    Returns:
        list[dict]: one dict per output region, in layout order, with keys
            - "patch_id": index of the box in ``layout[0]["boxes"]``
              (index of the first box for a merged run)
            - "patch_label": the box label ("text" for a merged run)
            - "patch_positition": [x1, y1, x2, y2] as integers, page coordinates
              (key spelling kept as-is because process_document reads it)
            - "patch_score": the box score (highest score of a merged run)
            - "patch": the image crop for that region

    Raises:
        OSError: if the image at ``img_path`` cannot be read.
    """
    img = cv2.imread(img_path)
    # cv2.imread reports failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Cannot read image: {img_path}")

    def crop(position):
        # Clamp at 0: a negative start index would wrap around to the other
        # side of the image and yield an empty or wrong crop.
        x1, y1, x2, y2 = (max(int(v), 0) for v in position)
        return img[y1:y2, x1:x2]

    # Build one patch per detected box.
    patches = []
    for idx, box in enumerate(layout[0]["boxes"]):
        position = [int(c) for c in box["coordinate"]]
        patches.append({
            "patch_id": idx,
            "patch_label": box["label"],
            "patch_positition": position,
            "patch_score": box["score"],
            "patch": crop(position),
        })

    def merge_text_patches(patch_group):
        # The merged region is the bounding box enclosing every box in the run.
        xs = [x for p in patch_group for x in (p["patch_positition"][0], p["patch_positition"][2])]
        ys = [y for p in patch_group for y in (p["patch_positition"][1], p["patch_positition"][3])]
        new_position = [min(xs), min(ys), max(xs), max(ys)]

        return {
            "patch_id": patch_group[0]["patch_id"],
            "patch_label": "text",
            "patch_positition": new_position,
            "patch_score": max(p["patch_score"] for p in patch_group),
            "patch": crop(new_position),
        }

    # Merge runs of consecutive text patches; other labels pass through unchanged.
    finally_patches = []
    pending_text = []

    for patch in patches:
        if patch["patch_label"] == "text":
            pending_text.append(patch)
            continue
        if pending_text:
            finally_patches.append(merge_text_patches(pending_text))
            pending_text = []
        finally_patches.append(patch)

    # Flush a text run that reaches the end of the page.
    if pending_text:
        finally_patches.append(merge_text_patches(pending_text))

    return finally_patches
# patch_id

# Sort the polygon list

# Warp polygons to patches

In [None]:
import cv2
import numpy as np

def _order_quad_corners(points):
    """Order four corner points as top-left, top-right, bottom-right, bottom-left."""
    points = np.asarray(points, dtype=np.float32)
    center = points.mean(axis=0)
    # With the image y axis pointing down, ascending angle around the centre
    # walks the corners clockwise on screen.
    angles = np.arctan2(points[:, 1] - center[1], points[:, 0] - center[0])
    ring = points[np.argsort(angles)]
    # Start the ring at the corner closest to the image origin (smallest x + y).
    start = int(np.argmin(ring.sum(axis=1)))
    return np.roll(ring, -start, axis=0)


def warp_polys_to_patches(text_det):
    """
    Sort detected text polygons top-to-bottom and warp each one into an
    axis-aligned rectangular patch.

    Four-point polygons are used as given. Any other point count is replaced
    by its minimum-area enclosing rectangle, so the warp always gets four
    distinct, consistently ordered corners. Polygons whose extent is under
    2 pixels in either direction are skipped, because the destination
    rectangle would collapse and the warp would be degenerate.

    Args:
        text_det: text-detection result; ``text_det[0]["dt_polys"]`` holds the
            polygons and ``text_det[0]["input_img"]`` the image they refer to.

    Returns:
        list[dict]: one dict per kept polygon, ordered by mean y, with keys
            - "patch": the warped image patch
            - "bboxes": [x_min, y_min, x_max, y_max] of the polygon after the
              warp, i.e. in patch coordinates
            - "poly": the warped polygon as integer points
    """
    list_poly = text_det[0]["dt_polys"]
    img_input = text_det[0]["input_img"]

    # Reading order: sort by the vertical centre of each polygon. A stable
    # sort keeps detection order for polygons on the same line.
    centers_y = np.array([np.mean(np.array(poly)[:, 1]) for poly in list_poly])
    sorted_idx = np.argsort(centers_y, kind="stable")

    warped_patches = []

    for idx in sorted_idx:
        poly = np.array(list_poly[idx], dtype=np.float32)

        if len(poly) == 4:
            src_pts = poly
        else:
            # Taking the first four hull vertices would cover only part of the
            # region; the minimum-area rectangle encloses all of it.
            src_pts = _order_quad_corners(cv2.boxPoints(cv2.minAreaRect(poly)))

        min_x, min_y = src_pts.min(axis=0)
        max_x, max_y = src_pts.max(axis=0)
        width = int(max_x - min_x)
        height = int(max_y - min_y)
        if width < 2 or height < 2:
            continue

        dst_pts = np.array(
            [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]],
            dtype=np.float32,
        )

        M = cv2.getPerspectiveTransform(src_pts, dst_pts)
        warped = cv2.warpPerspective(img_input, M, (width, height))

        poly_warped = cv2.perspectiveTransform(src_pts.reshape(-1, 1, 2), M).reshape(-1, 2)
        min_x_w, min_y_w = poly_warped.min(axis=0)
        max_x_w, max_y_w = poly_warped.max(axis=0)
        bbox = [int(min_x_w), int(min_y_w), int(max_x_w), int(max_y_w)]

        warped_patches.append({
            "patch": warped,
            "bboxes": bbox,
            "poly": poly_warped.astype(int)
        })

    return warped_patches





# Text recognition

In [None]:

def recognize_text_from_patches(warped_patches, text_rec_model, batch_size=1):
    """
    Run text recognition on every warped patch and collect the results.

    All patches are passed to the model in a single ``predict`` call so that
    ``batch_size`` actually controls how many images are processed together.
    With the default ``batch_size=1`` each image is still recognized on its own.

    Args:
        warped_patches: list of dicts, each with at least 'patch' (the image)
            and 'bboxes' (its box).
        text_rec_model: text recognition model exposing
            ``predict(images, batch_size=...)`` and returning one result per
            image, in input order.
        batch_size: batch size forwarded to ``predict``.

    Returns:
        list[dict]: one dict per patch, in input order, with keys
            - 'text': recognized text ("rec_text")
            - 'bbox': the patch's 'bboxes' value
            - 'score': recognition score ("rec_score")
            - 'front': the "vis_font" value of the result (key spelling kept
              for compatibility with existing consumers)
    """
    # Nothing to recognize: do not call the model with an empty batch.
    if not warped_patches:
        return []

    images = [patch_dict["patch"] for patch_dict in warped_patches]
    predictions = text_rec_model.predict(images, batch_size=batch_size)

    return [
        {
            "text": prediction["rec_text"],
            "bbox": patch_dict["bboxes"],
            "score": prediction["rec_score"],
            "front": prediction["vis_font"],
        }
        for patch_dict, prediction in zip(warped_patches, predictions)
    ]




# Table processing

In [None]:
def poly_to_bbox(poly):
    """
    Return the axis-aligned bounding box of an (N, 2) polygon.

    Args:
        poly: sequence of (x, y) points.

    Returns:
        tuple: (x_min, y_min, x_max, y_max)
    """
    points = np.array(poly)
    xs, ys = points[:, 0], points[:, 1]
    return xs.min(), ys.min(), xs.max(), ys.max()


def extract_table_text_from_patch(table_patch, table_model, text_det_model, text_rec_model):
    """
    Extract the text of every table cell found in an image patch.

    For each detected cell the cell is cropped, text lines are detected inside
    it, and every line is recognized. Lines within a cell are processed
    top-to-bottom (then left-to-right). Cells or lines whose crop is empty are
    skipped instead of being sent to the models.

    Args:
        table_patch (np.array): image patch containing the table.
        table_model: table cell detection model.
        text_det_model: model that detects text lines inside a cell.
        text_rec_model: model that recognizes text.

    Returns:
        list[dict]: one entry per recognized text line with keys
            - "cell_bbox": [x_min, y_min, x_max, y_max] of the owning cell, as
              returned by the table model (not clamped)
            - "text": recognized text ("rec_text")
            - "score": recognition score ("rec_score")
            - "front": the "vis_font" value of the recognition result
    """
    result = []

    cells = table_model.predict(table_patch)

    for cell in cells[0]["boxes"]:
        x_min, y_min, x_max, y_max = cell["coordinate"]
        bbox_cell = [x_min, y_min, x_max, y_max]

        # Clamp at 0: a negative start index would wrap around in the slice.
        cx1, cy1, cx2, cy2 = (max(int(v), 0) for v in bbox_cell)
        cell_patch = table_patch[cy1:cy2, cx1:cx2]
        if cell_patch.size == 0:
            continue

        line_text_det = text_det_model.predict(cell_patch)
        line_img = line_text_det[0]["input_img"]

        # Order the lines for reading: by top edge, then by left edge.
        line_boxes = sorted(
            (poly_to_bbox(poly=poly) for poly in line_text_det[0]["dt_polys"]),
            key=lambda box: (box[1], box[0]),
        )

        for line_box in line_boxes:
            lx1, ly1, lx2, ly2 = (max(int(v), 0) for v in line_box)
            line_patch = line_img[ly1:ly2, lx1:lx2]
            if line_patch.size == 0:
                continue

            text_info = text_rec_model.predict(line_patch)[0]

            result.append({
                "cell_bbox": bbox_cell,
                "text": text_info["rec_text"],
                "score": text_info["rec_score"],
                "front": text_info["vis_font"]
            })

    return result
import numpy as np
def reconstruct_table_from_result(result, y_threshold=5):
    """
    Rebuild a 2D table from the output of extract_table_text_from_patch.

    Entries that share the same ``cell_bbox`` are text lines of one cell, so
    their texts are joined with a single space into one cell value instead of
    becoming separate columns. Cells are then grouped into rows: a cell starts
    a new row when its top edge lies more than ``y_threshold`` below the top
    edge of the first cell of the current row. Each row is ordered
    left-to-right.

    Args:
        result (list[dict]): entries with "cell_bbox" (x_min, y_min, x_max, y_max)
            and "text".
        y_threshold (int): maximum difference in y_min for cells to count as
            the same row.

    Returns:
        list[list[str]]: the table as rows of cell texts.
    """
    # Merge the text lines of each cell, keeping first-seen order of lines.
    texts_by_cell = {}
    for item in result:
        texts_by_cell.setdefault(tuple(item["cell_bbox"]), []).append(item["text"])

    # Sort cells by (y_min, x_min).
    cells = sorted(
        ((bbox, " ".join(texts)) for bbox, texts in texts_by_cell.items()),
        key=lambda cell: (cell[0][1], cell[0][0]),
    )

    rows = []
    row_anchor_y = None  # y_min of the first cell of the current row

    for bbox, text in cells:
        x_min, y_min = bbox[0], bbox[1]
        # A cell too far below the row anchor opens a new row.
        if row_anchor_y is None or y_min - row_anchor_y > y_threshold:
            rows.append([])
            row_anchor_y = y_min
        rows[-1].append((x_min, text))  # keep x_min to order the columns

    # Order each row by x_min and keep only the text.
    return [
        [text for _, text in sorted(row, key=lambda cell: cell[0])]
        for row in rows
    ]



In [None]:
import pandas as pd

def table_2d_to_df(table_2d):
    """
    Convert a 2D table into a pandas DataFrame.

    Rows shorter than the widest row are padded on the right with empty
    strings. An empty table yields an empty DataFrame.

    Args:
        table_2d (list[list[str]]): table as rows of cell texts.

    Returns:
        pd.DataFrame
    """
    # Widest row; default=0 keeps an empty table from raising ValueError.
    max_cols = max((len(row) for row in table_2d), default=0)

    # Pad short rows with empty cells so every row has the same length.
    normalized_table = [list(row) + [""] * (max_cols - len(row)) for row in table_2d]

    return pd.DataFrame(normalized_table)


In [112]:
def df_to_list_of_dict(df):
    """
    Convert a pandas DataFrame into a list of dicts.

    Args:
        df (pd.DataFrame): the table.

    Returns:
        list[dict]: one dict per row, keyed by column label.
    """
    return df.to_dict(orient="records")


In [None]:
def process_document(
    img_path,
    layout_model,
    text_det_model,
    text_rec_model,
    table_model
):
    """
    Run the full document pipeline on one page image.

    Steps: detect the layout, crop and merge the layout regions, then process
    each region by type. "table" regions go through cell detection and table
    reconstruction; every other region goes through text detection, warping
    and recognition. Regions whose crop is empty, and tables in which nothing
    was found, get an empty result instead of being passed further down.

    Args:
        img_path: path of the page image.
        layout_model: layout detection model.
        text_det_model: text detection model.
        text_rec_model: text recognition model.
        table_model: table cell detection model.

    Returns:
        list[dict]: one entry per region with keys "patch_id", "patch_label",
        "patch_position", "patch_score" and "text_dict". "text_dict" is a list
        of row dicts for tables and a list of recognized-line dicts otherwise.
    """
    # 1. Layout detection
    layout_result = layout_model.predict(
        img_path,
        batch_size=1,
        layout_nms=True
    )

    # 2. Crop layout regions and merge consecutive text regions
    merged_patches = extract_patches(layout_result, img_path)

    document_paragraphs = []

    for patch_info in merged_patches:
        patch_img = patch_info["patch"]
        patch_type = patch_info["patch_label"]

        # 3. Process by patch type
        if patch_img.size == 0:
            # A degenerate box gives an empty crop; there is nothing to read.
            recognized_text = []

        elif patch_type == "table":
            table_result = extract_table_text_from_patch(table_patch=patch_img,
                                                         table_model=table_model,
                                                         text_det_model=text_det_model,
                                                         text_rec_model=text_rec_model)
            table_2d = reconstruct_table_from_result(table_result)
            if table_2d:
                df = table_2d_to_df(table_2d)
                recognized_text = df_to_list_of_dict(df)
            else:
                # No cell text found: skip the DataFrame conversion.
                recognized_text = []

        else:
            det_result = text_det_model.predict(
                patch_img,
                batch_size=1
            )

            rectified_patches = warp_polys_to_patches(
                det_result
            )

            recognized_text = recognize_text_from_patches(
                rectified_patches,
                text_rec_model
            )

        # 4. Collect result
        document_paragraphs.append({
            "patch_id": patch_info["patch_id"],
            "patch_label": patch_type,
            "patch_position": patch_info["patch_positition"],
            "patch_score": patch_info["patch_score"],
            "text_dict": recognized_text
        })

    return document_paragraphs






In [114]:
# NOTE(review): same absolute, machine-specific path as the model setup cell — adjust before running elsewhere.
img_path = "/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0005.jpg"


# Run the full pipeline on the test page, reusing the models created earlier.
document_paragraphs = process_document(
    img_path=img_path,
    layout_model=layout_model,
    text_det_model=text_det_model,
    text_rec_model=text_rec_model,
    table_model=table_model
)



[{'cell_bbox': [np.float32(6.2295895), np.float32(217.62062), np.float32(328.89517), np.float32(258.95178)], 'text': 'Convolution', 'score': 0.99945467710495, 'front': <paddlex.utils.fonts.Font object at 0x7b62eb911150>}, {'cell_bbox': [np.float32(6.253868), np.float32(258.80533), np.float32(328.8996), np.float32(300.03705)], 'text': 'MaxPooling', 'score': 0.9996495246887207, 'front': <paddlex.utils.fonts.Font object at 0x7b62eb911150>}, {'cell_bbox': [np.float32(6.068519), np.float32(176.4237), np.float32(328.90393), np.float32(217.78813)], 'text': 'Map-to-Sequence', 'score': 0.9996843338012695, 'front': <paddlex.utils.fonts.Font object at 0x7b62eb911150>}, {'cell_bbox': [np.float32(6.3051014), np.float32(340.86072), np.float32(328.90775), np.float32(382.14307)], 'text': 'Convolution', 'score': 0.9994373321533203, 'front': <paddlex.utils.fonts.Font object at 0x7b62eb911150>}, {'cell_bbox': [np.float32(6.1158514), np.float32(423.12305), np.float32(328.86877), np.float32(464.31818)], 't

In [115]:
# Display the pipeline result: one entry per layout patch returned by process_document.
document_paragraphs

[{'patch_id': 0,
  'patch_label': 'text',
  'patch_position': [195, 305, 1203, 1301],
  'patch_score': 0.9885912537574768,
  'text_dict': [{'text': 'where yi is the sequence produced by the recurrent and con-',
    'bbox': [0, 0, 988, 43],
    'score': 0.9841119050979614,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'volutional layers from Ii. This objective function calculates',
    'bbox': [0, 0, 991, 47],
    'score': 0.982840359210968,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'a cost value directly from an image and its ground truth',
    'bbox': [0, 0, 991, 43],
    'score': 0.9845973253250122,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'label sequence. Therefore, the network can be end-to-end',
    'bbox': [0, 0, 991, 45],
    'score': 0.9842114448547363,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'trained on pairs of images and sequences, eliminating the',
    'bbo

# data