In [1]:
from paddleocr import LayoutDetection, TableCellsDetection, TextDetection, TextRecognition




  from .autonotebook import tqdm as notebook_tqdm
[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [2]:
def get_layout_model():
    return LayoutDetection(model_name="PP-DocLayoutV2")

def get_table_model():
    return TableCellsDetection(model_name="RT-DETR-L_wired_table_cell_det")

def get_text_det_model():
    return TextDetection(device="gpu",model_name="PP-OCRv5_server_det")

def get_text_rec_model():
    return TextRecognition(device="gpu", model_name="PP-OCRv5_mobile_rec")




In [3]:
import paddle

print("Paddle version:", paddle.__version__)
print("Is compiled with CUDA:", paddle.is_compiled_with_cuda())
print("CUDA device count:", paddle.device.cuda.device_count())
print("Current device:", paddle.get_device())




Paddle version: 3.2.0
Is compiled with CUDA: True
CUDA device count: 1
Current device: gpu:0


In [4]:
img_path = r"/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0005.jpg"
layout_model = get_layout_model()
table_model = get_table_model()
text_det_model = get_text_det_model()
text_rec_model = get_text_rec_model()

# layout = layout_model.predict(img_path, batch_size=1, layout_nms=True)

# tables = table_model.predict(img_path, threshold=0.3, batch_size=1)

# text_det = text_det_model.predict(img_path, batch_size=1)

# text_lines = text_rec_model.predict(text_det, batch_size=1)





[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/PP-DocLayoutV2`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/RT-DETR-L_wired_table_cell_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/PP-OCRv5_server_det`.[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `/home/tom/.paddlex/official_models/PP-OCRv5_mobile_rec`.[0m


# analysis layout
- extract object paragrap
- merge paragrap by ( y)

In [5]:
import cv2

def extract_patches(layout, img_path):

    img = cv2.imread(img_path)
    data = layout[0]["boxes"]

    # Tạo tất cả patch
    patches = [{
        "patch_id": i,
        "patch_label": bbox["label"],
        "patch_positition": [int(c) for c in bbox["coordinate"]],
        "patch_score": bbox["score"],
        "patch": img[int(bbox["coordinate"][1]):int(bbox["coordinate"][3]),
                     int(bbox["coordinate"][0]):int(bbox["coordinate"][2])]
    } for i, bbox in enumerate(data)]

    # Hàm merge các patch text liên tiếp
    def merge_text_patches(patch_group):
        xs = [x for p in patch_group for x in (p["patch_positition"][0], p["patch_positition"][2])]
        ys = [y for p in patch_group for y in (p["patch_positition"][1], p["patch_positition"][3])]
        scores = [p["patch_score"] for p in patch_group]

        new_position = [min(xs), min(ys), max(xs), max(ys)]
        x1, y1, x2, y2 = map(int, new_position)
        merged_patch = img[y1:y2, x1:x2]

        return {
            "patch_id": patch_group[0]["patch_id"],
            "patch_label": "text",
            "patch_positition": new_position,
            "patch_score": max(scores),
            "patch": merged_patch
        }

    # Merge patch text liên tiếp
    finally_patches = []
    temp_patches = []

    for patch in patches:
        if patch["patch_label"] == "text":
            temp_patches.append(patch)
        else:
            if temp_patches:
                finally_patches.append(merge_text_patches(temp_patches))
                temp_patches = []
            finally_patches.append(patch)

    if temp_patches:
        finally_patches.append(merge_text_patches(temp_patches))

    return finally_patches
# patch_id

# sort list poly

# warped polys to patches

In [6]:
import cv2
import numpy as np

def warp_polys_to_patches(text_det):
    """
    Từ kết quả text detection, sort polygon theo trục y, warp từng polygon thành patch chữ nhật.
    
    Args:
        text_det: output của text_det_model.predict, dạng list/dict như text_det[0]
    
    Returns:
        warped_patches: list các dict, mỗi dict gồm:
            - 'patch': patch ảnh đã warp
            - 'bboxes': bbox trong patch
            - 'poly': polygon sau warp (int)
    """
    list_poly = text_det[0]["dt_polys"]
    img_input = text_det[0]["input_img"]

    centers_y = np.array([np.mean(np.array(poly)[:,1]) for poly in list_poly])
    sorted_idx = np.argsort(centers_y)
    list_poly_sorted = [list_poly[i] for i in sorted_idx]

    warped_patches = []

    for poly in list_poly_sorted:
        poly = np.array(poly, dtype=np.float32)

        if len(poly) > 4:
            hull = cv2.convexHull(poly).squeeze()
            src_pts = hull[:4] if hull.shape[0] >= 4 else hull
        else:
            src_pts = poly

        while src_pts.shape[0] < 4:
            src_pts = np.vstack([src_pts, src_pts[-1]])

        min_x, min_y = src_pts.min(axis=0)
        max_x, max_y = src_pts.max(axis=0)
        width = int(max_x - min_x)
        height = int(max_y - min_y)
        dst_pts = np.array([[0,0],[width-1,0],[width-1,height-1],[0,height-1]], dtype=np.float32)

        M = cv2.getPerspectiveTransform(src_pts, dst_pts)
        warped = cv2.warpPerspective(img_input, M, (width, height))

        poly_warped = cv2.perspectiveTransform(src_pts.reshape(-1,1,2), M).reshape(-1,2)
        min_x_w, min_y_w = poly_warped.min(axis=0)
        max_x_w, max_y_w = poly_warped.max(axis=0)
        bbox = [int(min_x_w), int(min_y_w), int(max_x_w), int(max_y_w)]

        warped_patches.append({
            "patch": warped,
            "bboxes": bbox,
            "poly": poly_warped.astype(int)
        })

    return warped_patches





# Text recognization

In [7]:

def recognize_text_from_patches(warped_patches, text_rec_model, batch_size=1):
    """
    Duyệt qua danh sách patch, predict text và trả về list dict gồm text, bbox, score, font.

    Args:
        warped_patches: list các dict, mỗi dict gồm 'patch', 'bboxes', 'poly'
        text_rec_model: model nhận dạng text
        batch_size: batch size khi predict

    Returns:
        text_patch: list dict, mỗi dict gồm 'text', 'bbox', 'score', 'front'
    """
    text_patch = []

    for patch_dict in warped_patches:
        img = patch_dict['patch']
        text_lines = text_rec_model.predict(img, batch_size=batch_size)

        result = {
            "text": text_lines[0]["rec_text"],
            "bbox": patch_dict['bboxes'],
            "score": text_lines[0]["rec_score"],
            "front": text_lines[0]["vis_font"]
        }

        text_patch.append(result)

    return text_patch




# Table processing

In [None]:
def extract_table_fast(patch, table_model,text_det_model, text_rec_model):
    

In [None]:

def process_document(
    img_path,
    layout_model,
    text_det_model,
    text_rec_model,
    table_model
):

    # 1. Layout detection
    layout_result = layout_model.predict(
        img_path,
        batch_size=1,
        layout_nms=True
    )

    # 2. Merge layout patches
    merged_patches = extract_patches(layout_result, img_path)

    document_paragraphs = []

    for patch_info in merged_patches:
        patch_img = patch_info["patch"]
        patch_idx = patch_info["patch_id"]
        patch_type = patch_info["patch_label"]
        patch_coords = patch_info["patch_positition"]
        patch_conf = patch_info["patch_score"]

        # 3. Process by patch type
        if patch_type == "table":
            table_result = extract_table_fast(
                img=patch_img,
                table_det_model=table_model,
                text_det_model = text_det_model,
                text_rec_model=text_rec_model
            )
            import json
            print(table_result)

            # table_result['text_dict'] = table_result['text_dict'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)


            recognized_text = df_to_json_label_value(table_result)
        else:
            det_result = text_det_model.predict(
                patch_img,
                batch_size=1
            )

            rectified_patches = warp_polys_to_patches(
                det_result
            )

            recognized_text = recognize_text_from_patches(
                rectified_patches,
                text_rec_model
            )

        # 4. Collect result
        paragraph_entry = {
            "patch_id": patch_idx,
            "patch_label": patch_type,
            "patch_position": patch_coords,
            "patch_score": patch_conf,
            "text_dict": recognized_text
        }

        document_paragraphs.append(paragraph_entry)

    return document_paragraphs






In [33]:
img_path = "/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0005.jpg"


# Access individual dimensions
document_paragraphs = process_document(
    img_path=img_path,
    layout_model=layout_model,
    text_det_model=text_det_model,
    text_rec_model=text_rec_model,
    table_model=table_model
)




                     0                        1                             2
0                 Type          Confi gurations                          None
1        Transcription                     None                          None
2   Bidirectional-LSTM        #hidden units:256                            TM
3   Bidirectional-LSTM        #hidden units:256                            TM
4      Map-to-Sequence                       ce                          None
5          Convolution                        #  #maps:512, k:2 × 2, s:1, p:0
6           MaxPooling        Window:1 × 2, s:2                          None
7   BatchNormalization                      ion                          None
8          Convolution                        #  #maps:512, k:3 × 3, s:1, p:1
9   BatchNormalization                      ion                          None
10         Convolution                        #  #maps:512, k:3 × 3, s:1, p:1
11          MaxPooling        Window:1 × 2, s:2                 

[{'patch_id': 0,
  'patch_label': 'text',
  'patch_position': [195, 305, 1203, 1301],
  'patch_score': 0.9885912537574768,
  'text_dict': [{'text': 'where yi is the sequence produced by the recurrent and con-',
    'bbox': [0, 0, 988, 43],
    'score': 0.9841119050979614,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'volutional layers from Ii. This objective function calculates',
    'bbox': [0, 0, 991, 47],
    'score': 0.982840359210968,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'a cost value directly from an image and its ground truth',
    'bbox': [0, 0, 991, 43],
    'score': 0.9845973253250122,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'label sequence. Therefore, the network can be end-to-end',
    'bbox': [0, 0, 991, 45],
    'score': 0.9842114448547363,
    'front': <paddlex.utils.fonts.Font at 0x7b62eb911150>},
   {'text': 'trained on pairs of images and sequences, eliminating the',
    'bbo

# data

In [12]:
from PIL import Image, ImageDraw, ImageFont

def patch_to_img_text_data(patch_size, list_text):
    """
    patch_size: tuple (W,H)
    list_text: list of dict, mỗi dict có keys: text, bbox, front (có thể bỏ qua front)
    """
    W, H = patch_size
    patch = Image.new("RGB", (W, H), color=(255, 255, 255))
    draw = ImageDraw.Draw(patch)
    
    for text_ in list_text:
        text = text_.get("text", "")
        bbox_text_line = text_.get("bbox", [0,0,0,0])
        x0, y0, x1, y1 = bbox_text_line
        
        # thử lấy font từ front nếu có, nếu không dùng default
        try:
            font = ImageFont.truetype("arial.ttf", max(10, y1-y0))
        except:
            font = ImageFont.load_default()
        
        # vẽ chữ ở x0, y0
        draw.text((x0, y0), text, fill=(0,0,0), font=font)
    
    return patch




In [13]:

def decode_to_image(patch_list, image_size=(3300, 2550, 3)):
    H, W, C = image_size
    img = Image.new("RGB", (W, H), color=(255, 255, 255))
    draw = ImageDraw.Draw(img)
    
    for patch in patch_list:
        text_dict = patch.get('text_dict')
        if not text_dict:
            continue
        
       
    
    return img


# usage
img = decode_to_image(document_paragraphs)
img.show()


In [14]:
# import cv2

# img = cv2.imread("/media/tom/Code/pcb_defect/ProdVision_Server/media/data_test/1507.05717v1_page-0005.jpg")

# # Get the shape as a tuple (height, width, channels)
# image_shape = img.shape
# print(f"Image Shape (H, W, C): {image_shape}")

In [15]:
# from ocr.ocr_service import OCRService

# img_path = "/media/tom/Code/pcb_defect/ProdVision_Server/chats/output/1507.05717v1_page-0005_res.jpg"

# ocr = OCRService(device="gpu")

# result = ocr.process(img_path)

# print(result)

Created TensorFlow Lite XNNPACK delegate for CPU.
