In [1]:
from pdf2image import convert_from_path
from PIL import Image
from surya.layout import LayoutPredictor
from surya.texify import TexifyPredictor
import json
import re
import pprint

PDF_PATH = "test.pdf"

  Referenced from: <3FBC4DD9-431F-30F9-B747-F26A414408A9> /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torchvision/image.so
  warn(


In [2]:
def parse_bbox_data(input_text):
    """Parse the bounding box text data into a structured dictionary."""

    # Extract the main bboxes list using regex
    bbox_pattern = r"LayoutBox\(polygon=(\[\[.*?\]\]).*?confidence=([\d\.]+).*?label=\'(.*?)\'.*?position=(\d+).*?top_k=({.*?}).*?bbox=(\[.*?\])"
    bboxes_data = re.findall(bbox_pattern, input_text, re.DOTALL)

    # Extract image_bbox and sliced values
    image_bbox_match = re.search(r"image_bbox=(\[.*?\])", input_text)
    image_bbox = json.loads(image_bbox_match.group(1)) if image_bbox_match else None

    sliced_match = re.search(r"sliced=(True|False)", input_text)
    sliced = sliced_match.group(1) == "True" if sliced_match else None

    # Process each bounding box
    bboxes = []
    for polygon_str, confidence, label, position, top_k_str, bbox_str in bboxes_data:
        # Clean up and parse the data
        polygon = json.loads(polygon_str.replace("'", '"'))
        confidence = float(confidence)
        position = int(position)

        # Convert top_k string to dict
        top_k_str = top_k_str.replace("'", '"')
        top_k = json.loads(top_k_str)

        # Parse bbox
        bbox = json.loads(bbox_str)

        # Create the box dictionary
        box = {
            "polygon": polygon,
            "confidence": confidence,
            "label": label,
            "position": position,
            "top_k": top_k,
            "bbox": bbox,
        }
        bboxes.append(box)

    # Create final structure
    result = {"bboxes": bboxes}

    if image_bbox:
        result["image_bbox"] = image_bbox

    if sliced is not None:
        result["sliced"] = sliced

    return result

In [3]:
# Convert PDF pages to images
images = convert_from_path(PDF_PATH)
print(f"Number of pages {len(images)}")

layout_predictor = LayoutPredictor()
# Process each page image
# list of dict, one per image => one per page
layout_predictions = layout_predictor(images)

Number of pages 66
Loaded layout model s3://layout/2025_02_18 on device mps with dtype torch.float16


Recognizing layout: 100%|██████████| 17/17 [00:25<00:00,  1.47s/it]


In [4]:
pages = []
for i, prediction in enumerate(layout_predictions):
    # need a way to convert to json
    parsed = parse_bbox_data(str(prediction))
    pages.append(parsed)

    print(f"Page {i}: {pages[i]}")

Page 0: {'bboxes': [{'polygon': [[138.796875, 110.5576171875], [875.109375, 110.5576171875], [875.109375, 201.3662109375], [138.796875, 201.3662109375]], 'confidence': 0.9970703125, 'label': 'SectionHeader', 'position': 0, 'top_k': {'SectionHeader': 0.9970703125, 'Picture': 0.0008144378662109375, 'PageHeader': 0.000766754150390625, 'Text': 0.0007538795471191406, 'Figure': 0.00020194053649902344}, 'bbox': [138.796875, 110.5576171875, 875.109375, 201.3662109375]}, {'polygon': [[388.3359375, 281.28515625], [623.6015625, 281.28515625], [623.6015625, 317.4609375], [388.3359375, 317.4609375]], 'confidence': 0.99951171875, 'label': 'SectionHeader', 'position': 1, 'top_k': {'SectionHeader': 0.99951171875, 'Text': 0.00021886825561523438, 'Picture': 7.069110870361328e-05, 'PageHeader': 4.494190216064453e-05, 'Figure': 3.153085708618164e-05}, 'bbox': [388.3359375, 281.28515625, 623.6015625, 317.4609375]}, {'polygon': [[104.8359375, 356.7744140625], [904.1484375, 356.7744140625], [904.1484375, 383

In [11]:
# get math regions
math_regions = {}  # dictionary on page number
for i in range(len(pages)):
    page_content = pages[i]
    bboxes_page = list(page_content["bboxes"])
    for bbox in bboxes_page:
        if bbox.get("label") in ["TextInLineMath", "Equation"]:
            if math_regions.get(i) is None:
                math_regions[i] = []
            math_regions[i].append(bbox)
    if math_regions.get(i) is not None:
        print(f"Page {i}: {len(math_regions[i])}")

Page 12: 1
Page 13: 1
Page 14: 2
Page 18: 1
Page 28: 1
Page 29: 1
Page 30: 2
Page 31: 1
Page 32: 2
Page 33: 2
Page 35: 1
Page 36: 1
Page 37: 1
Page 39: 1
Page 43: 1
Page 44: 2
Page 45: 2
Page 52: 1
Page 53: 1
Page 54: 1
Page 55: 1
Page 56: 1
Page 64: 1
Page 65: 1


In [12]:
# Crop images based on bounding boxes
cropped_images = []
for page_index in math_regions.keys():
    page_image = images[page_index]
    page_regions = math_regions[page_index] # regions of a page
    print(page_regions)
    for region in page_regions:
        left, top, right, bottom = region["bbox"]
        cropped_image = page_image.crop((left, top, right, bottom))
        cropped_images.append(cropped_image)

# Save cropped images
for i, cropped_image in enumerate(cropped_images):
    cropped_image.save(f"cropped_image_{i}.png")

[{'polygon': [[362.25, 353.63671875], [642.3046875, 353.63671875], [642.3046875, 394.611328125], [362.25, 394.611328125]], 'confidence': 0.99951171875, 'label': 'Equation', 'position': 2, 'top_k': {'Equation': 0.99951171875, 'TextInlineMath': 0.00037550926208496094, 'SectionHeader': 0.00013387203216552734, 'Handwriting': 1.055002212524414e-05, 'Code': 5.245208740234375e-06}, 'bbox': [362.25, 353.63671875, 642.3046875, 394.611328125]}]
[{'polygon': [[92.77734375, 364.1572265625], [818.015625, 364.1572265625], [818.015625, 439.27734375], [92.77734375, 439.27734375]], 'confidence': 0.99267578125, 'label': 'Equation', 'position': 3, 'top_k': {'Equation': 0.99267578125, 'TextInlineMath': 0.0040740966796875, 'ListItem': 0.0025882720947265625, 'Handwriting': 0.00023615360260009766, 'Figure': 0.00020122528076171875}, 'bbox': [92.77734375, 364.1572265625, 818.015625, 439.27734375]}]
[{'polygon': [[92.53125, 364.1572265625], [818.5078125, 364.1572265625], [818.5078125, 442.23046875], [92.53125, 

**Math formula detection**

In [20]:
import glob

# get all cropped image files in the current folder (e.g., files starting with "cropped_image_")
image_paths = glob.glob("cropped_image_*.png")

predictor = TexifyPredictor()
results = []

for path in image_paths:
    image = Image.open(path)
    result = predictor([image])
    results.append(result)

Loaded texify model s3://texify/2025_02_18 on device mps with dtype torch.float16


Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.34s/it]
Texify inference: 100%|██████████| 1/1 [00:04<00:00,  4.79s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.48s/it]
Texify inference: 100%|██████████| 1/1 [00:10<00:00, 10.51s/it]
Texify inference: 100%|██████████| 1/1 [00:08<00:00,  8.88s/it]
Texify inference: 100%|██████████| 1/1 [00:07<00:00,  7.25s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.20s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.41s/it]
Texify inference: 100%|██████████| 1/1 [00:04<00:00,  5.00s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.26s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.01s/it]
Texify inference: 100%|██████████| 1/1 [00:03<00:00,  3.18s/it]
Texify inference: 100%|██████████| 1/1 [00:04<00:00,  4.21s/it]
Texify inference: 100%|██████████| 1/1 [00:04<00:00,  4.02s/it]
Texify inference: 100%|██████████| 1/1 [00:01<00:00,  1.53s/it]
Texify inference: 100%|██████████| 1/1 [

In [15]:
results

[[TexifyResult(text='<math display="block">p_{\\mathcal{A}}(t) = (t - \\lambda_1)^{\\eta_1} (t - \\lambda_2)^{\\eta_2} \\cdots (t - \\lambda_r)^{\\eta_r},</math>', confidence=0.98974609375)],
 [TexifyResult(text='<math display="block">P = (\\boldsymbol{u}_{11}^{\\sharp}\\boldsymbol{u}_{12}^{\\sharp}\\cdots\\boldsymbol{u}_{1n_1}^{\\sharp}\\cdots\\boldsymbol{u}_{r1}^{\\sharp}\\boldsymbol{u}_{r2}^{\\sharp}\\cdots\\boldsymbol{u}_{rn_r}^{\\sharp}),</math>', confidence=0.96435546875)],
 [TexifyResult(text='<math display="block">\n\\rho_{\\mathcal{A}}(t) = (t - \\lambda_1)^{\\eta_1} (t - \\lambda_2)^{\\eta_2} \\cdots (t - \\lambda_r)^{\\eta_r},\n</math>', confidence=0.98876953125)],
 [TexifyResult(text='<math display="block">\\begin{aligned} \\bullet \\ A &= \\begin{pmatrix} 3 & 2 & 0 \\\\ 2 & 2 & 2 \\\\ 0 & 2 & 1 \\end{pmatrix}. \\\\ \\bullet \\ A &= \\begin{pmatrix} 3 & 2 & 2 \\\\ 2 & 3 & -1 \\\\ 2 & -1 & 0 \\end{pmatrix}. \\\\ \\bullet \\ A &= \\begin{pmatrix} 1 & -3 & -1 \\\\ -3 & 1 & 1 \