In [1]:
from PIL import ImageDraw, ImageFont, Image

def pdf_bbox_to_img_bbox(bbox_pdf, scale_x, scale_y):
    x0, y0, x1, y1 = bbox_pdf
    return (
        int(x0 / scale_x),
        int(y0 / scale_y),
        int(x1 / scale_x),
        int(y1 / scale_y),
    )

def visualize_page(page, viz_items, dpi=300, save_path=None):
    # ---- render page ----
    pix = page.get_pixmap(dpi=dpi)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    draw = ImageDraw.Draw(img)

    try:
        font = ImageFont.truetype("arial.ttf", 16)
    except:
        font = ImageFont.load_default()

    for item in viz_items:
        scale_x = item["scale_x"]
        scale_y = item["scale_y"]

        # ---- figure bbox ----
        fig_bbox = pdf_bbox_to_img_bbox(
            item["figure_bbox_pdf"], scale_x, scale_y
        )
        fig_color = "red" if item["figure_type"] == "Table" else "blue"

        draw.rectangle(fig_bbox, outline=fig_color, width=3)
        draw.text(
            (fig_bbox[0], fig_bbox[1] - 18),
            item["figure_type"],
            fill=fig_color,
            font=font,
        )

        if item["caption_bbox_pdf"] is not None:
            # ---- caption bbox ----
            cap_bbox = pdf_bbox_to_img_bbox(
                item["caption_bbox_pdf"], scale_x, scale_y
            )
    
            draw.rectangle(cap_bbox, outline="green", width=3)
    
            caption_preview = item["caption_text"][:60].replace("\n", " ")
            draw.text(
                (cap_bbox[0], cap_bbox[3] + 2),
                caption_preview,
                fill="green",
                font=font,
            )

    if save_path:
        img.save(save_path)

    return img



In [4]:
from app.pdf_agent import PDFTableExtractor
import fitz
from pathlib import Path

extractor = PDFTableExtractor(device="mps")

doc = fitz.open("data/downloads/manual_down/diffusion_policy.pdf")

for i, page in enumerate(doc):
    outs, viz_items = extractor.extract(page)
    if not viz_items:
        continue

    img = visualize_page(
        page,
        viz_items,
        dpi=extractor.dpi,
        save_path=f"data/extracted_tables/page_{i:03d}_viz.png",
    )

Downloaded to data/models/yolov11x_best.pt


Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at Fa


0: 640x512 3 List-items, 1 Page-header, 1 Picture, 1 Section-header, 8 Texts, 437.0ms
Speed: 3.0ms preprocess, 437.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 512)

Candidate 1
Text:
Fig. 1: Policy Representations. a) Explicit policy with different types of action representations. b) Implicit policy learns an energy function
conditioned on both action and observation and optimizes for actions that minimize the energy landscape c) Diffusion policy refines noise
into actions via a learned gradient field. This formulation provides stable training, allows the learned policy to accurately model multimodal
action distributions, and accommodates high-dimensional action sequences.
CLIP similarity : 0.3609
------------------------------------------------------------
Candidate 2
Text:
1 Columbia University
2 Toyota Research Institute
3 MIT
https://diffusion-policy.cs.columbia.edu
CLIP similarity : 0.2449
------------------------------------------------------------
Candidate 

Batches: 100%|██████████| 1/1 [00:00<00:00, 67.75it/s]



QUERY:
Which sentence is the title or caption describing the following table?

| pθ(a|o) | = e−Eθ |
| --- | --- |
|  | Z(o,θ) | 

Rank 1 | Score -0.5502
e−Eθ (o,a) +∑
Nneg
j=1e−Eθ (o,ea j) )
(7)
------------------------------------------------------------
Rank 2 | Score -3.5767
LinfoNCE = −log(
e−Eθ (o,a)
------------------------------------------------------------
Rank 3 | Score -8.0557
where Z(o,θ) is an intractable normalization constant (with
respect to a).
To train the EBM for implicit policy, an InfoNCE-style loss
function is used, which equates to the negative log-likelihood
of Eq 6:
------------------------------------------------------------

tensor([[0.2721, 0.7279]], device='mps:0')

0: 640x512 2 Pictures, 1 Section-header, 2 Tables, 8 Texts, 623.2ms
Speed: 3.1ms preprocess, 623.2ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 512)

Candidate 1
Text:
TABLE II: Behavior Cloning Benchmark (Visual Policy) Performance are reported in the same format as in Tab I. 

Batches: 100%|██████████| 1/1 [00:00<00:00, 60.98it/s]



QUERY:
Which sentence is the title or caption describing the following table?

|  | Lift ph | mh | Can ph | mh | Square ph | mh | Transport ph | mh | ToolHang ph | Push-T ph |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| LSTM-GMM [29] | 1.00/0.96 | 1.00/0.95 | 1.00/0.88 | 0.98/0.90 | 0.82/0.59 | 0.64/0.38 | 0.88/0.62 | 0.44/0.24 | 0.68/0.49 | 0.69/0.54 |
| IBC [12] | 0.94/0.73 | 0.39/0.05 | 0.08/0.01 | 0.00/0.00 | 0.03/0.00 | 0.00/0.00 | 0.00/0.00 | 0.00/0.00 | 0.00/0.00 | 0.75/0.64 |
| DiffusionPolicy-C | 1.00/1.00 | 1.00/1.00 | 1.00/0.97 | 1.00/0.96 | 0.98/0.92 | 0.98/0.84 | 1.00/0.93 | 0.89/0.69 | 0.95/0.73 | 0.91/0.84 | 

Rank 1 | Score -7.7686
TABLE II: Behavior Cloning Benchmark (Visual Policy) Performance are reported in the same format as in Tab I. LSTM-GMM numbers
were reproduced to get a complete evaluation in addition to the best checkpoint performance reported. Diffusion Policy shows consistent
performance improvement, especially for complex tasks 

Batches: 100%|██████████| 1/1 [00:00<00:00, 17.38it/s]



QUERY:
Which sentence is the title or caption describing the following table?

|  | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark | Simulation Benchmark |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Lift | 1 | 1 | 7 | 200 | 300 | 400 | Yes | No |
| Can | 1 | 1 | 7 | 200 | 300 | 400 | Yes | No |
| Square | 1 | 1 | 7 | 200 | 300 | 400 | Yes | Yes |
| Transport | 2 | 3 | 14 | 200 | 300 | 700 | Yes | No |
| ToolHang | 1 | 2 | 7 | 200 | 0 | 700 | Yes | Yes |
| Push-T | 1 | 1 | 2 | 200 | 0 | 300 | Yes | Yes |
| BlockPush | 1 | 2 | 2 | 0 | 0 | 350 | No | No |
| Kitchen | 1 | 7 | 9 | 656 | 0 | 280 | No | No |
| Realworld Benchmark | Realworld Benchmark | Realworld Benchmark | Rea 

Rank 1 | Score -6.4910
TABLE II: Behavior Cloning Benchmark (Visual Policy) Performance are reported in the same format as in Tab I. LSTM-GMM numbers
were reproduced to get a complete evalu

Batches: 100%|██████████| 1/1 [00:00<00:00, 68.97it/s]



QUERY:
Which sentence is the title or caption describing the following table?

|  | p1 | BlockPush p2 | p1 | p2 | Kitchen p3 | p4 |
| --- | --- | --- | --- | --- | --- | --- |
| LSTM-GMM [29] | 0.03 | 0.01 | 1.00 | 0.90 | 0.74 | 0.34 |
| IBC [12] | 0.01 | 0.00 | 0.99 | 0.87 | 0.61 | 0.24 |
| BET [42] | 0.96 | 0.71 | 0.99 | 0.93 | 0.71 | 0.44 |
| DiffusionPolicy-C | 0.36 | 0.11 | 1.00 | 1.00 | 1.00 | 0.99 |
| DiffusionPolicy-T | 0.99 | 0.94 | 1.00 | 0.99 | 0.99 | 0.96 | 

Rank 1 | Score -8.3419
TABLE IV: Multi-Stage Tasks (State Observation). For PushBlock,
px is the frequency of pushing x blocks into the targets. For Kitchen,
px is the frequency of interacting with x or more objects (e.g. bottom
burner). Diffusion Policy performs better, especially for difficult
metrics such as p2 for Block Pushing and p4 for Kitchen, as
demonstrated by our results.
------------------------------------------------------------
Rank 2 | Score -8.4842
end-effector (blue)s. Variation is added by random in

Batches: 100%|██████████| 1/1 [00:00<00:00, 69.17it/s]



QUERY:
Which sentence is the title or caption describing the following table?

|  | Demo | pos | vel | pos | vel | T-E2E | ImgNet | R3M |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| IoU | 0.84 | 0.14 | 0.19 | 0.24 | 0.25 | 0.53 | 0.24 | 0.66 |
| Succ % | 1.00 | 0.00 | 0.00 | 0.20 | 0.10 | 0.65 | 0.15 | 0.80 | 

Rank 1 | Score -9.5446
①
------------------------------------------------------------
Rank 2 | Score -9.5446
②
------------------------------------------------------------
Rank 3 | Score -9.5747
TABLE V: Realworld Push-T Experiment. a) Hardware setup. b)
Illustration of the task. The robot needs to 1⃝precisely push the T-
shaped block into the target region, and 2⃝move the end-effector to
the end-zone. c) The ground truth end state used to calculate IoU
metrics used in this table. Table: Success is defined by the end-
state IoU greater than the minimum IoU in the demonstration dataset.
Average episode duration presented in seconds. T-E2E stands for end-
to-end tra

Batches: 100%|██████████| 1/1 [00:00<00:00, 21.43it/s]



QUERY:
Which sentence is the title or caption describing the following table?

|  | IoU | Pour Succ | Spread Coverage | Succ |
| --- | --- | --- | --- | --- |
| Human | 0.79 | 1.00 | 0.79 | 1.00 |
| LSTM-GMM Diffusion Policy | 0.06 0.74 | 0.00 0.79 | 0.27 0.77 | 0.00 1.00 | 

Rank 1 | Score -9.6017
Fig. 11: Realworld Sauce Manipulation.
[Left] 6DoF pouring
Task. The robot needs to 1⃝dip the ladle to scoop sauce from the
bowl, 2⃝approach the center of the pizza dough, 3⃝pour sauce, and
4⃝lift the ladle to finish the task. [Right] Periodic spreading Task
The robot needs to 1⃝approach the center of the sauce with a grasped
spoon, 2⃝spread the sauce to cover pizza in a spiral pattern, and 3⃝
lift the spoon to finish the task.
------------------------------------------------------------
Rank 2 | Score -10.0026
3
------------------------------------------------------------
Rank 3 | Score -10.0222
2
------------------------------------------------------------

tensor([[0.0278, 0.9722]], devi

Batches: 100%|██████████| 1/1 [00:00<00:00, 83.14it/s]



QUERY:
Which sentence is the title or caption describing the following table?

| Lift | Pos | 2 | 8 | 10 | 9 | 22 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Can | Pos | 2 | 8 | 10 | 9 | 22 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| Square | Pos | 2 | 8 | 10 | 9 | 22 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| Transport | Pos | 2 | 8 | 10 | 9 | 45 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| ToolHang | Pos | 2 | 8 | 10 | 9 | 22 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| Push-T | Pos | 2 | 8 | 16 | 9 | 22 | 8 | 256 | 0.01 | 1e-4 | 1e-1 | 100 | 100 |
| Block Push | Vel | 3 | 1 | 5 | 9 | 0 | 8 | 256 | 0.3 | 1e-4 | 1e-3 | 100 | 100 |
| Kitchen | Pos | 4 | 8 | 16 | 80 | 0 | 8 | 768 | 0.1 | 1e-4 | 1e-3 | 10 

Rank 1 | Score -9.2736
TABLE VII: Hyperparameters for Transformer-based Diffusion Policy Ctrl: position or velocity control To: observation horizon Ta: action
horizon Tp: action prediction 

Batches: 100%|██████████| 1/1 [00:00<00:00, 89.57it/s]



QUERY:
Which sentence is the title or caption describing the following table?

| Lift | Pos | 2 | 8 | 16 | 2x84x84 | 2x76x76 | 256 | 22 | 1e-4 | 1e-6 | 100 | 100 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Can | Pos | 2 | 8 | 16 | 2x84x84 | 2x76x76 | 256 | 22 | 1e-4 | 1e-6 | 100 | 100 |
| Square | Pos | 2 | 8 | 16 | 2x84x84 | 2x76x76 | 256 | 22 | 1e-4 | 1e-6 | 100 | 100 |
| Transport | Pos | 2 | 8 | 16 | 4x84x85 | 4x76x76 | 264 | 45 | 1e-4 | 1e-6 | 100 | 100 |
| ToolHang | Pos | 2 | 8 | 16 | 2x240x240 | 2x216x216 | 256 | 22 | 1e-4 | 1e-6 | 100 | 100 |
| Push-T | Pos | 2 | 8 | 16 | 1x96x96 | 1x84x84 | 256 | 22 | 1e-4 | 1e-6 | 100 | 100 |
| Block Push | Pos | 3 | 1 | 12 | N/A | N/A | 256 | 0 | 1e-4 | 1e-6 | 100 | 100 |
| Kitchen | Pos | 2 | 8 | 16 | N/A | N/A 

Rank 1 | Score -9.0828
TABLE VI: Hyperparameters for CNN-based Diffusion Policy Ctrl: position or velocity control To: observation horizon Ta: action horizon
Tp: action prediction horizon I

In [8]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base")

# To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
# model.classifier.load_state_dict(torch.load("data/roberta_figure_diff.pt"))

for name, param in model.named_parameters():
    if "classifier" not in name:
        param.requires_grad_(False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import torch
from torch.utils.data import random_split, Dataset, DataLoader
import pandas as pd

class ClassifDataset(Dataset):
    def __init__(self):
        super().__init__()
        self.data = pd.read_csv("dataset_example.csv")

    def __getitem__(self, index):
        return self.data.loc[index]["text"], self.data.loc[index]["label"]
    
    def __len__(self):
        return len(self.data)

torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "mps" if torch.mps.is_available() else "cpu"

dataset = ClassifDataset()
train_data, test_data = random_split(dataset, [0.8, 0.2])

train = DataLoader(train_data, batch_size=32, shuffle=True)
test = DataLoader(test_data, batch_size=32, shuffle=False)

print(train, test)

for X, y in train:
    print(X, y)
    break

optim = torch.optim.AdamW(model.parameters(), lr=5e-4)

<torch.utils.data.dataloader.DataLoader object at 0x1113ec090> <torch.utils.data.dataloader.DataLoader object at 0x358f61990>
('REFERENCES', 'The following table presents research and development expenditures.', 'The table below shows cash and cash equivalents at period end.', 'Tourists gather near the Colosseum during peak summer travel season.', 'He continues to work part-time as an anesthesiologist while overseeing daily operations.', '15794', 'The clinic’s expansion followed months of regulatory review and internal planning. Management opted to focus on a narrow range of procedures to maximize efficiency. The model has since drawn interest from other practitioners considering similar ventures.', 'data not shown', 'Emissions By Industry', 'We have entered into operating and finance lease agreement primarily for data centers, land, and offices throughout the World', 'Figure VIII. Year-over-year growth of subscription revenue from 2018 to 2025.', 'The company’s decision to delay its p

In [10]:
import torch.nn.functional as F


model.to(device)
epochs = 90

ce = torch.nn.CrossEntropyLoss()

for i in range(epochs):
    model.train()

    epoch_loss = 0
    epoch_test_loss = 0
    for X, y in train:
        tokens = tokenizer(X, return_tensors="pt", padding=True).to(device)
        y = y.to(device)
        ypred = model(**tokens).logits
        
        loss = ce(ypred, y)

        optim.zero_grad()
        loss.backward()
        optim.step()

        epoch_loss += loss.item()

    model.eval()
    correct = 0
    total_dt = 0
    with torch.no_grad():
        for X_test, y_test in test:
            tokens = tokenizer(X_test, return_tensors="pt", padding=True).to(device)
            y_test = y_test.to(device)
            ypred_test = model(**tokens).logits

            loss = ce(ypred_test, y_test)
            epoch_test_loss += loss.item()
            correct += torch.sum((ypred_test.argmax(dim=1) == y_test).int())
            total_dt += len(X_test)

    print(f"loss = {epoch_loss / len(train)}", f"test loss = {epoch_test_loss / len(test)} accuracy = {correct / total_dt}")


loss = 0.7019778192043304 test loss = 0.6649491190910339 accuracy = 0.5476190447807312
loss = 0.6758884092171987 test loss = 0.6468484997749329 accuracy = 0.6904761791229248
loss = 0.6820089817047119 test loss = 0.6601446866989136 accuracy = 0.5476190447807312
loss = 0.6638355751832327 test loss = 0.6160564124584198 accuracy = 0.8095238208770752
loss = 0.6088594496250153 test loss = 0.6102079451084137 accuracy = 0.7142857313156128
loss = 0.6474902828534445 test loss = 0.601288765668869 accuracy = 0.738095223903656
loss = 0.6061618129412333 test loss = 0.5861169993877411 accuracy = 0.8333333134651184
loss = 0.5856964687506357 test loss = 0.5737299025058746 accuracy = 0.8333333134651184
loss = 0.5617215236028036 test loss = 0.577080488204956 accuracy = 0.8095238208770752
loss = 0.5263089587291082 test loss = 0.551364004611969 accuracy = 0.8333333134651184
loss = 0.5471353083848953 test loss = 0.5369485020637512 accuracy = 0.8333333134651184
loss = 0.5189899106820425 test loss = 0.5245751

In [11]:
model.to(device)
model.eval()
tokens = tokenizer([
    "wi = 1"
], return_tensors="pt", padding=True).to(device)

logits = model(**tokens).logits
torch.softmax(logits, dim=-1)

tensor([[0.5881, 0.4119]], device='mps:0', grad_fn=<SoftmaxBackward0>)

In [None]:
torch.save(model.classifier.state_dict(), "data/roberta_figure_diff.pt")

In [13]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base")

# inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# with torch.no_grad():
#     logits = model(**inputs).logits

# predicted_class_id = logits.argmax().item()
# model.config.id2label[predicted_class_id]

# To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
model = RobertaForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
model.classifier.load_state_dict(torch.load("data/roberta_figure_diff.pt"))



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [15]:
model.to(device)
model.eval()
tokens = tokenizer([
    "Financial Asset Loss per Annum"
], return_tensors="pt", padding=True).to(device)

logits = model(**tokens).logits
torch.softmax(logits, dim=-1)

tensor([[0.3877, 0.6123]], device='mps:0', grad_fn=<SoftmaxBackward0>)