In [1]:
import sys
import os
import pickle
from dataclasses import dataclass, asdict
import json
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import logging
from pathlib import Path
from tqdm import tqdm
import hashlib
# sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath("/Users/ymxu/Workspace/MuDocU/DOMAGENT/tests/test_subtree_emb.py"))))
from subtree_embedding import SubtreeEmbedder
# from src.core.dom_processor_v2 import TokenAwareDOMProcessor, DOMNode
from json_to_dom_processor import DOMNode
@dataclass
class EmbeddingResult:
    node_id: str
    embedding: np.ndarray
    metadata: Dict[str, Any]
    structure_info: Dict[str, Any]
    parent_id: Optional[str] = None
    children_ids: List[str] = None
    embedding_time: float = 0.0

Qwen2VL not available. Please install transformers and torch to use Qwen2VL.


In [48]:
config = {
        'text_model': 'openai/clip-vit-base-patch32',
        'image_model': 'openai/clip-vit-base-patch32',
        'table_model': 'openai/clip-vit-base-patch32',
        'aggregator': 'attention',
        'embedding_dim': 512,
        'pruning': {
            'tag_blacklist': ['script', 'style'],
            'max_tokens_per_chunk': 2000
        }
    }

In [49]:
# dom_processor = TokenAwareDOMProcessor(config)

In [50]:
import json
def _collect_all_nodes(root_node) -> List:
    """收集DOM树中的所有节点 - 兼容dict和DOMNode对象"""
    nodes = []

    def dfs_collect(node):
        # 处理字典格式的节点（JSON数据）
        if isinstance(node, dict):
            # 只处理有全局ID的节点
            if node.get('metadata', {}).get('global_id'):
                nodes.append(node)

            # 递归处理子节点
            children = node.get('children', [])
            for child in children:
                dfs_collect(child)

        # 处理DOMNode对象
        elif hasattr(node, 'metadata') and node.metadata.get('global_id'):
            nodes.append(node)

            # 递归处理子节点
            if hasattr(node, 'children'):
                for child in node.children:
                    dfs_collect(child)

    dfs_collect(root_node)
    return nodes

# html_file_path = './../data/Demo/8e7c4cb542ad160f80fb3d795ada35d8.html'
# tree, chunks, process_logs = dom_processor.process_html_file(html_file_path)
# tree = build_global_tree_from_html(html_file_path, enable_table_grouping=True)

# 加载JSON格式的DOM树
with open("./data/dom/MMLongBench-Doc/welcome-to-nus.json", "r") as f:
    tree = json.load(f)

nodes = _collect_all_nodes(root_node=tree)
print(f"找到 {len(nodes)} 个节点")

# 显示前几个节点的信息
for i, node in enumerate(nodes[:3]):
    if isinstance(node, dict):
        node_id = node.get('metadata', {}).get('global_id', 'unknown')
        tag = node.get('tag', 'unknown')
        element_type = node.get('metadata', {}).get('element_type', '')
        print(f"节点 {i}: {tag}({element_type}) - ID: {node_id}")
    else:
        print(f"节点 {i}: {node}")


找到 754 个节点
节点 0: document() - ID: doc_welcome-to-nus_54388
节点 1: p(paragraph) - ID: doc_welcome-to-nus_54389
节点 2: figure(figure) - ID: doc_welcome-to-nus_54390


In [51]:
nodes

[{'tag': 'document',
  'metadata': {'depth': 0,
   'page_id': None,
   'global_id': 'doc_welcome-to-nus_54388',
   'node_type': None,
   'document_name': 'welcome-to-nus.pdf',
   'page_count': 24,
   'source_type': 'pdf_json'},
  'children': [{'tag': 'p',
    'text': 'Come',
    'metadata': {'depth': 1,
     'page_id': None,
     'global_id': 'doc_welcome-to-nus_54389',
     'node_type': None,
     'original_index': 0,
     'parent_chapter': -1,
     'is_chapter_title': False,
     'element_type': 'paragraph',
     'page_number': 0,
     'bbox': [603.5, 21.0, 863.0, 134.5],
     'heading_level': None},
    'data-page': '0'},
   {'tag': 'figure',
    'text': 'NUS National University of Singapore',
    'metadata': {'depth': 1,
     'page_id': None,
     'global_id': 'doc_welcome-to-nus_54390',
     'node_type': None,
     'original_index': 1,
     'parent_chapter': -1,
     'is_chapter_title': False,
     'element_type': 'figure',
     'page_number': 0,
     'bbox': [22.5, 334.0, 111.5, 

In [52]:
print("版本信息:")
import torch, transformers
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")

embedder = SubtreeEmbedder(
    text_model="openai/clip-vit-base-patch32",      # 使用CLIP处理文本
    image_model="openai/clip-vit-base-patch32",     # 使用CLIP处理图像  
    table_model="openai/clip-vit-base-patch32",     # 使用CLIP处理表格
    aggregator="attention",
    embedding_dim=512,  # CLIP base的维度
    data_dir="./data/dom/MMLongBench-Doc"
)

print("✅ SubtreeEmbedder初始化成功（使用CLIP模型）")

版本信息:
PyTorch: 2.0.1
Transformers: 4.28.0
✅ SubtreeEmbedder初始化成功（使用CLIP模型）


# Table Embedding Test

In [53]:
# 查找包含table子节点的节点
for i in range(len(nodes)):
    # 对于字典格式的节点
    if isinstance(nodes[i], dict):
        children = nodes[i].get('children', [])
        if children:
            if children[0].get('tag') == 'table':
                print(i, nodes[i].get('tag'))
    # 对于DOMNode对象
    elif hasattr(nodes[i], 'children'):
        if nodes[i].children:
            if nodes[i].children[0].tag == 'table':
                print(i, nodes[i].tag)

# 显示第一个找到的节点信息
if len(nodes) > 0:
    node = nodes[0]
    if isinstance(node, dict):
        print(f"\n节点示例 (dict): {node.get('tag')} - ID: {node.get('metadata', {}).get('global_id')}")
        print(f"子节点数量: {len(node.get('children', []))}")
    else:
        print(f"\n节点示例 (DOMNode): {node.tag} - ID: {node.metadata.get('global_id')}")
        print(f"子节点数量: {len(node.children) if hasattr(node, 'children') else 0}")

12 h1
456 h3
468 h2
480 h2
492 h4
504 h6
520 h2

节点示例 (dict): document - ID: doc_welcome-to-nus_54388
子节点数量: 12


In [54]:
nodes[456]

{'tag': 'h3',
 'text': 'Above Central Forum',
 'metadata': {'depth': 5,
  'page_id': None,
  'global_id': 'doc_welcome-to-nus_54844',
  'node_type': None,
  'original_index': 458,
  'parent_chapter': 457,
  'is_chapter_title': True,
  'element_type': 'paragraph',
  'page_number': 19,
  'bbox': [471.0, 361.5, 559.0, 372.0],
  'heading_level': 3},
 'children': [{'tag': 'table',
   'metadata': {'depth': 6,
    'page_id': None,
    'global_id': 'doc_welcome-to-nus_54845',
    'node_type': None,
    'original_index': 459,
    'parent_chapter': 458,
    'is_chapter_title': False,
    'element_type': 'table',
    'page_number': 19,
    'bbox': [471.0, 375.0, 585.5, 411.0],
    'table_image_extracted': True},
   'children': [{'tag': 'tr',
     'metadata': {'depth': 7,
      'page_id': None,
      'global_id': 'doc_welcome-to-nus_54846',
      'node_type': None},
     'children': [{'tag': 'td',
       'text': 'Mon - Fri:',
       'metadata': {'depth': 8,
        'page_id': None,
        'global

In [55]:
from subtree_embedding.encoders.table_encoder import TableEncoder

table_encoder = TableEncoder()

table_encoder.encode(nodes[12])

array([ 1.36678740e-01, -1.97675720e-01,  2.29415759e-01, -1.04295313e-01,
        1.11870483e-01,  2.77452379e-01, -1.08328596e-01, -3.63796949e-03,
       -7.39143789e-01,  1.86703846e-01, -4.06990111e-01,  7.04161078e-02,
        1.08785309e-01,  2.61808395e-01,  1.11700848e-01, -2.73666948e-01,
       -1.41534790e-01,  1.87388748e-01, -4.19057101e-01,  1.44080848e-01,
        4.34316516e-01,  1.27006322e-01, -1.48881316e-01, -2.27879718e-01,
       -9.20555890e-02, -9.03861970e-02,  5.01321793e-01, -3.11138481e-01,
       -2.47929335e-01, -6.93440586e-02,  1.71733662e-01, -3.27427089e-02,
       -1.84049651e-01,  1.85276285e-01, -1.26191571e-01, -1.32051751e-01,
       -2.78434306e-01,  1.35189474e-01, -4.21089381e-02, -5.33642545e-02,
        4.50838923e-01,  8.50542262e-02,  1.82767645e-01,  1.98819324e-01,
       -1.65465638e-01,  2.44858727e-01,  4.56111759e-01, -2.68852860e-01,
        3.78298581e-01,  1.42466560e-01, -1.45594761e-01, -2.73392737e-01,
        2.30517969e-01, -

**Table A: Residential Development Capacity and the Impact of Rezonings, by Borough (2003-2007)**

|  | Residential Capacity, by Sq Ft (2003) | %Land Area Rezoned | Change in Residential Capacity, by Sq Ft (as of 2007) | %Capacity Change |
|---|---|---|---|---|
| The Bronx | 980,000,000 | 18.4% | 290,000 | 0.0% |
| Brooklyn | 1,606,000,000 | 13.9% | 19,950,000 | 1.2% |
| Manhattan | 1,466,000,000 | 5.3% | 34,150,000 | 2.3% |
| Queens | 1,342,000,000 | 19.0% | 37,850,000 | 2.8% |
| Staten Island | 435,000,000 | 22.9% | 5,980,000 | 1.4% |
| NYC | 5,829,000,000 | 17.7% | 98,220,000 | 1.7% |

**Table B: Median Income for Census Tracts Where Rezoned Lots Were Located (2007 $)**

| All Tracts in NYC | Tracts with Upzoned Lots* | Tracts with Downzoned Lots* | Tracts with Contextual-only Rezoned Lots* |
|---|---|---|---|
| $53,724 | $44,444 | $51,195 | $63,550 |

**Table C: Median Homeownership Rate for Census Tracts Where Rezoned Lots Were Located (2000)**

| All Tracts in NYC | Tracts with Upzoned Lots* | Tracts with Downzoned Lots* | Tracts with Contextual-only Rezoned Lots* |
|---|---|---|---|
| 44.8% | 30.8% | 35.7% | 63.5% |

**Table D: Percent of Rezoned Lots Within 1/2 a Mile of a Rail Station Entrance (2007)**

| All NYC Lots | Upzoned Lots | Downzoned Lots | Contextual-only Rezoned Lots |
|---|---|---|---|
| 49.5% | 73.4% | 58.9% | 29.0% |

# Image Embedding Test

In [56]:
for i in range(len(nodes)):
    if nodes[i]['tag'] == 'figure':
        print(i, nodes[i]['tag'])

2 figure
4 figure
10 figure
31 figure
35 figure
37 figure
41 figure
50 figure
53 figure
55 figure
63 figure
168 figure
170 figure
177 figure
196 figure
198 figure
200 figure
202 figure
205 figure
216 figure
218 figure
222 figure
226 figure
228 figure
230 figure
232 figure
234 figure
240 figure
259 figure
266 figure
271 figure
275 figure
290 figure
303 figure
320 figure
323 figure
329 figure
337 figure
339 figure
351 figure
353 figure
359 figure
363 figure
368 figure
372 figure
377 figure
381 figure
383 figure
389 figure
392 figure
399 figure
406 figure
414 figure
417 figure
419 figure
450 figure
452 figure
625 figure
633 figure
635 figure
672 figure
714 figure
716 figure
734 figure
737 figure
740 figure
746 figure
748 figure
751 figure


In [57]:
nodes[10]

{'tag': 'figure',
 'metadata': {'depth': 2,
  'page_id': None,
  'global_id': 'doc_welcome-to-nus_54398',
  'node_type': None,
  'original_index': 7,
  'parent_chapter': 5,
  'is_chapter_title': False,
  'element_type': 'figure',
  'page_number': 1,
  'bbox': [453.5, 15.0, 863.5, 493.5],
  'image_extracted': True,
  'description_method': 'full_page_region',
  'ai_description': '**Summary**: The target region contains a large image of a building with a sign that reads "Welcome to NUS" and a contents page listing topics and page numbers.\n\n**Content Details**:\n- **Image**: A large, colorful photograph of a building with a prominent sign that says "Welcome to NUS" and includes the hashtag "#NUSBeyond".\n- **Contents Page**: A list of topics and corresponding page numbers, organized into sections such as "Starting Out", "Beyond the Classroom", and "Explore Campus Life".\n\n**Visual Features**:\n- **Colors**: Predominantly blue, green, and white, with a mix of text and graphics.\n- **Layo

In [62]:
from subtree_embedding.encoders.image_encoder import ImageEncoder

image_encoder = ImageEncoder(target_dim=512)

image_encoder.encode(nodes[10])

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


array([ 0.3235734 ,  0.51576835, -0.41412336,  0.22816797, -0.5063931 ,
       -0.4346049 , -0.20765573, -0.24436751,  0.48946714, -0.21604484,
       -0.07091527,  0.6570646 ,  0.479587  ,  0.66842496, -0.25481766,
       -0.6063508 ,  0.28039005,  0.95903325, -0.52624774,  0.5016964 ,
        0.41765702, -0.32217687, -0.30804425,  0.16458586, -0.07625186,
        0.00444254,  0.40553138, -0.0676101 ,  0.5595541 , -0.8611907 ,
       -0.07200384, -0.4215448 , -0.8455942 ,  0.6809541 ,  0.6608311 ,
        0.18244106,  0.30533355, -0.10401656,  0.47927403,  0.28986022,
        0.13490957,  1.0650345 , -0.09378101,  0.532217  ,  0.7327664 ,
        0.05364907, -0.05372482,  0.02380083, -0.13743807, -0.359785  ,
       -0.3853632 , -0.07632671,  1.1851233 , -0.28001085, -0.4189847 ,
       -0.39503473,  0.24590555, -0.49609846, -0.44704682, -0.3833906 ,
       -0.7428488 , -0.31959075, -0.4184634 , -0.01202947, -0.7862701 ,
       -0.38804942, -0.28940716, -0.50109035,  0.5675326 , -0.77

In [63]:
class QuestionEmbedder:
    """专门用于问题嵌入的类"""

    def __init__(self, model_name="openai/clip-vit-base-patch32", embedding_dim=512):
        """
        初始化问题嵌入器
        
        Args:
            model_name: 使用的模型名称
            embedding_dim: 嵌入维度
        """
        self.model_name = model_name
        self.embedding_dim = embedding_dim

        # 导入CLIP模型
        from transformers import CLIPModel, CLIPProcessor
        import torch

        print(f"加载CLIP模型: {model_name}")
        self.clip_model = CLIPModel.from_pretrained(model_name)
        self.clip_processor = CLIPProcessor.from_pretrained(model_name)

        # 设置为评估模式
        self.clip_model.eval()

        print(f"✅ QuestionEmbedder初始化成功，维度: {embedding_dim}")

    def embed_text(self, text):
        """
        对文本进行嵌入
        
        Args:
            text: 输入文本
            
        Returns:
            numpy.ndarray: 嵌入向量
        """
        import torch
        import numpy as np

        try:
            # 使用CLIP处理器处理文本
            inputs = self.clip_processor(text=[text], return_tensors="pt", padding=True, truncation=True)

            # 获取文本嵌入
            with torch.no_grad():
                text_features = self.clip_model.get_text_features(**inputs)

            # 转换为numpy数组并归一化
            embedding = text_features.squeeze().numpy()
            embedding = embedding / np.linalg.norm(embedding)  # L2归一化

            return embedding

        except Exception as e:
            print(f"文本嵌入失败: {e}")
            return None

    def embed_questions(self, questions):
        """
        批量嵌入多个问题
        
        Args:
            questions: 问题列表
            
        Returns:
            dict: {问题: 嵌入向量}
        """
        results = {}

        print(f"批量嵌入 {len(questions)} 个问题...")

        for i, question in enumerate(questions):
            print(f"处理问题 {i+1}/{len(questions)}: {question[:50]}...")

            embedding = self.embed_text(question)
            if embedding is not None:
                results[question] = embedding
                print(f"  ✅ 成功，维度: {embedding.shape}")
            else:
                print(f"  ❌ 失败")

        print(f"✅ 成功嵌入 {len(results)}/{len(questions)} 个问题")
        return results

    def similarity(self, query_embedding, doc_embedding):
        """
        计算两个嵌入向量的余弦相似度
        
        Args:
            query_embedding: 查询嵌入
            doc_embedding: 文档嵌入
            
        Returns:
            float: 相似度分数 (0-1)
        """
        import numpy as np

        # 确保向量是归一化的
        query_norm = query_embedding / np.linalg.norm(query_embedding)
        doc_norm = doc_embedding / np.linalg.norm(doc_embedding)

        # 计算余弦相似度
        similarity = np.dot(query_norm, doc_norm)
        return float(similarity)
 
 
question_embedder = QuestionEmbedder(
    model_name="openai/clip-vit-base-patch32",
    embedding_dim=512
)

def embed(text):
    return question_embedder.embed_text(text)

  # 使用
q = embed("welcome NUS")

question_embedder.similarity(q, image_encoder.encode(nodes[10]))

加载CLIP模型: openai/clip-vit-base-patch32
✅ QuestionEmbedder初始化成功，维度: 512


0.04266424849629402

In [83]:
# 1. 测试文本自相似度
q1 = embed("welcome to NUS")
q2 = embed("welcome to NUS")
print(f"文本自相似度: {question_embedder.similarity(q1, q2)}")  # 应该是1.0

# 2. 创建512维的问题编码器来匹配图像编码器
question_embedder_512 = QuestionEmbedder(
    model_name="openai/clip-vit-base-patch32",
    embedding_dim=512  # 使用原始维度
)

def embed_512(text):
    return question_embedder_512.embed_text(text)

# 3. 用匹配的维度测试
q_512 = embed_512("welcome to NUS")
img_emb = image_encoder.encode(nodes[10])

print(f"Query 512维: {q_512.shape}")
print(f"Image 512维: {img_emb.shape}")
print(f"正确维度相似度: {question_embedder_512.similarity(q_512, img_emb)}")

# 4. 测试AI描述的相似度
ai_desc = nodes[10]['metadata']['ai_description']
q_desc = embed_512(ai_desc)
print(f"AI描述相似度: {question_embedder_512.similarity(q_desc, img_emb)}")

文本自相似度: 1.0
加载CLIP模型: openai/clip-vit-base-patch32
✅ QuestionEmbedder初始化成功，维度: 512
Query 512维: (512,)
Image 512维: (512,)
正确维度相似度: 0.044546037912368774
AI描述相似度: 0.005650727078318596


In [85]:
ai_desc = nodes[10].get('metadata', {}).get('ai_description', '')
print(ai_desc)
q_ai = embed_512(ai_desc)
print(f"完整AI描述相似度: {question_embedder_512.similarity(q_512, q_ai)}")

**Summary**: The target region contains a large image of a building with a sign that reads "Welcome to NUS" and a contents page listing topics and page numbers.

**Content Details**:
- **Image**: A large, colorful photograph of a building with a prominent sign that says "Welcome to NUS" and includes the hashtag "#NUSBeyond".
- **Contents Page**: A list of topics and corresponding page numbers, organized into sections such as "Starting Out", "Beyond the Classroom", and "Explore Campus Life".

**Visual Features**:
- **Colors**: Predominantly blue, green, and white, with a mix of text and graphics.
- **Layout**: The contents page is structured with clear headings and a grid-like layout, making it easy to navigate.
- **Text**: The text is in a readable font, with headings in a larger, bold font and subheadings in a smaller font.

**Context Integration**:
- The image and contents page are part of a student guide, likely for the National University of Singapore (NUS), given the "Welcome to N

In [68]:
visual_only = image_encoder._encode_visual_content(nodes[10])
print(f"纯视觉特征相似度: {question_embedder_512.similarity(q_512, visual_only)}")

纯视觉特征相似度: 0.048249751329422


In [75]:
q_combined = embed_512("a large image of a building with a sign that reads \Welcome to NUS\ ")
print(f"组合查询相似度: {question_embedder_512.similarity(q_combined, img_emb)}")

组合查询相似度: 0.027539867907762527


In [86]:
def debug_multimodal_similarity():
    img_node = nodes[10]

    # 1. 分析图像节点的实际内容
    print("=== 图像节点分析 ===")
    ai_desc = img_node.get('metadata', {}).get('ai_description', '')
    print(f"AI描述: {ai_desc[:200]}...")

    # 2. 分别测试各个组件
    print("\n=== 组件分析 ===")

    # 图像的视觉特征
    visual_feat = image_encoder._encode_visual_content(img_node)
    print(f"视觉特征形状: {visual_feat.shape}")
    print(f"视觉特征非零: {np.count_nonzero(visual_feat)}")

    # 图像的文本特征  
    text_feat = image_encoder._encode_text_context(img_node)
    print(f"文本特征形状: {text_feat.shape}")
    print(f"文本特征非零: {np.count_nonzero(text_feat)}")

    # 融合后的特征
    fused_feat = image_encoder._fuse_features(visual_feat, text_feat, img_node)
    print(f"融合特征形状: {fused_feat.shape}")

    # 3. 测试不同的查询方式
    print("\n=== 查询测试 ===")

    # 原始查询
    q1 = embed_512("welcome to NUS")
    sim1 = question_embedder_512.similarity(q1, text_feat)  # 只与文本部分比较
    print(f"与文本特征相似度: {sim1:.3f}")

    # 直接用AI描述文本测试
    q2 = embed_512("Welcome to NUS building with sign and contents page")
    sim2 = question_embedder_512.similarity(q2, text_feat)
    print(f"扩展描述相似度: {sim2:.3f}")

    # 测试AI描述的自相似度
    ai_emb = embed_512(ai_desc)
    sim3 = question_embedder_512.similarity(ai_emb, text_feat)
    print(f"AI描述自相似度: {sim3:.3f}")

    # 4. 检查image_encoder的文本编码逻辑
    print("\n=== 文本编码调试 ===")
    query_node = {
        'tag': 'query',
        'text': "welcome to NUS",
        'metadata': {
            'ai_description': "welcome to NUS",
            'element_type': 'query'
        }
    }

    query_text_feat = image_encoder._encode_text_context(query_node)
    print(f"查询文本特征非零: {np.count_nonzero(query_text_feat)}")

    # 直接相似度
    sim4 = question_embedder_512.similarity(query_text_feat, text_feat)
    print(f"查询文本特征相似度: {sim4:.3f}")

debug_multimodal_similarity()

=== 图像节点分析 ===
AI描述: **Summary**: The target region contains a large image of a building with a sign that reads "Welcome to NUS" and a contents page listing topics and page numbers.

**Content Details**:
- **Image**: A la...

=== 组件分析 ===
视觉特征形状: (512,)
视觉特征非零: 512
文本特征形状: (512,)
文本特征非零: 512
融合特征形状: (512,)

=== 查询测试 ===
与文本特征相似度: 0.000
扩展描述相似度: 0.019
AI描述自相似度: 0.010

=== 文本编码调试 ===
查询文本特征非零: 512
查询文本特征相似度: 0.653


In [76]:
def embed_query_multimodal(text):
    # 将查询文本也处理成类似image_encoder的格式
    query_node = {
        'tag': 'query',
        'text': text,
        'metadata': {
            'ai_description': text,  # 把查询当作AI描述
            'element_type': 'query'
        }
    }

    # 使用image_encoder的文本编码部分
    text_features = image_encoder._encode_text_context(query_node)
    return text_features

# 测试
q_multimodal = embed_query_multimodal("a large image of a building with a sign that reads \Welcome to NUS\ ")
sim = question_embedder_512.similarity(q_multimodal, img_emb)
print(f"多模态查询相似度: {sim}")

多模态查询相似度: 0.2818060517311096


# Parent node

In [60]:
nodes[49].to_json_dict()

{'tag': 'figure',
 'metadata': {'depth': 2,
  'page_id': '4',
  'global_id': 'node_2480',
  'node_type': None},
 'children': [{'tag': 'img',
   'metadata': {'depth': 3,
    'page_id': '4',
    'global_id': 'node_2481',
    'node_type': 'image'},
   'src': '8e7c4cb542ad160f80fb3d795ada35d8_with_image_refs_artifacts/image_000008_8133ff07a71d435f8c68b19de73ce3839fdbd04eddf043e65935d2d1a10d880a.png'}]}

In [55]:
embedding_result = embedder.encode_subtree(nodes[54], max_depth=3)

Encoding internal node - type: list, children: 3
Encoding leaf node - type: list
Encoding leaf node - type: list
Encoding leaf node - type: list


In [65]:
embedding_result.keys()
embedding_result

{'embedding': array([-7.09770247e-03, -1.31934620e-02, -1.55713968e-02, -1.99010447e-02,
        -2.04501674e-04,  1.88648067e-02,  1.67484917e-02, -9.70853306e-03,
         1.44703127e-03, -2.65601687e-02, -4.41717282e-02, -9.86607466e-03,
         1.95105635e-02, -2.29470897e-03, -1.89260952e-02, -1.54101476e-02,
         1.28739709e-02, -2.00524926e-04,  3.46677890e-03,  2.38932781e-02,
         1.34883262e-02,  3.53751006e-03,  8.07017647e-03,  3.73703940e-03,
         1.26829315e-02,  1.07490346e-02,  2.95155719e-02, -4.19999510e-02,
        -3.52409994e-03,  6.44208863e-03,  3.63410115e-02,  2.45656222e-02,
         3.73080522e-02, -3.97562124e-02,  8.55441578e-03,  1.72785148e-02,
        -5.64225670e-03,  1.35858171e-03,  1.52265327e-02, -3.10858954e-02,
         1.23050045e-02, -3.34948674e-03, -3.32341902e-03, -1.40123442e-02,
         5.84061816e-03,  1.40357893e-02,  1.35233290e-02,  1.74291078e-02,
        -9.00099333e-03, -3.13648023e-04,  2.90256217e-02, -2.94006579e-02,

In [9]:
with open("./../embeddings/embeddings.pkl", 'rb') as f:
    embeddings = pickle.load(f)

In [10]:
DOMNode(
    tag='p', 
    text="1  The 'one-fifth' calculation excludes the City's park land. See press release, celebrating the 100th rezoning for more detail: http://nyc.gov/html/dcp/html/about/pr102809.shtml.", 
    attributes={}, 
    children=[
        DOMNode(
            tag='text', 
            text="1  The 'one-fifth' calculation excludes the City's park land. See press release, celebrating the 100th rezoning for more detail: http://nyc.gov/html/dcp/html/about/pr102809.shtml.", 
            attributes={}, 
            children=[], 
            position={'order': 0}, 
            node_type='text', 
            metadata={'page_number': '1', 'global_id': 'text_9'}
            )
            ], 
    position={'order': 17},
    node_type='text', 
    metadata={'page_number': '1', 'global_id': 'text_10'}
    ), 

DOMNode(
    tag='text', 
    text="1  The 'one-fifth' calculation excludes the City's park land. See press release, celebrating the 100th rezoning for more detail: http://nyc.gov/html/dcp/html/about/pr102809.shtml.", 
    attributes={}, 
    children=[], 
    position={'order': 0}, 
    node_type='text', 
    metadata={'page_number': '1', 'global_id': 'text_9'})

DOMNode(tag='text', text="1  The 'one-fifth' calculation excludes the City's park land. See press release, celebrating the 100th rezoning for more detail: http://nyc.gov/html/dcp/html/about/pr102809.shtml.", attributes={}, children=[], position={'order': 0}, node_type='text', metadata={'page_number': '1', 'global_id': 'text_9'})

In [10]:
import sys
import os
import pickle
from dataclasses import dataclass, asdict
import json
import numpy as np
from typing import Dict, List, Any, Optional, Tuple
import logging
from pathlib import Path
from tqdm import tqdm
import hashlib
# 添加项目根目录到路径
current_dir = os.path.dirname(os.path.abspath(__file__)) if '__file__' in globals() else os.getcwd()
project_root = os.path.dirname(current_dir)  # 上一级目录就是项目根目录
sys.path.append(project_root)
from subtree_embedding import SubtreeEmbedder
from src.core.dom_processor_v2 import TokenAwareDOMProcessor, DOMNode
# from dom_tree.builder import build_tree_json_object, build_global_tree_from_html
# from dom_tree.node import DOMNode

@dataclass
class EmbeddingResult:
    """嵌入结果的数据结构"""
    node_id: str
    embedding: np.ndarray
    metadata: Dict[str, Any]
    structure_info: Dict[str, Any]
    parent_id: Optional[str] = None
    children_ids: List[str] = None
    embedding_time: float = 0.0

config = {
        'text_model': 'BAAI/bge-large-en-v1.5',
        'image_model': 'openai/clip-vit-base-patch32',
        'table_model': 'microsoft/tapex-base',
        'aggregator': 'attention',
        'embedding_dim': 768,
        'pruning': {
            'tag_blacklist': ['script', 'style'],
            'max_tokens_per_chunk': 2000
        }
    }

dom_processor = TokenAwareDOMProcessor(config)

In [None]:

def _collect_all_nodes(root_node: DOMNode) -> List[DOMNode]:
    """收集DOM树中的所有节点"""
    nodes = []
        
    def dfs_collect(node: DOMNode):
        # 只处理有全局ID的节点
        if hasattr(node, 'metadata') and node.metadata.get('global_id'):
                nodes.append(node)
            
        # 递归处理子节点
        if hasattr(node, 'children'):
            for child in node.children:
                dfs_collect(child)
        
    dfs_collect(root_node)
    return nodes

html_file_path = './testdata/8e7c4cb542ad160f80fb3d795ada35d8.html'
tree, chunks, process_logs = dom_processor.process_html_file(html_file_path)
# tree = build_global_tree_from_html(html_file_path)
        
nodes = _collect_all_nodes(root_node = tree)

tree


🔍 发现表格标题: 行0 - Table B: Median Income for Census Tracts Where Rezoned Lots Were Located (2007 $)
🔍 发现表格标题: 行3 - Table C: Median Homeownership Rate for Census Tracts Where Rezoned Lots Were Located (2000)
🔍 发现表格标题: 行6 - Table D: Percent of Rezoned Lots Within 1/2 a Mile of a Rail Station Entrance (2007)
🔧 拆分复合表格: 发现 3 个逻辑表格
   ✅ 创建逻辑表格 1: table_2 - Table B: Median Income for Census Tracts Where Rezoned Lots Were Located (2007 $)
   ✅ 创建逻辑表格 2: table_3 - Table C: Median Homeownership Rate for Census Tracts Where Rezoned Lots Were Located (2000)
   ✅ 创建逻辑表格 3: table_4 - Table D: Percent of Rezoned Lots Within 1/2 a Mile of a Rail Station Entrance (2007)
   📦 创建包装节点，包含 3 个拆分表格
collapsed: <class 'src.core.dom_processor_v2.DOMNode'>
Collapsed DOM saved to: data/collapsed/8e7c4cb542ad160f80fb3d795ada35d8_collapsed.json


DOMNode(tag='div', text='', attributes={'class': 'single-column'}, children=[DOMNode(tag='div', text='', attributes={'class': 'page', 'data-page': '1'}, children=[DOMNode(tag='figure', text='', attributes={}, children=[DOMNode(tag='img', text='', attributes={'src': '8e7c4cb542ad160f80fb3d795ada35d8_artifacts/image_000000_792fdace62b55a9af3fd93e661c187ae595f6c66aac3ec49f73493757b3e9753.png'}, children=[], position={'order': 0}, node_type='image', metadata={'page_number': '1', 'caption': '', 'alt': '', 'title': '', 'global_id': 'image_1'})], position={'order': 3}, node_type='image_container', metadata={'page_number': '1', 'global_id': 'image_container_1'}), DOMNode(tag='figure', text='', attributes={}, children=[DOMNode(tag='img', text='', attributes={'src': '8e7c4cb542ad160f80fb3d795ada35d8_artifacts/image_000001_fc3078cd1bac1b659228b8b84c56fc1a55e52888b66c92d38c62522d6d953c6c.png'}, children=[], position={'order': 0}, node_type='image', metadata={'page_number': '1', 'caption': '', 'alt

In [92]:
nodes[2]

{'tag': 'figure',
 'text': 'NUS National University of Singapore',
 'metadata': {'depth': 1,
  'page_id': None,
  'global_id': 'doc_welcome-to-nus_54390',
  'node_type': None,
  'original_index': 1,
  'parent_chapter': -1,
  'is_chapter_title': False,
  'element_type': 'figure',
  'page_number': 0,
  'bbox': [22.5, 334.0, 111.5, 379.5],
  'image_extracted': True,
  'ai_description': 'Skipped (disabled or too small)'},
 'children': [{'tag': 'img',
   'metadata': {'depth': 2,
    'page_id': None,
    'global_id': 'doc_welcome-to-nus_54391',
    'node_type': None},
   'src': 'welcome-to-nus/page_1_figure_1.png'}],
 'src': 'welcome-to-nus/page_1_figure_1.png',
 'data-page': '0'}

In [1]:
from subtree_embedding.utils import NodeTypeClassifier
from json_to_dom_processor import DOMNode
import json

# 创建分类器实例
classifier = NodeTypeClassifier()

def classify_all_nodes(node_dict):
    result = classifier.classify(node_dict)
    print(f"Node {node_dict['tag']} classified as: {result}")

    # 递归处理子节点
    for child_dict in node_dict.get('children', []):
        classify_all_nodes(child_dict)

with open("./data/dom/MMLongBench-Doc/welcome-to-nus.json", "r") as f:
    tree_data = json.load(f)

# 直接传入字典数据，不需要转换为 DOMNode
classify_all_nodes(tree_data)

Qwen2VL not available. Please install transformers and torch to use Qwen2VL.


Node document classified as: None
Node p classified as: text
Node figure classified as: image
Node img classified as: None
Node figure classified as: image
Node img classified as: None
Node p classified as: text
Node p classified as: text
Node h1 classified as: text
Node p classified as: text
Node figure classified as: image
Node img classified as: None
Node h1 classified as: text
Node table classified as: table
Node tr classified as: None
Node td classified as: None
Node td classified as: None
Node td classified as: None
Node tr classified as: None
Node td classified as: None
Node td classified as: None
Node td classified as: None
Node tr classified as: None
Node td classified as: None
Node td classified as: None
Node td classified as: None
Node tr classified as: None
Node td classified as: None
Node td classified as: None
Node td classified as: None
Node p classified as: text
Node figure classified as: image
Node img classified as: None
Node h2 classified as: text
Node p classified a

In [7]:
from subtree_embedding.utils import StructuralPromptGenerator
import json

# 创建生成器实例
structure = StructuralPromptGenerator()

def structure_all_nodes(node_dict, depth=0):
    result = structure.generate_prompt(node_dict, depth)  # 添加 depth 参数
    print(f"Node {node_dict['tag']} structure as: {result}")

    # 递归处理子节点，深度+1
    for child_dict in node_dict.get('children', []):
        structure_all_nodes(child_dict, depth + 1)

with open("./data/dom/MMLongBench-Doc/welcome-to-nus.json", "r") as f:
    tree_data = json.load(f)

# 直接传入字典数据，从深度0开始
structure_all_nodes(tree_data)

Node document structure as: :
Node p structure as: [Page 1] [Depth 1] [Paragraph] [Pos: 604,21]:
Node figure structure as: [Page 1] [Depth 1] [Figure] [Pos: 22,334]:
Node img structure as: [Depth 2]:
Node figure structure as: [Page 1] [Depth 1] [Figure] [Pos: 209,147]:
Node img structure as: [Depth 2]:
Node p structure as: [Page 1] [Depth 1] [Paragraph] [Pos: 473,622]:
Node p structure as: [Page 1] [Depth 1] [Paragraph] [Pos: 473,640]:
Node h1 structure as: [Page 1] [Depth 1] [Paragraph] [Pos: 473,650] [H1]:
Node p structure as: [Page 1] [Depth 2] [Paragraph] [Pos: 784,628]:
Node figure structure as: [Page 2] [Depth 2] [Figure] [Pos: 454,15]:
Node img structure as: [Depth 3]:
Node h1 structure as: [Page 2] [Depth 1] [Paragraph] [Pos: 476,508] [H1]:
Node table structure as: [Page 2] [Depth 2] [Table] [Pos: 474,530]:
Node tr structure as: [Depth 3]:
Node td structure as: [Depth 4]:
Node td structure as: [Depth 4]:
Node td structure as: [Depth 4]:
Node tr structure as: [Depth 3]:
Node td 