In [28]:
# Path to the sample PDF parsed by both the unstructured and LlamaParse cells below.
# It is a relative path, so it resolves against the kernel's current working directory.
pdf_path = "../../90-文档_Data/复杂PDF/billionaires_page-1-5.pdf"

#### 1. unstructured 表格提取

In [None]:
from unstructured.partition.pdf import partition_pdf

# 导入 LlamaIndex 相关模块
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Global LlamaIndex defaults: embedding model and chat model.
# NOTE(review): no load_dotenv() call is visible before this cell, so presumably
# OPENAI_API_KEY is already exported in the environment -- confirm.
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")
Settings.llm = OpenAI(model="gpt-3.5-turbo")

# Partition the PDF into a list of unstructured elements using the
# high-resolution ("hi_res") strategy.
elements = partition_pdf(pdf_path, strategy="hi_res")

# Lookup table from element id to element; used to resolve a table's parent_id.
element_map = {el.id: el for el in elements if hasattr(el, 'id')}

# Lookup table from position in `elements` to the element at that position.
element_index_map = dict(enumerate(elements))

# Walk the parsed elements and, for every table, print its metadata and text,
# its parent element (when one can be resolved) and up to three elements that
# immediately precede it in document order.
for idx, node in enumerate(elements):
    if node.category != "Table":
        continue

    print("\n表格数据:")
    print("表格元数据:", vars(node.metadata))  # vars() exposes every metadata attribute
    print("表格内容:")
    print(node.text)

    # Resolve the parent through the id map; a missing or unknown id falls
    # through to the "not found" message.
    parent_id = getattr(node.metadata, 'parent_id', None)
    parent_known = bool(parent_id) and parent_id in element_map
    if not parent_known:
        print(f"未找到父节点 (ID: {parent_id})")
    else:
        parent_element = element_map[parent_id]
        print("\n父节点信息:")
        print(f"类型: {parent_element.category}")
        print(f"内容: {parent_element.text}")
        if hasattr(parent_element, 'metadata'):
            print(f"父节点元数据: {vars(parent_element.metadata)}")

    # Show the (at most) three elements directly before the table; the lower
    # bound is clamped at 0 so tables near the start of the document are safe.
    print("\n表格前3个节点内容:")
    window_start = max(0, idx - 3)
    for prev_idx in range(window_start, idx):
        prev_node = element_index_map.get(prev_idx)
        if not prev_node:
            continue
        print(f"节点 {prev_idx} ({prev_node.category}):")
        print(prev_node.text)

    print("-" * 50)

# Split the parsed elements into prose and tables.
#
# unstructured does not emit elements whose category is literally "Text":
# ordinary paragraphs are reported as "NarrativeText" and text that could not
# be classified as "UncategorizedText". Matching on "Text" alone therefore
# always produced an empty list. "Text" is kept in the set only so that the
# original comparison is still honoured should such a category ever appear.
# Titles and list items are deliberately not included; add "Title" /
# "ListItem" here if they should be treated as prose as well.
TEXT_CATEGORIES = {"Text", "NarrativeText", "UncategorizedText"}

text_elements = [el for el in elements if el.category in TEXT_CATEGORIES]
table_elements = [el for el in elements if el.category == "Table"]

#### 2. 使用LlamaParse解析PDF

In [30]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings
from llama_parse import LlamaParse
import time
from dotenv import load_dotenv
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs a
# loop while LlamaParse's synchronous load_data drives async code -- confirm.
nest_asyncio.apply()

# Read API keys from a local .env file into the process environment.
# NOTE(review): besides the OpenAI key, LlamaParse presumably needs
# LLAMA_CLOUD_API_KEY to be defined there as well -- confirm.
load_dotenv()

# Base models, kept as module-level names and registered as LlamaIndex defaults.
llm = OpenAI(model="gpt-3.5-turbo-0125")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

Settings.embed_model = embed_model
Settings.llm = llm

# Time the parse with a monotonic, high-resolution clock. time.time() follows
# the wall clock, which can be adjusted (e.g. by NTP) while the request is in
# flight and would then report a wrong or even negative duration.
start_time = time.perf_counter()

# Parse the PDF with LlamaParse, requesting markdown as the result format.
documents = LlamaParse(result_type="markdown").load_data(pdf_path)

# Elapsed seconds between the two perf_counter() readings.
end_time = time.perf_counter()
print(f"PDF解析耗时: {end_time - start_time:.2f}秒")

# Dump the text of every parsed document, numbering them from 1.
print("\n解析后的文档内容:")
for doc_number, parsed_doc in enumerate(documents, start=1):
    print(f"\n文档 {doc_number} 内容:", parsed_doc.text, sep="\n")

Started parsing the file under job_id 5cf4e779-82bb-443a-acc7-1cf2ce5c0173
PDF解析耗时: 22.29秒

解析后的文档内容:

文档 1 内容:
# The World's Billionaires

The World's Billionaires is an annual ranking of people who are considered to have a net worth of $1 billion or more, by the American business magazine Forbes. The list was first published in March 1987.1 The total net worth of each individual on the list is estimated and is cited in United States dollars, based on their documented assets and accounting for debt and other factors. Royalty and dictators whose wealth comes from their positions are excluded from these lists.3 This ranking is an index of the wealthiest documented individuals, excluding any ranking of those with wealth that is not able to be completely ascertained.4

In 2018, Amazon founder Jeff Bezos was ranked at the top for the first time and became the first centibillionaire included in the ranking,5 surpassing Microsoft founder Bill Gates, who had topped the list 18 of the previous