# Partition PDF tables and text

In [1]:
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

path = "../data/raw/mvt_zinc/reports_processed/Bongará Zn 3-2019.pdf"

# Partitioning / chunking options, collected in one place so they are easy to tune.
partition_options = {
    # Embedded image blocks are not extracted
    "extract_images_in_pdf": False,
    # Layout model (YOLOX) provides bounding boxes (for tables) and finds titles;
    # a title is any sub-section of the document
    "infer_table_structure": True,
    # Post-processing step: aggregate text under each detected title
    "chunking_strategy": "by_title",
    # Chunk sizing used when aggregating text blocks:
    #   - try to start a new chunk at 3800 chars
    #   - try to keep chunks above 2000 chars
    "max_characters": 4000,
    "new_after_n_chars": 3800,
    "combine_text_under_n_chars": 2000,
}

# Get elements
raw_pdf_elements = partition_pdf(filename=path, **partition_options)

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
Some weights of the model checkpoint at microsoft/table-transformer-structure-recognition were not used when initializing TableTransformerForObjectDetection: ['model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TableTransformerForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSe

In [2]:
# Tally how many partitioned elements there are of each element class.
# Keys are the string form of the class, e.g. "<class '...Table'>".
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    # Missing keys start from zero, so no separate "first time seen" branch is needed
    category_counts[category] = category_counts.get(category, 0) + 1

# The distinct element classes that were encountered
unique_categories = set(category_counts)
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 180,
 "<class 'unstructured.documents.elements.Table'>": 81,
 "<class 'unstructured.documents.elements.TableChunk'>": 4}

In [10]:
# Simple pydantic container: a string `type` label plus an arbitrary `ele` payload.
class Element(BaseModel):
    type: str
    ele: Any


# Substrings of the element class name used to route each element.
# Note that the table marker is a prefix match, so it also catches TableChunk.
table_marker = "unstructured.documents.elements.Table"
text_marker = "unstructured.documents.elements.CompositeElement"

# Split the partitioned elements into tables and text chunks
table_elements = []
text_elements = []
for element in raw_pdf_elements:
    type_name = str(type(element))
    if table_marker in type_name:
        table_elements.append(element)
    elif text_marker in type_name:
        text_elements.append(element)

# Report how many tables were found
print(f"{len(table_elements)=}")

# Report how many text chunks were found
print(f"{len(text_elements)=}")

len(table_elements)=85
len(text_elements)=180


In [11]:
# Save the table elements and the text elements to two txt files, one element
# per line. The encoding is pinned to UTF-8 because the extracted PDF text can
# contain non-ASCII characters, and the platform default encoding is not
# guaranteed to be able to represent them.
for out_name, elements in (
    ("table_elements.txt", table_elements),
    ("text_elements.txt", text_elements),
):
    with open(out_name, "w", encoding="utf-8") as f:
        f.writelines(f"{ele.text}\n" for ele in elements)

# Multi-vector retriever

## Summaries

In [4]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

In [12]:
# Prompt
# Built from adjacent string literals rather than a triple-quoted string with a
# backslash continuation: a backslash followed by a space is not a line
# continuation, so it would leave a stray backslash and newline in the prompt.
prompt_text = (
    "You are an assistant tasked with summarizing tables and text. "
    "Give a concise summary of the table or text. Table or text chunk: {element} "
)
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain: the raw input string is passed through as `element`, formatted
# into the prompt, sent to the model, and the reply is parsed to a plain string.
model = ChatOpenAI(temperature=0, model="gpt-4-turbo-preview")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

In [13]:
# Summarize every table: pull out the text of each table element, then run the
# summary chain over the batch with at most 5 concurrent calls.
tables = [table.text for table in table_elements]
table_summaries = summarize_chain.batch(tables, config={"max_concurrency": 5})

In [14]:
# Summarize every text chunk: pull out the text of each composite element, then
# run the summary chain over the batch with at most 5 concurrent calls.
texts = [chunk.text for chunk in text_elements]
text_summaries = summarize_chain.batch(texts, config={"max_concurrency": 5})

In [16]:
# Preview the first two table summaries as a quick sanity check
table_summaries[:2]

["The introduction section of the document is divided into five subsections. It begins with a general introduction, followed by the terms of reference. The third subsection discusses the sources of information used in the document. The fourth subsection explains the units and currency used, and the final subsection outlines the risk factors associated with the document's content.",
 "The document section covers the description and location of a property, including its general location and administration status. It also discusses Zinc One's interest in the property, environmental and permitting issues, and the social or community impact of the property."]

## Add to vectorstore

Use a Multi Vector Retriever with summaries:
- `InMemoryStore` stores the raw text and tables
- `vectorstore` stores the embedded summaries

In [17]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

# The vectorstore to use to index the child chunks (the summaries)
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

# The storage layer for the parent documents (the raw text / tables)
store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)


def _index_summaries(retriever, summaries, originals, id_key):
    """Index `summaries` in the vectorstore and store `originals` in the docstore.

    A fresh UUID is generated per original; each summary document carries that
    id under `id_key` in its metadata, and the docstore maps the same id to the
    original string.

    Args:
        retriever: Object exposing `.vectorstore.add_documents` and `.docstore.mset`.
        summaries: Summary strings, one per entry of `originals`, in the same order.
        originals: The raw strings the summaries were produced from.
        id_key: Metadata key under which the shared id is stored.

    Returns:
        A `(ids, summary_docs)` tuple: the generated ids and the `Document`
        objects that were added to the vectorstore.

    Raises:
        ValueError: If `summaries` and `originals` differ in length, which would
            pair summaries with the wrong originals (e.g. when cells were run
            out of order and one of the lists is stale).
    """
    if len(summaries) != len(originals):
        raise ValueError(
            f"Got {len(summaries)} summaries for {len(originals)} originals; "
            "re-run the summary cells so both lists are in sync."
        )
    ids = [str(uuid.uuid4()) for _ in originals]
    summary_docs = [
        Document(page_content=summary, metadata={id_key: doc_id})
        for doc_id, summary in zip(ids, summaries)
    ]
    retriever.vectorstore.add_documents(summary_docs)
    retriever.docstore.mset(list(zip(ids, originals)))
    return ids, summary_docs


# Add texts
doc_ids, summary_texts = _index_summaries(retriever, text_summaries, texts, id_key)

# Add tables
table_ids, summary_tables = _index_summaries(retriever, table_summaries, tables, id_key)

In [18]:
from langchain_core.runnables import RunnablePassthrough

# Prompt template: the retrieved context is placed above the user's question
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)

# RAG pipeline: the incoming question is sent to the retriever to build the
# context and is also passed through unchanged as the question; the filled
# prompt then goes to the model and the reply is parsed to a plain string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | model | StrOutputParser()

In [23]:
# Ask the RAG chain a question about the report
chain.invoke("Where is mineral site Bongará Zn located?")

'The Bongará zinc mineralization site is located within the Pucará Basin of western Peru.'

We can check the trace to see which chunks were retrieved.