In [None]:
! pip install --quiet -U pdf2image pytesseract unstructured[all-docs] pillow pydantic lxml pillow matplotlib tiktoken open_clip_torch torch langchain openai chromadb langchain-experimental
#! apt install poppler-utils
#! apt install tesseract-ocr

# Load and extract elements from the PDF

In [1]:
# Working directory: holds the source PDF and is also used below as the output
# location for extracted images, the persisted vector store and the docstore.
path = "WildfireStatistics/"
# Name of the PDF (inside `path`) that the notebook partitions.
filename = "WildfireStatistics.pdf"

In [2]:
import os
from typing import Any

from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# Extract images, tables, and chunk text
# The image output directory is set to `path`; the image count further down
# lists that same directory for .jpg files.
raw_pdf_elements = partition_pdf(
    filename=os.path.join(path, filename),
    extract_images_in_pdf=True,
    infer_table_structure=True,
    chunking_strategy="by_title",
    # Chunk sizing, in characters. Presumably a hard cap, a soft cap and a
    # merge threshold for small sections — TODO confirm against the
    # unstructured chunking documentation.
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    extract_image_block_output_dir=path,
)

# Tally how many elements of each class the partitioner produced,
# keyed by the string form of the element's type.
category_counts = {}
for pdf_element in raw_pdf_elements:
    class_label = str(type(pdf_element))
    category_counts[class_label] = category_counts.get(class_label, 0) + 1

# The distinct class labels encountered above
unique_categories = set(category_counts)


class Element(BaseModel):
    """A partitioned PDF element reduced to a category label and its content."""

    # Category label; the categorization loop in this cell assigns "table" or "text".
    type: str
    # Element content; the categorization loop stores str(element) here.
    text: Any


# Keep only tables and composite text chunks, tagging each with its kind.
# Elements of any other class are skipped.
categorized_elements = []
for element in raw_pdf_elements:
    type_repr = str(type(element))
    if "unstructured.documents.elements.Table" in type_repr:
        kind = "table"
    elif "unstructured.documents.elements.CompositeElement" in type_repr:
        kind = "text"
    else:
        continue
    categorized_elements.append(Element(type=kind, text=str(element)))

# Split the categorized elements by kind and report how many of each were kept
text_elements = [item for item in categorized_elements if item.type == "text"]
table_elements = [item for item in categorized_elements if item.type == "table"]
print(f"Text: {len(text_elements)}")
print(f"Tables: {len(table_elements)}")

# Images are counted from the .jpg files present in the working directory
image_count = sum(1 for name in os.listdir(path) if name.endswith(".jpg"))
print(f"Images: {image_count}")

This function will be deprecated in a future release and `unstructured` will simply use the DEFAULT_MODEL from `unstructured_inference.model.base` to set default model name
2024-03-05 07:14:47.549042: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-05 07:14:47.590997: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-05 07:14:47.591042: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-05 07:14:47.592985: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register 

Text: 5
Tables: 2
Images: 7


# Generate summaries of the texts and tables

In [3]:
from dotenv import load_dotenv
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_community.callbacks.manager import get_openai_callback
from langchain_core.output_parsers import StrOutputParser
from langchain_openai.chat_models.base import ChatOpenAI
from langchain_together.llms import Together

# Load environment variables from a .env file — presumably the provider API
# keys, since none are set anywhere in this notebook.
load_dotenv()

# Prompt
# `{element}` is the placeholder that receives each table or text chunk.
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
# OpenAI alternative, kept for reference:
# model = ChatOpenAI(model="gpt-4-turbo-preview", temperature=0)
model = Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1", temperature=0.7, max_tokens=4096)
# Pipeline: wrap the raw input under the "element" key -> fill the prompt ->
# call the model -> parse the result to a plain string.
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

# Summarize the text chunks and the tables with the same chain, one batch each.
# NOTE(review): the saved output shows zero tokens/requests for both batches —
# presumably get_openai_callback does not count calls made through the
# Together model; confirm before relying on these figures.
texts = [chunk.text for chunk in text_elements]
tables = [chunk.text for chunk in table_elements]

summaries_by_kind = []
for raw_chunks in (texts, tables):
    with get_openai_callback() as callback:
        summaries_by_kind.append(summarize_chain.batch(raw_chunks))
        print(callback, end="\n\n")

text_summaries, table_summaries = summaries_by_kind

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0



# Build a text retriever that searches over the summaries

In [6]:
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage.in_memory import InMemoryStore
from langchain.storage.file_system import LocalFileStore
from langchain_community.vectorstores.chroma import Chroma
from langchain_core.documents import Document
from langchain_openai.embeddings import OpenAIEmbeddings

# Metadata key under which each summary records the id of its parent document
id_key = "doc_id"

# Vector index over the summaries (the searchable child chunks), persisted under `path`
text_vectorstore = Chroma(
    persist_directory=os.path.join(path, "text_vectorstore"),
    collection_name=f"{path.replace('/', '_')}text_vectorstore",
    embedding_function=OpenAIEmbeddings(dimensions=1024, model="text-embedding-3-large"),
)

# File-backed storage layer for the parent documents
store = LocalFileStore(os.path.join(path, "text_docstore"))

# Retriever tying the two together; it holds no documents until they are added below
text_retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=store,
    vectorstore=text_vectorstore,
    # search_kwargs={"k": 3},
)

# Add texts.
# Each summary goes into the vector store and carries, under `id_key`, the key
# of the raw chunk it summarizes; the raw chunk goes into the docstore under
# that same key so the retriever can map a summary hit back to its source.
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, text_summaries)
]
text_retriever.vectorstore.add_documents(summary_texts)

# Before calling text_retriever.docstore.mset, ensure the texts are encoded to bytes
encoded_texts = [(doc_id, text.encode("utf-8")) for doc_id, text in zip(doc_ids, texts)]
text_retriever.docstore.mset(encoded_texts)

# Add tables, indexed the same way under their own ids
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: table_id})
    for table_id, summary in zip(table_ids, table_summaries)
]
text_retriever.vectorstore.add_documents(summary_tables)

# Raw tables must be keyed by `table_ids`, the ids stored in the table
# summaries' metadata. Keying them by `doc_ids` would overwrite the first text
# chunks in the docstore and leave every table summary pointing at a missing key.
encoded_tables = [(table_id, table.encode("utf-8")) for table_id, table in zip(table_ids, tables)]
text_retriever.docstore.mset(encoded_tables)

text_vectorstore.persist()

# RAG chain with the text retriever (ensemble retriever currently disabled)

In [15]:
from operator import itemgetter

from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
from langchain_openai.chat_models.base import ChatOpenAI

# OpenAI vision alternative, kept for reference:
# model = ChatOpenAI(temperature=0, model="gpt-4-vision-preview", max_tokens=4096)

# Answering model. NOTE: this rebinds `model` (and `prompt` below), replacing
# the summarization objects of the same names defined earlier in the notebook.
# `Together` and `ChatPromptTemplate` come from the summarization cell's
# imports, so that cell must have run first.
model = Together(model="mistralai/Mixtral-8x7B-Instruct-v0.1", temperature=0.7, max_tokens=4096)

# Disabled: combining the text retriever with an image retriever.
# ensemble_retriever = EnsembleRetriever(retrievers=[text_retriever, image_retriever], weights=[0.5, 0.5])

# Prompt template
# `{context}` receives the retrieved documents, `{question}` the user's input.
template = """Answer the question based only on the following context, which can include text and tables:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)


def decoder(contexts):
    """Convert retrieved documents to strings.

    Args:
        contexts: Iterable of retrieved documents. Items that are already
            ``str`` are passed through unchanged; every other item is decoded
            from UTF-8 via its ``decode`` method, as bytes are.

    Returns:
        list[str]: One string per input item, in the same order.

    Raises:
        UnicodeDecodeError: If a bytes item is not valid UTF-8.
        AttributeError: If an item is neither ``str`` nor has ``decode``.
    """
    # Tolerating str lets the chain keep working when the docstore returns
    # text rather than bytes; bytes handling is unchanged.
    return [
        context if isinstance(context, str) else context.decode("utf-8")
        for context in contexts
    ]


# RAG pipeline, built in two explicit stages.

# Stage 1: fan the incoming question out into the retrieved (and decoded)
# context plus the untouched question.
retrieval_stage = RunnableParallel(
    {
        "context": text_retriever | RunnableLambda(decoder),  # | RunnableLambda(split_image_text_types),
        "question": RunnablePassthrough(),
    }
)

# Stage 2: produce the model's answer while also passing the context through,
# so callers can show the sources next to the response.
answer_stage = RunnableParallel(
    {
        "response": prompt | model | StrOutputParser(),
        "context": itemgetter("context"),
    }
)

chain = retrieval_stage | answer_stage

# Helper function for Q&A

In [16]:
from langchain_community.callbacks.manager import get_openai_callback


def displayRAG(question):
    """Run the RAG chain on ``question`` and print the answer with its sources.

    Prints, in order: the question, the callback's usage summary, the model's
    response, and each retrieved context passage. Returns nothing.
    """
    print("###QUESTION###")
    print(question, end="\n\n")

    with get_openai_callback() as usage:
        result = chain.invoke(question)
        print(usage, end="\n\n")

    answer, passages = result["response"], result["context"]

    print("###ANSWER###")
    print(answer, end="\n\n")

    # Image display is disabled:
    # for i, image in enumerate(response["context"]["images"]):
    # print(f"###IMAGE{i+1}###")
    # plt_img_base64(image)

    for i, text in enumerate(passages):
        print(f"###TEXT{i+1}###")
        print(text, end="\n\n")

In [17]:
# Smoke test with a bare keyword.
# NOTE(review): in the saved output the model invents its own question from the
# retrieved table — presumably because "Wildfire" is not phrased as a question.
displayRAG("Wildfire")

###QUESTION###
Wildfire

Tokens Used: 0
	Prompt Tokens: 0
	Completion Tokens: 0
Successful Requests: 0
Total Cost (USD): $0.0

###ANSWER###

Based on the context, what does the Federal 2018 Number of Fires (thousands) indicate?

Answer: The Federal 2018 Number of Fires (thousands) indicates that there were 12.5 thousand fires on federal lands in the year 2018.

###TEXT1###
2018 2019 2020 Number of Fires (thousands) Federal 12.5 10.9 14.4 FS 5.6 5.3 6.7 DOI 7.0 5.3 7.6 2021 14.0 6.2 7.6 2022 11.7 5.9 5.8 Other 0.1 0.2 <0.1 0.2 0.1 Nonfederal 45.6 39.6 44.6 45.0 57.2 Total 58.1 Acres Burned (millions) Federal 4.6 FS 2.3 DOI 2.3 50.5 3.1 0.6 2.3 59.0 7.1 4.8 2.3 59.0 5.2 4.1 1.0 69.0 4.0 1.9 2.1 Other <0.1 <0.1 <0.1 <0.1 Nonfederal 4.1 1.6 3.1 1.9 Total 8.8 4.7 10.1 7.1 <0.1 3.6 7.6

###TEXT2###
Source: National Interagency Coordination Center (NICC) Wildland Fire Summary and Statistics annual reports. Notes: FS = Forest Service; DOI = Department of the Interior. Column totals may not sum