In [1]:
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv(".env"))  # read local .env file

In [2]:
from pathlib import Path

In [3]:
from llama_index.llms import AzureOpenAI
from llama_index.multi_modal_llms.azure_openai import AzureOpenAIMultiModal

In [4]:
from llama_index.llms import AzureOpenAI
from llama_index.embeddings import AzureOpenAIEmbedding
from llama_index import ServiceContext
from llama_index import set_global_service_context


def set_azure(
    llm_deployment="gpt-4-1106",
    embedding_deployment="text-embedding-ada-002",
    api_version="2023-07-01-preview",
):
    """Install an Azure OpenAI LLM and embedding model as the global service context.

    Args:
        llm_deployment: Name of the Azure deployment used for chat completion.
        embedding_deployment: Name of the Azure deployment used for embeddings.
        api_version: Azure OpenAI API version passed to both clients.

    Returns:
        None. The function only registers the service context globally, so
        calling it as the last statement of a cell renders no output.
    """
    llm = AzureOpenAI(
        azure_deployment=llm_deployment,
        openai_api_version=api_version,
    )

    # You need to deploy your own embedding model as well as your own chat completion model
    embed_model = AzureOpenAIEmbedding(
        deployment_name=embedding_deployment,
        api_version=api_version,
    )

    service_context = ServiceContext.from_defaults(
        llm=llm,
        embed_model=embed_model,
    )

    set_global_service_context(service_context)


set_azure()

## Set up the processor and the parser

In [5]:
# Both deployments are addressed with the same Azure OpenAI API version.
azure_api_version = "2023-07-01-preview"

# Text-only model.
text_model = AzureOpenAI(
    azure_deployment="gpt-4-1106", openai_api_version=azure_api_version
)

# Multi-modal model; responses are capped at 4096 new tokens.
vision_model = AzureOpenAIMultiModal(
    azure_deployment="gpt-4-v",
    openai_api_version=azure_api_version,
    max_new_tokens=4096,
)

In [6]:
from pdf_processor.core import MathpixProcessor, MathpixResultParser
from pdf_processor.core import MathpixResult, PdfResult

In [7]:
# Processor with default settings; presumably the client for the Mathpix API
# (it is what the commented-out submit/await cell below would call) -- TODO confirm.
processor = MathpixProcessor()
# The parser is given the text model and the vision model, in that order.
parser = MathpixResultParser(text_model, vision_model)

## Load example PDF

In [8]:
# Example PDF to work on; the path is relative to the current working directory.
pdf_path = Path("inbox/2312.10997.pdf")

In [ ]:
# One results directory per PDF, named after the file without its extension.
result_path = Path("results") / pdf_path.stem
result_path.mkdir(parents=True, exist_ok=True)

## Run PDF through Mathpix API

The Mathpix API call is skipped here (the cells are commented out); the previously saved result is loaded from JSON instead.

In [9]:
# mathpix_result = processor.submit_pdf(pdf_path)
# mathpix_result = processor.await_result(mathpix_result, timeout_s=120)

In [11]:
# with result_path.joinpath("mathpix_result.json").open("w") as f:
#     f.write(mathpix_result.model_dump_json())

In [None]:
# Load the stored Mathpix result from disk and validate it into a MathpixResult.
mathpix_json = result_path.joinpath("mathpix_result.json").read_text()
mathpix_result = MathpixResult.model_validate_json(mathpix_json)

## Parse Mathpix output through the parser

As with the Mathpix step, parsing is skipped here and the previously saved result is loaded from JSON.

In [None]:
# pdf_result = parser.parse_result(mathpix_result)

In [None]:
# with result_path.joinpath("pdf_result-2.json").open("w") as f:
#     f.write(pdf_result.model_dump_json())

In [12]:
# Load the stored parser output from disk and validate it into a PdfResult.
pdf_json = result_path.joinpath("pdf_result-2.json").read_text()
pdf_result = PdfResult.model_validate_json(pdf_json)

## Convert PdfResult into LlamaIndex nodes

In [35]:
from llama_index.schema import TextNode, ImageNode, IndexNode, NodeRelationship

In [36]:
# Two independent accumulators: the nodes holding document content, and the
# index nodes that refer to a base node by its id.
base_nodes, index_nodes = [], []

In [25]:
from llama_index.node_parser import SentenceSplitter
from llama_index import Document


# Split every text chunk into sentence-based nodes and express each node's
# character span in document coordinates instead of chunk coordinates.
splitter = SentenceSplitter()
for text_chunk in pdf_result.text:
    doc = Document(text=text_chunk.processed_content)
    nodes = splitter.get_nodes_from_documents([doc])

    for node in nodes:
        if node.start_char_idx is None or node.end_char_idx is None:
            # The splitter could not locate the node inside the chunk text;
            # fall back to the span of the whole chunk so the node still has
            # a usable (and sortable) position.
            node.start_char_idx = text_chunk.start
            node.end_char_idx = text_chunk.end
            continue

        # Both offsets are relative to the beginning of the chunk, so both are
        # shifted by the chunk's start. (Shifting the end by `text_chunk.end`
        # would push it past the node's real position.)
        # NOTE(review): assumes offsets within `processed_content` line up with
        # the coordinates used by `text_chunk.start` -- confirm.
        node.start_char_idx += text_chunk.start
        node.end_char_idx += text_chunk.start

    base_nodes.extend(nodes)

In [26]:
# Each table yields two nodes over the same character span: a base node with
# the raw content, and an index node with the processed content that refers
# back to the base node through its id.
for table in pdf_result.tables:
    span = {"start_char_idx": table.start, "end_char_idx": table.end}

    raw_node = TextNode(text=table.raw_content, **span)
    processed_node = TextNode(text=table.processed_content, **span)

    base_nodes.append(raw_node)
    index_nodes.append(
        IndexNode.from_text_node(node=processed_node, index_id=raw_node.node_id)
    )


**TODO:** Decide how image chunks should be represented as nodes; the draft below is disabled until then.

In [None]:
# for image_chunk in pdf_result.images:
#     base_node = ImageNode(  # no text here
#         image=image_chunk.file_b64,
#         image_path=image_chunk.filename,
#         image_mimetype="image/jpeg",
#         start_char_idx=image_chunk.start,
#         end_char_idx=image_chunk.end,
#     )
#     index_node = IndexNode.from_text_node(
#         node=TextNode(
#             text=image_chunk.processed_content,
#             start_char_idx=image_chunk.start,
#             end_char_idx=image_chunk.end,
#         ),
#         index_id=base_node.node_id,
#     )
#     base_nodes.append(base_node)
#     index_nodes.append(index_node)


Set the previous/next relationships between neighboring nodes

In [27]:
def find_node_by_id(in_nodes, node_id):
    """Return the first node in ``in_nodes`` whose ``node_id`` equals ``node_id``.

    Returns ``None`` when no node matches.
    """
    matches = (candidate for candidate in in_nodes if candidate.node_id == node_id)
    return next(matches, None)

In [28]:
# Order the nodes by their starting character offset so that neighbours in the
# list are also neighbours in the document.
base_nodes = sorted(base_nodes, key=lambda node: node.start_char_idx)

# Link every adjacent pair of nodes in both directions.
for earlier, later in zip(base_nodes, base_nodes[1:]):
    earlier.relationships[NodeRelationship.NEXT] = later.as_related_node_info()
    later.relationships[NodeRelationship.PREVIOUS] = earlier.as_related_node_info()

# Build the id lookup once instead of scanning `base_nodes` for every index node.
base_nodes_by_id = {}
for base_node in base_nodes:
    base_nodes_by_id.setdefault(base_node.node_id, base_node)  # first match wins

# Each index node takes over the neighbours of the base node it refers to.
for index_node in index_nodes:
    target = base_nodes_by_id.get(index_node.index_id)
    if target is None:
        raise ValueError(
            f"Index node {index_node.node_id!r} refers to unknown base node "
            f"{index_node.index_id!r}"
        )
    # Copy the mapping so the index node and its base node do not share one
    # mutable dict; otherwise changing one would silently change the other.
    index_node.relationships = dict(target.relationships)


## Set up LlamaIndex hierarchical retrieval

In [29]:
from llama_index.node_parser import UnstructuredElementNodeParser

# Hand base nodes and index nodes to the parser as one list; it returns the
# nodes to index and the id -> node mappings used by the recursive retriever.
node_parser = UnstructuredElementNodeParser()
combined_nodes = [*base_nodes, *index_nodes]
new_base_nodes, node_mappings = node_parser.get_base_nodes_and_mappings(combined_nodes)

In [30]:
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import VectorStoreIndex

In [32]:
# construct top-level vector index + query engine
vector_index = VectorStoreIndex(new_base_nodes)
vector_retriever = vector_index.as_retriever(similarity_top_k=1)
vector_query_engine = vector_index.as_query_engine(similarity_top_k=1)

In [33]:
from llama_index.retrievers import RecursiveRetriever

# Retrieval starts at the retriever registered under `root_id`; the node
# mappings built earlier are supplied as `node_dict`.
root_id = "vector"
recursive_retriever = RecursiveRetriever(
    root_id,
    retriever_dict={root_id: vector_retriever},
    node_dict=node_mappings,
    verbose=True,
)
query_engine = RetrieverQueryEngine.from_args(recursive_retriever)

In [34]:
# Ask one question and print the response as text.
question = "What paradigms of RAG exist?"
response = query_engine.query(question)
print(response)

There are three paradigms of RAG: Naive RAG, Advanced RAG, and Modular RAG. Advanced RAG is a specialized form of Modular RAG, and Naive RAG is a special case of Advanced RAG.
