In [1]:
import nest_asyncio
# Patch asyncio so that nested event loops are permitted. Presumably needed
# because the notebook kernel already runs an event loop and later cells call
# async-backed libraries — confirm before removing.
nest_asyncio.apply()

In [21]:
import os
from dotenv import load_dotenv

# Pull the variables defined in a local .env file into the process environment.
load_dotenv()

# Read the provider API keys from the environment. Each name is bound to None
# when its variable is not set; no validation happens here.
openai_api_key = os.environ.get('OPENAI_API_KEY')
anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')
llama_cloud_api_key = os.environ.get('LLAMA_CLOUD_API_KEY')

In [3]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import Settings

# Models used throughout the notebook: GPT-4o for generation and
# text-embedding-3-small for embeddings.
llm = OpenAI(model="gpt-4o")
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Register both on llama_index's Settings object.
Settings.llm = llm
Settings.embed_model = embed_model

##### Using `MarkdownElementNodeParser` to parse the LlamaParse Markdown output and build a recursive-retriever query engine for generation.

In [9]:
from llama_parse import LlamaParse

# Path to the article text file inside the 'docs' directory.
text_file_path = './docs/article_2024-08-16_2024-08-29.txt'


# Send the file to LlamaParse and request the result as Markdown. The path
# variable is passed (rather than a second copy of the literal) so the file
# location is defined in exactly one place.
documents = LlamaParse(result_type="markdown").load_data(text_file_path)



Started parsing the file under job_id 7d0ca18e-660a-407c-868d-3417f465b74b
.......

In [23]:
from copy import deepcopy
from llama_index.core.schema import TextNode
from llama_index.core import VectorStoreIndex


def get_page_nodes(docs, separator="\n========================================\n"):
    """Turn every document into one TextNode per separator-delimited page.

    Args:
        docs: Iterable of documents exposing ``text`` and ``metadata``.
        separator: Exact string that marks a page boundary in ``doc.text``.

    Returns:
        A flat list of TextNode objects, in document order and then page
        order. Each node carries its own deep copy of the source document's
        metadata. Empty chunks (e.g. after a trailing separator) are kept.
    """
    return [
        TextNode(text=page_text, metadata=deepcopy(source.metadata))
        for source in docs
        for page_text in source.text.split(separator)
    ]

In [24]:
# Build one TextNode per separator-delimited page of the parsed documents.
page_nodes = get_page_nodes(documents)

In [25]:
from llama_index.core.node_parser import MarkdownElementNodeParser

# Markdown element parser. It reuses the notebook-wide `llm` (GPT-4o, defined
# in the model-setup cell) instead of constructing a second identical client,
# so the model choice lives in one place. Up to 8 workers are requested.
node_parser = MarkdownElementNodeParser(
    llm=llm, num_workers=8
)

In [None]:
# Run the Markdown element parser over the LlamaParse documents to produce nodes.
nodes = node_parser.get_nodes_from_documents(documents)

In [29]:
# Separate the parsed nodes into base nodes and objects; both are indexed later.
base_nodes, objects = node_parser.get_nodes_and_objects(nodes)

In [36]:
# Inspect the content of the first object returned by the parser.
objects[0].get_content()


'This table shows the emergency disaster response scores of four countries: the Philippines, Cambodia, Vietnam, and Thailand.,\nwith the following columns:\n- ประเทศ: None\n- คะแนนการรับมือเหตุภัยพิบัติฉุกเฉิน: None\n'

In [38]:
# Build a single vector index over the parser's base nodes, its objects,
# and the per-page text nodes.
recursive_index = VectorStoreIndex(
    nodes=[*base_nodes, *objects, *page_nodes],
)

In [None]:
# Spot-check a single page node. The index 31 is hard-coded, so this raises
# IndexError when fewer than 32 page nodes were produced.
print(page_nodes[31].get_content())

In [42]:
from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

# Reranker backed by the BAAI/bge-reranker-large model; keeps the top 5 nodes.
reranker = FlagEmbeddingReranker(model="BAAI/bge-reranker-large", top_n=5)

# Query engine over the combined index: retrieve 5 candidates, pass them
# through the reranker, and log retrieval steps (verbose).
recursive_query_engine = recursive_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=5,
    verbose=True,
)

In [40]:
# Number of nodes produced by the Markdown element parser.
print(len(nodes))

1285


#### **Setup Baseline** ####
For comparison, we set up a naive RAG pipeline with default parsing and standard chunking, indexing, and retrieval.

In [43]:
from llama_index.core import SimpleDirectoryReader

# Baseline pipeline: load the same article file with the plain directory
# reader (no LlamaParse), index it directly, and query it with the same
# top-k and reranker as the recursive engine.
reader = SimpleDirectoryReader(
    input_files=["./docs/article_2024-08-16_2024-08-29.txt"],
)
base_docs = reader.load_data()

raw_index = VectorStoreIndex.from_documents(base_docs)
raw_query_engine = raw_index.as_query_engine(
    node_postprocessors=[reranker],
    similarity_top_k=5,
)

In [44]:
# Interactive comparison loop: every query is answered by both the baseline
# engine and the LlamaParse + recursive-retriever engine, and the two
# responses are printed one after the other.
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " are still
        # recognised and whitespace-only input counts as empty.
        query = input("Enter your query (or type 'exit' to quit): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the prompt was interrupted: stop cleanly
        # instead of surfacing a traceback.
        print("Exiting the query loop.")
        break

    # Exit the loop if the user types 'exit' (case-insensitive)
    if query.lower() == 'exit':
        print("Exiting the query loop.")
        break

    # An empty query would trigger two pointless LLM calls; prompt again.
    if not query:
        continue

    # Execute the query using the raw (baseline) query engine
    response_1 = raw_query_engine.query(query)
    print("\n***********Basic Query Engine***********")
    print(response_1)

    # Execute the query using the recursive query engine
    response_2 = recursive_query_engine.query(query)
    print("\n***********New LlamaParse+ Recursive Retriever Query Engine***********")
    print(response_2)



***********Basic Query Engine***********
The news covers a span of 14 days.

***********New LlamaParse+ Recursive Retriever Query Engine***********
The news covers events from August 26, 2024.

***********Basic Query Engine***********
There are no news articles related to politics from the 25th of August in the provided information. The articles listed are from dates ranging from the 16th to the 23rd of August.
[1;3;38;2;11;159;203mRetrieval entering c8452cce-0477-45a4-a946-5c3c0491ff02: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you highlight news related to poltics from 25th of Auguest? list them out and explain
[0m[1;3;38;2;11;159;203mRetrieval entering a3aee2f5-3123-4710-8a06-79a949972f04: TextNode
[0m[1;3;38;2;237;90;200mRetrieving from object TextNode with query can you highlight news related to poltics from 25th of Auguest? list them out and explain
[0m[1;3;38;2;11;159;203mRetrieval entering 6b7f6476-13ea-4da5-a62a-aa6feed74d40: Tex