### Imports and Setup

In [34]:
from llama_index.core.schema import NodeWithScore
from llama_index.core.workflow import Event


class RetrieverEvent(Event):
    """Result of running retrieval.

    Returned by the workflow's ``retrieve`` step and accepted by its
    ``rerank`` step.
    """

    # NodeWithScore items handed back by the index retriever.
    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Result of running reranking on retrieved nodes.

    Returned by the workflow's ``rerank`` step and accepted by its
    ``synthesize`` step.
    """

    # NodeWithScore items kept by the reranker's ``postprocess_nodes`` call.
    nodes: list[NodeWithScore]


In [35]:
from os import listdir as os_listdir

from llama_index.core import (
    Document,
    StorageContext,
    VectorStoreIndex,
    load_index_from_storage,
)
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.workflow import Context, StartEvent, StopEvent, Workflow, step
from llama_index.readers.file import PDFReader


# Passed as `similarity_top_k` when the workflow's retrieve step builds its retriever,
# i.e. how many candidate nodes are fetched before reranking.
RETRIEVER_TOP_N = 3

class AnnexRAGWorkflow(Workflow):
    """RAG workflow over a directory of annex PDFs and booklet text files.

    The keyword arguments given to ``run`` select what happens:

    * ``run(dir_name=..., save_dir=...)`` makes ``ingest`` build a vector
      index from the files in ``dir_name`` and persist it to ``save_dir``.
    * ``run(query=...)`` makes ``retrieve`` fetch candidate nodes, which are
      then passed through ``rerank`` and ``synthesize``.

    Both ``ingest`` and ``retrieve`` accept the ``StartEvent``; each returns
    ``None`` when the keys it needs are missing.
    """

    def __init__(self, llm, embed_model, reranker_model, index_path=None, **workflow_kwargs):
        """Store the models and, if ``index_path`` is given, load the persisted index.

        Args:
            llm: LLM handed to the response synthesizer in ``synthesize``.
            embed_model: Embedding model used to build the index in ``ingest``
                and to embed queries in ``retrieve``.
            reranker_model: Object whose ``postprocess_nodes(nodes, query_str=...)``
                is called in ``rerank``.
            index_path: Directory holding a previously persisted index. When
                ``None``, no index is loaded and ``ingest`` must run before
                any query can be answered.
            **workflow_kwargs: Forwarded unchanged to ``Workflow.__init__``
                (for example ``timeout`` — presumably needed for long ingests;
                confirm against the installed llama-index version).
        """
        super().__init__(**workflow_kwargs)
        self.index_id = "vector_index_for_annex"
        self.llm = llm
        self.embed_model = embed_model
        self.reranker_model = reranker_model
        self.index_path = index_path
        if self.index_path is None:
            self.index = None
        else:
            storage_context = StorageContext.from_defaults(persist_dir=self.index_path)
            # Attach the caller's embedding model to the loaded index so it does
            # not fall back to a globally configured default.
            self.index = load_index_from_storage(
                storage_context,
                index_id=self.index_id,
                embed_model=self.embed_model,
            )

        # Source file name -> public URL; copied into each document's metadata
        # by `ingest`.
        self.url_mapping = {
            "annexb1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb1.pdf",
            "annexb2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexb2.pdf",
            "annexc1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexc1.pdf",
            "annexc2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexc2.pdf",
            "annexd1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexd1.pdf",
            "annexe1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexe1.pdf",
            "annexe2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexe2.pdf",
            "annexf1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf1.pdf",
            "annexf2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf",
            "annexf3.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf",
            "annexf4.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf4.pdf",
            "annexg1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexg1.pdf",
            "annexg2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexg2.pdf",
            "annexh1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexh1.pdf",
            "annexh2.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexh2.pdf",
            "annexi1.pdf": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexi1.pdf",
            "budget_booklet_pg6_pg7_calendar.txt": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/fy2024_disbursement_calendar_english.pdf",
            "budget_booklet_pg8_household_support.txt": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/fy2024_support_for_singaporeans_english.pdf",
            "budget_booklet_pg8_individual_support.txt": "https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/fy2024_support_for_singaporeans_english.pdf",
        }

    @step
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Entry point to ingest documents, triggered by a StartEvent with `dir_name` and `save_dir`.

        Every ``.pdf`` and ``.txt`` file directly inside ``dir_name`` becomes
        one ``Document`` (PDF pages are joined with blank lines); other files
        are skipped. The resulting index is persisted to ``save_dir`` under
        ``self.index_id``.

        Returns:
            ``None`` when either key is missing or empty; otherwise a
            ``StopEvent`` whose result is ``(index, save_dir)``.
        """
        dir_name = ev.get("dir_name")
        save_dir = ev.get("save_dir")
        if not dir_name or not save_dir:
            return None

        # Load documents (sorted so the ingest order is reproducible across runs)
        loader = PDFReader()
        documents = []
        for filename in sorted(os_listdir(dir_name)):
            file_path = f"{dir_name}/{filename}"
            if filename.endswith(".pdf"):
                doc_pages = loader.load_data(file_path)
                doc_text = "\n\n".join(d.get_content() for d in doc_pages)

            elif filename.endswith(".txt"):
                # Explicit encoding: the platform default is not UTF-8 everywhere.
                with open(file_path, "r", encoding="utf-8") as f:
                    doc_text = f.read()

            else:
                continue

            new_doc = Document(
                text=doc_text,
                metadata={
                    "filename": filename,
                    # Files without a known public URL get an empty string.
                    "url": self.url_mapping.get(filename, ""),
                },
            )
            documents.append(new_doc)
            print(f"Loaded document: {new_doc.metadata}")

        # Make index. The embedding model is passed explicitly so documents are
        # embedded with the same model `retrieve` later uses for queries.
        self.index = VectorStoreIndex.from_documents(
            documents,
            transformations=[TokenTextSplitter(chunk_size=8191, chunk_overlap=0, separator=" ")],
            embed_model=self.embed_model,
        )

        # Save index to disk
        self.index.set_index_id(self.index_id)
        self.index.storage_context.persist(save_dir)

        # Set attributes and return results
        self.index_path = save_dir
        return StopEvent(result=(self.index, self.index_path))

    @step
    async def retrieve(self, ctx: Context, ev: StartEvent) -> RetrieverEvent | None:
        """Entry point for RAG, triggered by a StartEvent with `query`.

        Stores the query in the workflow context for the later steps, then
        asks the index for the top ``RETRIEVER_TOP_N`` nodes.

        Returns:
            ``None`` when no query was supplied or no index is loaded;
            otherwise a ``RetrieverEvent`` carrying the retrieved nodes.
        """
        query = ev.get("query")
        if not query:
            return None

        print(f"Query the database with: {query}")

        # store the query in the global context
        await ctx.set("query", query)
        if self.index is None:
            print("Index is empty, load some documents before querying!")
            return None

        retriever = self.index.as_retriever(
            similarity_top_k=RETRIEVER_TOP_N,
            embed_model=self.embed_model,
        )
        nodes = await retriever.aretrieve(query)
        print(f"Retrieved {len(nodes)} nodes.")
        return RetrieverEvent(nodes=nodes)

    @step
    async def rerank(self, ctx: Context, ev: RetrieverEvent) -> RerankEvent:
        """Rerank the retrieved nodes against the stored query.

        Returns:
            A ``RerankEvent`` with whatever ``postprocess_nodes`` returned,
            which may be an empty list.
        """
        # Read the query once and reuse it for both the log line and the reranker.
        query = await ctx.get("query", default=None)
        print(query, flush=True)
        new_nodes = self.reranker_model.postprocess_nodes(ev.nodes, query_str=query)
        print(f"Reranked nodes to {len(new_nodes)}")

        # TODO: Remove later once debugging is done
        print(ev.nodes)
        print("---")

        for node in ev.nodes:
            print(node.metadata)
            print("---------------")
        print()

        print(new_nodes)
        print("---")
        # The reranker may keep nothing; indexing [0] unguarded would raise IndexError.
        if new_nodes:
            print(new_nodes[0].text)
        print()

        return RerankEvent(nodes=new_nodes)

    @step
    async def synthesize(self, ctx: Context, ev: RerankEvent) -> StopEvent:
        """Return a streaming response using reranked nodes.

        The query is read back from the workflow context (stored there by
        ``retrieve``) and answered with a ``CompactAndRefine`` synthesizer
        created with ``streaming=True``.
        """
        summarizer = CompactAndRefine(llm=self.llm, streaming=True, verbose=True)
        query = await ctx.get("query", default=None)

        response = await summarizer.asynthesize(query, nodes=ev.nodes)
        return StopEvent(result=response)

In [36]:
from llama_index.core.postprocessor.llm_rerank import LLMRerank
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI

# The OpenAI LLM is configured through `model`, the keyword the reranker's LLM
# already used in this cell. NOTE(review): `model_name=` on the LLM was presumably
# ignored, leaving the library default model in use — confirm against the
# installed llama-index-llms-openai version.
llm = OpenAI(model="gpt-4o-mini", temperature=0)
embed_model = OpenAIEmbedding(model="text-embedding-3-small")

# Reuse the same LLM for reranking so answers and reranking share one configuration.
reranker_model = LLMRerank(choice_batch_size=5, top_n=1, llm=llm)

# Load the previously persisted annex index and wire the models into the workflow.
workflow = AnnexRAGWorkflow(
    llm=llm,
    embed_model=embed_model,
    reranker_model=reranker_model,
    index_path="../data/index_storage_for_annex",
)

In [37]:
# Sanity check on the loaded index: how many node ids will the retriever search over?
index = workflow.index
# Same top-k as the workflow's retrieve step, instead of repeating the literal 3.
retriever = index.as_retriever(similarity_top_k=RETRIEVER_TOP_N)
# NOTE(review): `_node_ids` is a private retriever attribute and may change
# between llama-index releases — confirm it still exists after upgrading.
print(len(retriever._node_ids))

19


### Run Queries

In [38]:
# Run a query
result = await workflow.run(query="Am I eligible for the Majulah Package?")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the Majulah Package?
Retrieved 3 nodes.
Am I eligible for the Majulah Package?
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='54139a91-ae95-4cb8-a494-3974586bb1d1', embedding=None, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='12bf3cbb-3b3a-41cb-a2ca-0246b8ed980a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, hash='8cd7166746287ea4d4f5268a1c17956bcc67ee3f43e1a71ef57bdc7a5d2d5a0f')}, text='1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially \nthose born in 1960 to 1973 (“Young Seniors”, currently in thei

In [39]:
# Run a query
result = await workflow.run(query="Am I eligible for the Earn and Save Bonus in the Majulah Package?")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the Earn and Save Bonus in the Majulah Package?
Retrieved 3 nodes.
Am I eligible for the Earn and Save Bonus in the Majulah Package?
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='54139a91-ae95-4cb8-a494-3974586bb1d1', embedding=None, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='12bf3cbb-3b3a-41cb-a2ca-0246b8ed980a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, hash='8cd7166746287ea4d4f5268a1c17956bcc67ee3f43e1a71ef57bdc7a5d2d5a0f')}, text='1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 1973 or earlier, especially \nthose bo

In [40]:
# Run a query
result = await workflow.run(query="What are the quantums I will get for the Earn and Save Bonus in the Majulah Package?")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the quantums I will get for the Earn and Save Bonus in the Majulah Package?
Retrieved 3 nodes.
What are the quantums I will get for the Earn and Save Bonus in the Majulah Package?
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='54139a91-ae95-4cb8-a494-3974586bb1d1', embedding=None, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='12bf3cbb-3b3a-41cb-a2ca-0246b8ed980a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, hash='8cd7166746287ea4d4f5268a1c17956bcc67ee3f43e1a71ef57bdc7a5d2d5a0f')}, text='1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide Singapore Citizens born in 

In [41]:
# Run a query
result = await workflow.run(query="What are the payouts I can expect to receive in December 2024?")
print("------------------------------")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the payouts I can expect to receive in December 2024?
Retrieved 3 nodes.
What are the payouts I can expect to receive in December 2024?
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='93560fa6-e2ed-440b-a611-dd0b365f39c4', embedding=None, metadata={'filename': 'budget_booklet_pg6_pg7_calendar.txt', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/fy2024_disbursement_calendar_english.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7b849e72-691e-4ffe-a53d-4e74190d1924', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'budget_booklet_pg6_pg7_calendar.txt', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/fy2024_disbursement_calendar_english.pdf'}, hash='ff8ce75984a700e9b89fbe1e2cf37ec46faf2ad6c5046d425cfab337138bac8f')}, text="Assurance For You\nThese are the cash and other benefits to

In [42]:
# Run a query
result = await workflow.run(query="What are the quantums I will get for the Medisave Bonus in the Majulah Package?")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the quantums I will get for the Medisave Bonus in the Majulah Package?
Retrieved 3 nodes.
What are the quantums I will get for the Medisave Bonus in the Majulah Package?
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='1e7e1649-2d6d-4491-b3fc-6fa0979ce8af', embedding=None, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='74f9973f-80ea-491f-a0a4-1dadd7c4cb1d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, hash='2abf149bc82c595b8e804a0fdca4692e546b84ec68acccc75d021f8f47ee52e1')}, text='1 \nMINISTRY OF FINANCE \nANNEX F-3: ONE-TIME MEDISAVE BONUS \n \nAs part of the Government’s efforts to help Singaporeans offset h

In [43]:
# Run a query
result = await workflow.run(query="What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1955.")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1955.
Retrieved 3 nodes.
What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1955.
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='54139a91-ae95-4cb8-a494-3974586bb1d1', embedding=None, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='12bf3cbb-3b3a-41cb-a2ca-0246b8ed980a', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf2.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf2.pdf'}, hash='8cd7166746287ea4d4f5268a1c17956bcc67ee3f43e1a71ef57bdc7a5d2d5a0f')}, text='1 \nMINISTRY OF FINANCE \nANNEX F-2: MAJULAH PACKAGE \n \nThe Majulah Package aims to provide

In [44]:
# Run a query
result = await workflow.run(query="What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $26,000.")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $26,000.
Retrieved 3 nodes.
What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $26,000.
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='1e7e1649-2d6d-4491-b3fc-6fa0979ce8af', embedding=None, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='74f9973f-80ea-491f-a0a4-1dadd7c4cb1d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, hash='2abf149bc82c595b8e804a0fdca4692e546b84ec68acccc75d021f8f47ee52e1

In [45]:
# Run a query
result = await workflow.run(query="What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $30,000.")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $30,000.
Retrieved 3 nodes.
What are the quantums I will get for the Medisave Bonus in the Majulah Package? I am born in 1965 and have 1 property with an annual value of $30,000.
Reranked nodes to 1
[NodeWithScore(node=TextNode(id_='1e7e1649-2d6d-4491-b3fc-6fa0979ce8af', embedding=None, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='74f9973f-80ea-491f-a0a4-1dadd7c4cb1d', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'filename': 'annexf3.pdf', 'url': 'https://www.mof.gov.sg/docs/librariesprovider3/budget2024/download/pdf/annexf3.pdf'}, hash='2abf149bc82c595b8e804a0fdca4692e546b84ec68acccc75d021f8f47ee52e1