# Basic RAG with llama-index

- LLM: GPT-4o mini
- Embedder: BAAI/bge-small-en-v1.5
- Reranker: BAAI/bge-reranker-base

### Imports

In [1]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.openai import OpenAI

from llama_index.core.workflow import Event
from llama_index.core.schema import NodeWithScore


from llama_index.postprocessor.flag_embedding_reranker import FlagEmbeddingReranker

from llama_index.core.response_synthesizers import CompactAndRefine
from llama_index.core.workflow import (
    Context,
    Workflow,
    StartEvent,
    StopEvent,
    step,
)

from dotenv import load_dotenv
import nest_asyncio

### Initial Setup

In [2]:
# Load environment variables from the project-level .env file
# (presumably this is where the OpenAI API key lives — confirm).
load_dotenv("../.env")
# Patch asyncio so event loops can be nested inside the notebook's own loop.
nest_asyncio.apply()

In [3]:
# GPT-4o mini from OpenAI. The constructor keyword is `model` (the same
# spelling used for OpenAI(...) elsewhere in this notebook); `model_name` is
# not the documented keyword, so the requested model may not be picked up.
llm = OpenAI(model="gpt-4o-mini", temperature=0)
# Settings.llm = llm # check if this is needed

# Embedding model: BAAI/bge-small-en-v1.5 (loaded through HuggingFace)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
# Settings.embed_model = embed_model # check if this is needed

# Reranker: BAAI/bge-reranker-base, keeping at most the 3 best nodes
reranker_model = FlagEmbeddingReranker(model="BAAI/bge-reranker-base", top_n=3)

### RAG with llama-index Workflows

In [4]:
class RetrieverEvent(Event):
    """Event emitted by the retrieval step and consumed by the rerank step.

    Carries the scored nodes returned by the retriever.
    """

    # Nodes (with similarity scores) fetched for the current query.
    nodes: list[NodeWithScore]


class RerankEvent(Event):
    """Event emitted by the rerank step and consumed by the synthesis step.

    Carries the nodes that remain after reranking the retrieved nodes.
    """

    # Nodes (with scores) kept by the reranker.
    nodes: list[NodeWithScore]

In [5]:
class RAGWorkflow(Workflow):
    """Workflow with two entry points: document ingestion and RAG querying.

    A run started with `dirname` builds a vector index from that directory,
    keeps it on the instance and returns it. A run started with `query`
    retrieves nodes from the index held on the instance, reranks them and
    returns a streaming synthesized answer.
    """

    def __init__(
        self,
        llm,
        embed_model,
        reranker_model,
        index=None,
        similarity_top_k: int = 2,
        **workflow_kwargs,
    ):
        """Store the models and settings used by the steps.

        Args:
            llm: LLM handed to the response synthesizer.
            embed_model: embedding model used when building the index.
            reranker_model: node postprocessor applied to the retrieved nodes.
            index: optional pre-built index; when omitted, an ingestion run
                must happen before any query.
            similarity_top_k: number of nodes fetched by the retriever
                (defaults to the previously hard-coded 2).
            **workflow_kwargs: forwarded unchanged to `Workflow.__init__`
                (for example `timeout` or `verbose`).
        """
        super().__init__(**workflow_kwargs)
        self.llm = llm
        self.embed_model = embed_model
        self.reranker_model = reranker_model
        self.index = index
        self.similarity_top_k = similarity_top_k

    @step
    async def ingest(self, ctx: Context, ev: StartEvent) -> StopEvent | None:
        """Entry point to ingest a document, triggered by a StartEvent with `dirname`.

        Returns a StopEvent whose result is the newly built index, or None
        when the start event carries no `dirname`.
        """
        dirname = ev.get("dirname")
        if not dirname:
            return None

        documents = SimpleDirectoryReader(dirname).load_data()
        # Keep the index on the instance so later query runs can use it.
        self.index = VectorStoreIndex.from_documents(
            documents=documents,
            embed_model=self.embed_model,
        )
        return StopEvent(result=self.index)

    @step
    async def retrieve(
        self, ctx: Context, ev: StartEvent
    ) -> RetrieverEvent | None:
        """Entry point for RAG, triggered by a StartEvent with `query`.

        Returns a RetrieverEvent with the retrieved nodes, or None when the
        start event carries no `query` or no index has been built yet.
        """
        query = ev.get("query")
        if not query:
            return None

        print(f"Query the database with: {query}")

        # store the query in the global context for the downstream steps
        await ctx.set("query", query)

        if self.index is None:
            # NOTE(review): returning None emits no event from this step;
            # confirm how the run terminates in that case.
            print("Index is empty, load some documents before querying!")
            return None

        retriever = self.index.as_retriever(
            similarity_top_k=self.similarity_top_k
        )
        nodes = await retriever.aretrieve(query)
        print(f"Retrieved {len(nodes)} nodes.")
        return RetrieverEvent(nodes=nodes)

    @step
    async def rerank(self, ctx: Context, ev: RetrieverEvent) -> RerankEvent:
        """Rerank the retrieved nodes against the stored query."""
        # Read the query once and reuse it for both the log line and the call.
        query = await ctx.get("query", default=None)
        print(query, flush=True)
        new_nodes = self.reranker_model.postprocess_nodes(
            ev.nodes, query_str=query
        )
        print(f"Reranked nodes to {len(new_nodes)}")
        return RerankEvent(nodes=new_nodes)

    @step
    async def synthesize(self, ctx: Context, ev: RerankEvent) -> StopEvent:
        """Return a streaming response using reranked nodes."""
        # Use the LLM given to the constructor rather than a notebook global.
        summarizer = CompactAndRefine(llm=self.llm, streaming=True, verbose=True)
        query = await ctx.get("query", default=None)

        response = await summarizer.asynthesize(query, nodes=ev.nodes)

        # Debug dump of the nodes fed to the synthesizer. Iterating instead of
        # indexing [0] and [1] avoids an IndexError when fewer than two nodes
        # survive reranking; with exactly two nodes the output is unchanged.
        print(ev.nodes)
        print("---")
        for position, node in enumerate(ev.nodes):
            if position:
                print("---------------")
            print(node.text)
        return StopEvent(result=response)

### Data Ingestion

In [6]:
# Build a workflow around the models loaded above. No index is passed, so the
# workflow starts without one until the ingestion run below.
w = RAGWorkflow(
    llm=llm,
    embed_model=embed_model,
    reranker_model=reranker_model
)

# Ingest the documents: a run started with `dirname` reads that directory,
# builds the vector index and returns it as the run result.
index = await w.run(dirname="../data/budget_statement_annex")

In [10]:
# Inspect the document store attached to the index returned by ingestion.
index.docstore

<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore at 0x1f7e6360310>

In [13]:
# List the instance attribute names of the index (exploration only).
for key in index.__dict__:
    print(key)

_use_async
_store_nodes_override
_embed_model
_insert_batch_size
_storage_context
_docstore
_show_progress
_vector_store
_graph_store
_callback_manager
_object_map
_index_struct
_transformations


In [11]:
# Build a standalone retriever with the same top-k (2) that the workflow's
# retrieve step uses, so it can be inspected outside the workflow.
retriever = index.as_retriever(similarity_top_k=2)
retriever

<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x1f7e5382a90>

In [12]:
# List the instance attribute names of the retriever (exploration only).
for key in retriever.__dict__:
    print(key)

_index
_vector_store
_embed_model
_docstore
_similarity_top_k
_vector_store_query_mode
_alpha
_node_ids
_doc_ids
_filters
_sparse_top_k
_hybrid_top_k
_kwargs
callback_manager
object_map
_verbose


In [22]:
# Count the entries in the vector store's embedding dict.
# NOTE(review): this reaches through underscore-prefixed attributes, so it may
# break across llama-index versions — exploration only.
len(retriever._vector_store.data.embedding_dict)

64

### Querying

In [15]:
# Run a query: a run started with `query` goes through retrieve, rerank and
# synthesize, and returns a streaming response.
result = await w.run(query="Am I eligible for the Majulah Package?")
# Print the answer piece by piece as it is generated.
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the Majulah Package?
Retrieved 2 nodes.
Am I eligible for the Majulah Package?


pre tokenize:   0%|          | 0/1 [00:00<?, ?it/s]

You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 179.57it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='f1e29b71-bfdb-47de-be1e-2353a90652c9', embedding=None, metadata={'page_label': '1', 'file_name': 'annexf2.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf2.pdf', 'file_type': 'application/pdf', 'file_size': 131302, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e1cd33f8-7676-43a0-9ada-542fa8b04ca6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'annexf2.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf2.pdf', 'file_type': 'application/pdf', '

In [16]:
# Run a narrower query about one component of the package; the streaming
# response is produced by the same workflow instance and index as above.
result = await w.run(query="Am I eligible for the Earn and Save Bonus in the Majulah Package?")
# Print the answer piece by piece as it is generated.
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the Earn and Save Bonus in the Majulah Package?
Retrieved 2 nodes.
Am I eligible for the Earn and Save Bonus in the Majulah Package?


pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 325.37it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='f1e29b71-bfdb-47de-be1e-2353a90652c9', embedding=None, metadata={'page_label': '1', 'file_name': 'annexf2.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf2.pdf', 'file_type': 'application/pdf', 'file_size': 131302, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='e1cd33f8-7676-43a0-9ada-542fa8b04ca6', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'annexf2.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf2.pdf', 'file_type': 'application/pdf', '

In [17]:
# Run a query about MediSave Bonus eligibility against the same index.
result = await w.run(query="Am I eligible for the MediSave Bonus in the Majulah Package?")
# Print the answer piece by piece as it is generated.
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the MediSave Bonus in the Majulah Package?
Retrieved 2 nodes.
Am I eligible for the MediSave Bonus in the Majulah Package?


pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 498.97it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='ff844bfa-b38c-4105-817d-8fef9dced7e6', embedding=None, metadata={'page_label': '1', 'file_name': 'annexf3.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf3.pdf', 'file_type': 'application/pdf', 'file_size': 72890, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7f84bef4-e231-42f5-8922-9927d6c6dbee', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'annexf3.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf3.pdf', 'file_type': 'application/pdf', 'f

In [None]:
# Run a query about the MediSave Bonus amount against the same index.
# NOTE(review): the output saved under this cell shows a different query
# string than the one below, so it appears to be stale — re-run the cell.
result = await w.run(query="How much MediSave Bonus can I get from the Majulah Package?")
# Print the answer piece by piece as it is generated.
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: Am I eligible for the MediSave Bonus in the Majulah Package?
Retrieved 2 nodes.
Am I eligible for the MediSave Bonus in the Majulah Package?


pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 399.99it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='ff844bfa-b38c-4105-817d-8fef9dced7e6', embedding=None, metadata={'page_label': '1', 'file_name': 'annexf3.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf3.pdf', 'file_type': 'application/pdf', 'file_size': 72890, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7f84bef4-e231-42f5-8922-9927d6c6dbee', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'annexf3.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\budget_statement_annex\\annexf3.pdf', 'file_type': 'application/pdf', 'f

In [None]:
# Ingest the documents: a separate workflow instance gets its own index, built
# from the budget statement and speech folder.
# NOTE(review): the saved output lists files under a `statement_and_speech`
# folder rather than `budget_statement_and_speech` — confirm the directory
# name and re-run.
w_speech = RAGWorkflow(
    llm=llm,
    embed_model=embed_model,
    reranker_model=reranker_model
)
await w_speech.run(dirname="../data/budget_statement_and_speech")

# Run a query against the speech index and get a streaming response.
result = await w_speech.run(query="What are the Key reasons for high inflation over the last two years?")
# Separator between the workflow's debug prints and the streamed answer.
print("------------------------------")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the Key reasons for high inflation over the last two years?
Retrieved 2 nodes.
What are the Key reasons for high inflation over the last two years?


pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 499.68it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='7f0c1b43-6ed2-49c7-bedd-76aa0497d49e', embedding=None, metadata={'page_label': '3', 'file_name': 'fy2024_budget_debate_round_up_speech.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\statement_and_speech\\fy2024_budget_debate_round_up_speech.pdf', 'file_type': 'application/pdf', 'file_size': 483783, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='df8e62de-6922-4a0d-88e7-a0690b2f0adc', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '3', 'file_name': 'fy2024_budget_debate_round_up_speech.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budg

In [None]:
# Ingest the documents: a separate workflow instance gets its own index, built
# from the booklet folder.
w_booklet = RAGWorkflow(
    llm=llm,
    embed_model=embed_model,
    reranker_model=reranker_model
)
await w_booklet.run(dirname="../data/booklet")

# Run a query against the booklet index and get a streaming response.
result = await w_booklet.run(query="What are the payouts I can expect to receive in December 2024?")
# Separator between the workflow's debug prints and the streamed answer.
print("------------------------------")
async for chunk in result.async_response_gen():
    print(chunk, end="", flush=True)

Query the database with: What are the payouts I can expect to receive in December 2024?
Retrieved 2 nodes.
What are the payouts I can expect to receive in December 2024?


pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 876.00it/s]


Reranked nodes to 2
[NodeWithScore(node=TextNode(id_='312dfc48-ebc6-4790-98d3-e36ff317fdd3', embedding=None, metadata={'page_label': '9', 'file_name': 'fy2024_infographic_budget_booklet_english.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\budget-rag\\data\\booklet\\fy2024_infographic_budget_booklet_english.pdf', 'file_type': 'application/pdf', 'file_size': 5987411, 'creation_date': '2024-11-20', 'last_modified_date': '2024-11-20'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='21568ed7-ceac-439e-8e65-ce8ed1181099', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '9', 'file_name': 'fy2024_infographic_budget_booklet_english.pdf', 'file_path': 'c:\\Users\\weilu\\Desktop\\AI Codes\\b