### Custom MBA courses RAG project

In [1]:
# Make the directory one level above the notebook's working directory importable,
# so that project-local code can be imported from this notebook.
import sys
import os

# Absolute path of the parent directory (one level up from notebooks/).
project_root = os.path.abspath("..")

# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Setup necessary models for routing, chatting and embedding
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from core.config.config import Config
from google.genai import types

# Generation settings for the routing LLM: a thinking budget of 0 and the
# temperature taken from the project configuration.
router_generation_config = types.GenerateContentConfig(
    temperature = Config.LLM_TEMPERATURE,
    thinking_config = types.ThinkingConfig(thinking_budget = 0),
)

# Google GenAI model (LlamaIndex wrapper) used by the router and for plain LLM calls in this notebook.
router_llm = GoogleGenAI(
    api_key = Config.GOOGLE_API_KEY,
    model = Config.ROUTER_LLM,
    max_tokens = Config.LLM_MAX_TOKENS,
    generation_config = router_generation_config,
)

# Hugging Face embedding model; it is passed to the vector indices built further down.
embed_model = HuggingFaceEmbedding(model_name = Config.EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the collection names and descriptions from my custom JSON file
import json

docs_path = "../documents"
collections_mba_json = docs_path + "/collections_mba.json"


def _tidy_description(raw_description):
    """Return a description without its first line and its last two lines.

    The remaining lines are stripped of surrounding whitespace and re-joined
    with the separator " \\n ".
    """
    kept_lines = raw_description.splitlines()[1:-2]
    return " \n ".join(line.strip() for line in kept_lines)


# Mapping of collection name -> raw description, exactly as stored in the JSON file.
with open(collections_mba_json, "r", encoding = "utf-8") as json_file:
    raw_collections = json.load(json_file)

# Same keys (and order) as the JSON file, with every description tidied up.
COLLECTIONS_MBA = {
    name: _tidy_description(description)
    for name, description in raw_collections.items()
}

In [4]:
# Let's build our own Router Retriever on top of my personal PDF database of MBA course materials.
# Here we perform the ingestion step: we build one index per course, a retriever on top of each index,
# and a RouterRetriever from those retrievers (each wrapped inside a RetrieverTool object).
from core.config.constants import RagConstants
from core.config.config import Config

from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.tools import RetrieverTool
from llama_index.core.retrievers import RouterRetriever
from llama_index.core.selectors import LLMMultiSelector


def build_collection_retriever_tool(collection_name, collection_description):
    """Ingest one course folder and wrap its retriever in a RetrieverTool.

    Args:
        collection_name: Name of the course folder inside the documents folder.
        collection_description: Text describing the course; the selector reads it
            to decide whether this retriever fits a query.

    Returns:
        A RetrieverTool built around the vector retriever of this collection.
    """
    # Load the files of the course folder as a list of Document objects.
    documents = SimpleDirectoryReader(
        input_dir = docs_path + '/' + collection_name
    ).load_data()

    # Build a vector index over the documents: they are parsed into nodes,
    # which are then embedded with the shared embedding model.
    index = VectorStoreIndex.from_documents(
        documents = documents,
        embed_model = embed_model,
        show_progress = True
    )

    # Retriever over that index, configured with the project-wide similarity_top_k.
    retriever = index.as_retriever(similarity_top_k = Config.SIMILARITY_TOP_K)

    # The description is what lets the selector choose between the retrievers.
    return RetrieverTool.from_defaults(
        retriever = retriever,
        description = collection_description
    )


# One RetrieverTool per course listed in the previously loaded JSON file;
# every key matches the name of a course folder inside the documents folder.
retriever_tools = [
    build_collection_retriever_tool(name, description)
    for name, description in COLLECTIONS_MBA.items()
]

# LLM-based selector that picks the relevant retrievers for a given prompt.
# max_outputs is the maximum number of retrievers to retain.
multi_selector = LLMMultiSelector.from_defaults(
    prompt_template_str = RagConstants.LLM_MULTI_SELECTOR_PROMPT,
    max_outputs = Config.ROUTER_RETRIEVER_MAX_OUTPUTS,
    llm = router_llm
)

# Router that forwards a query to the retrievers chosen by the selector.
router = RouterRetriever(
    selector = multi_selector,
    llm = router_llm,
    retriever_tools = retriever_tools
)

Parsing nodes: 100%|██████████| 337/337 [00:00<00:00, 3434.87it/s]
Generating embeddings: 100%|██████████| 337/337 [00:41<00:00,  8.10it/s]
Parsing nodes: 100%|██████████| 205/205 [00:00<00:00, 599.41it/s]
Generating embeddings: 100%|██████████| 217/217 [00:43<00:00,  4.99it/s]
Parsing nodes: 100%|██████████| 203/203 [00:00<00:00, 2854.36it/s]
Generating embeddings: 100%|██████████| 209/209 [00:20<00:00, 10.02it/s]
Parsing nodes: 100%|██████████| 306/306 [00:00<00:00, 3886.05it/s]
Generating embeddings: 100%|██████████| 306/306 [00:32<00:00,  9.38it/s]
Parsing nodes: 100%|██████████| 292/292 [00:00<00:00, 2243.21it/s]
Generating embeddings: 100%|██████████| 292/292 [00:20<00:00, 14.54it/s]
Parsing nodes: 100%|██████████| 365/365 [00:00<00:00, 3335.76it/s]
Generating embeddings: 100%|██████████| 365/365 [00:42<00:00,  8.60it/s]


In [5]:
import nest_asyncio
# Patch asyncio so that nested event loops are allowed.
# NOTE(review): presumably required because router.retrieve() runs async code while
# the notebook's own event loop is already running — confirm.
nest_asyncio.apply()

# Now let's try prompting our RAG to see the retrieved nodes.
# According to our settings, we should expect no more than 15 nodes
# (NOTE(review): presumably SIMILARITY_TOP_K x ROUTER_RETRIEVER_MAX_OUTPUTS — confirm against Config).

user_query = "What is the difference between managerial and financial accounting?"
# Route the query through the LLM selector and retrieve nodes from the selected collection retriever(s).
retrieved_nodes = router.retrieve(user_query)

2025-12-13 21:39:02,829 - INFO - AFC is enabled with max remote calls: 10.
2025-12-13 21:39:03,974 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"
2025-12-13 21:39:03,983 - INFO - Selecting retriever 0: This choice explicitly states that the course is oriented towards users of financial accounting information rather than prepares, and focuses on understanding the effects of transactions on financial statements and evaluating a firm's financial position and performance. This implies a distinction between internal (managerial) and external (financial) accounting, as it focuses on the external reporting aspect..


In [17]:
# Now we need to check the relevance of the retrieved nodes and keep only those nodes
# that are relevant to the user's query.
# Are there better ways than just passing them to the LLM? Maybe thresholding? Or just pass them all as context
# and ask the LLM to judge the relevance at chat time?

# Create retrieved_nodes str for LLM to easier make a choice
retrieved_nodes_str = ""
for i in range(len(retrieved_nodes)):
    if i == len(retrieved_nodes) - 1: 
        str_to_add = "\"" + retrieved_nodes[i].id_ + "\": \"\"\"" + retrieved_nodes[i].text + "\"\"\""
    else:
        str_to_add = "\"" + retrieved_nodes[i].id_ + "\": \"\"\""  + retrieved_nodes[i].text + "\"\"\"\n"
    retrieved_nodes_str += str_to_add

print(retrieved_nodes_str)

# Check the relevance using the LLM call

relevance_check_prompt = RagConstants.LLM_RELEVANCE_CHECK_PROMPT.format(
    question = user_query,
    context = retrieved_nodes_str
)

response = await router_llm.acomplete(relevance_check_prompt)
print(response)

2025-12-13 21:57:48,250 - INFO - AFC is enabled with max remote calls: 10.


"087e48a3-7c52-467c-9ee8-1aa9f0c33833": """Two types of accounting exist: financial and managerial accounting.
What Types of Accounting Exist?A) Financial accounting: •Prepared underexternal rules (IFRS, US-GAAP)•Audited (KPMG, EY, Deloitte, PwC, etc.)•Shareholders, creditors, tax authorities, labour unions, employees•Monetary units (e.g., Tenge, USD, etc.)•“External” accounting
17
Chapter1
B) Managerial accounting: •Prepared under internal guidelines•Not audited•Managers (e.g., c-suite, CEO, CFO, etc.)•Monetary and non-monetary units (e.g., hours per product)•“Internal” accounting
Financial statementsBudget report, etc."""
"9d2f197d-9c65-4b56-a3dc-4aa55d0f645c": """3
(1) (2)
 (3)
What has this to do with accounting?
“It’s about story-telling”– Is it true for financial accounting?
Movie
(4)"""
"0a8c2762-b39c-452b-815f-00de06eef010": """Financial accounting produces reports that are prepared under rules to inform external parties.
What Is Financial Accounting?“It is the (1) information 

In [None]:
# Now, we derive the relevant context from the LLM response and construct the final
# context string that we will later use in the final LLM call to generate an answer to the user query
import re
import json

# Look-up table: node id -> node text for every retrieved node.
retrieved_nodes_dict = {node.id_: node.text for node in retrieved_nodes}

# The LLM may wrap its JSON answer in a Markdown code fence (``` or ```json);
# remove the fence before handing the text to the JSON parser.
cleaned_relevant_node_ids = re.sub(r"^```(?:json)?\s*|\s*```$", "", response.text.strip())
relevant_node_ids = json.loads(cleaned_relevant_node_ids)

# The ids are produced by an LLM, so some may not match any retrieved node
# (hallucinated or mistyped). Report and skip them instead of letting a KeyError
# abort the whole cell.
unknown_node_ids = [
    node_id for node_id in relevant_node_ids if node_id not in retrieved_nodes_dict
]
if unknown_node_ids:
    print(f"Ignoring {len(unknown_node_ids)} node id(s) that were not retrieved: {unknown_node_ids}")

# Final context: texts of the relevant nodes, in the order returned by the LLM,
# separated by blank lines.
context = "\n\n".join(
    retrieved_nodes_dict[node_id]
    for node_id in relevant_node_ids
    if node_id in retrieved_nodes_dict
)
print(context)

Two types of accounting exist: financial and managerial accounting.
What Types of Accounting Exist?A) Financial accounting: •Prepared underexternal rules (IFRS, US-GAAP)•Audited (KPMG, EY, Deloitte, PwC, etc.)•Shareholders, creditors, tax authorities, labour unions, employees•Monetary units (e.g., Tenge, USD, etc.)•“External” accounting
17
Chapter1
B) Managerial accounting: •Prepared under internal guidelines•Not audited•Managers (e.g., c-suite, CEO, CFO, etc.)•Monetary and non-monetary units (e.g., hours per product)•“Internal” accounting
Financial statementsBudget report, etc.

Financial accounting produces reports that are prepared under rules to inform external parties.
What Is Financial Accounting?“It is the (1) information system that (2) measures business activities, (3) processes the information into reports, and (4) communicates the results to decision makers.”
(1) Information system: Generally Accepted Accounting Principles (GAAP) (e.g., US-GAAP, Indian-GAAP etc.)(2) Measurin