### Custom MBA courses RAG project

In [1]:
# Go one level up in the directory hierarchy so that the project's source code can be imported
import sys
import os
# Add the project root (one level up from notebooks/) to the Python path so that
# the project's packages can be imported from this notebook.
project_root = os.path.abspath("..")
# Only append when missing: re-running this cell would otherwise add the same
# directory to sys.path again on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
# Set up the necessary models for routing, chatting and embedding
from llama_index.llms.google_genai import GoogleGenAI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from core.config.config import Config
from google.genai import types

# Generation settings for the router LLM: temperature comes from Config and the
# thinking budget is fixed at 0 (presumably to switch off the model's "thinking"
# phase so that routing stays fast -- confirm against the google-genai docs).
router_generation_config = types.GenerateContentConfig(
    temperature = Config.LLM_TEMPERATURE,
    thinking_config = types.ThinkingConfig(thinking_budget = 0),
)

# LLM that is handed to the selector / router further down in the notebook.
router_llm = GoogleGenAI(
    model = Config.ROUTER_LLM,
    api_key = Config.GOOGLE_API_KEY,
    max_tokens = Config.LLM_MAX_TOKENS,
    generation_config = router_generation_config,
)

# Embedding model (name taken from Config) used when the vector indices are built
embed_model = HuggingFaceEmbedding(model_name = Config.EMBEDDING_MODEL)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the collection names and descriptions from my custom JSON file
import json

# Folder that holds the course collections, and the JSON file that describes them
docs_path = "../documents"
collections_mba_json = f"{docs_path}/collections_mba.json"

with open(collections_mba_json, "r", encoding="utf-8") as json_file:
    COLLECTIONS_MBA = json.load(json_file)

# Normalize every description: drop its first line and its last two lines,
# strip the surrounding whitespace of each remaining line, and join the lines
# back together with " \n ".
COLLECTIONS_MBA = {
    name: " \n ".join(
        text_line.strip() for text_line in raw_description.splitlines()[1:-2]
    )
    for name, raw_description in COLLECTIONS_MBA.items()
}

In [4]:
# Let's build our own Router Retriever on top of my personal PDF database for my MBA courses
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.tools import RetrieverTool
from core.config.config import Config


def _build_retriever_tool(collection_name, collection_description):
    """Build the RetrieverTool for a single course collection.

    Args:
        collection_name: Name of the course folder inside the documents folder.
        collection_description: Description of the course; the selector uses it
            to decide whether this retriever is appropriate for a query.

    Returns:
        A RetrieverTool wrapping a vector-index retriever over the collection.
    """
    # 1) Read the documents and create a list of 'Document' objects, which have
    #    id_, metadata and text attributes. The Document class (a generic container
    #    for any data source) is a subclass of the TextNode class.
    documents = SimpleDirectoryReader(input_dir = f"{docs_path}/{collection_name}").load_data()

    # 2) Build an index from these Document objects. They are parsed into Node
    #    objects carrying attributes such as text, embeddings, metadata and
    #    relationships; each Document is split into multiple nodes, and the
    #    relationships between those nodes are recorded as Node attributes.
    index = VectorStoreIndex.from_documents(
        documents = documents,
        embed_model = embed_model,
        show_progress = True
    )

    # 3) Create a retriever from the index by calling the as_retriever method of
    #    the VectorStoreIndex object; similarity_top_k (taken from Config) is
    #    passed along as well.
    retriever = index.as_retriever(similarity_top_k = Config.SIMILARITY_TOP_K)

    # 4) Wrap the retriever in a RetrieverTool so that the MultiSelector is able
    #    to select an appropriate retriever based on its description.
    return RetrieverTool.from_defaults(
        retriever = retriever,
        description = collection_description
    )


# One RetrieverTool per collection; this list is later passed to the router, whose
# LLMMultiSelector selects the appropriate retriever(s).
# I manually wrote a dictionary entry for each course and its description inside the
# previously loaded JSON file; each key matches the name of the course folder inside
# the documents folder.
retriever_tools = [
    _build_retriever_tool(collection_name, collection_description)
    for collection_name, collection_description in COLLECTIONS_MBA.items()
]


Parsing nodes: 100%|██████████| 337/337 [00:00<00:00, 3516.83it/s]
Generating embeddings: 100%|██████████| 337/337 [00:42<00:00,  7.95it/s]
Parsing nodes: 100%|██████████| 205/205 [00:03<00:00, 53.60it/s]
Generating embeddings: 100%|██████████| 217/217 [00:49<00:00,  4.35it/s]
Parsing nodes: 100%|██████████| 203/203 [00:00<00:00, 3059.23it/s]
Generating embeddings: 100%|██████████| 209/209 [00:24<00:00,  8.57it/s]
Parsing nodes: 100%|██████████| 306/306 [00:00<00:00, 3846.67it/s]
Generating embeddings: 100%|██████████| 306/306 [00:30<00:00, 10.04it/s]
Parsing nodes: 100%|██████████| 292/292 [00:00<00:00, 4323.64it/s]
Generating embeddings: 100%|██████████| 292/292 [00:19<00:00, 15.31it/s]
Parsing nodes: 100%|██████████| 365/365 [00:00<00:00, 3531.44it/s]
Generating embeddings: 100%|██████████| 365/365 [00:45<00:00,  7.95it/s]


In [9]:
# Create a router from that list of RetrieverTool objects using an LLMMultiSelector
from llama_index.core.retrievers import RouterRetriever
from llama_index.core.selectors import LLMMultiSelector
from core.config.constants import RagConstants

# Selector that asks the router LLM which retriever(s) fit a query.
# max_outputs is the maximum number of retrievers to retain - each retriever
# retrieves nodes from its corresponding collection.
multi_selector = LLMMultiSelector.from_defaults(
    llm = router_llm,
    prompt_template_str = RagConstants.LLM_MULTI_SELECTOR_PROMPT,
    max_outputs = Config.ROUTER_RETRIEVER_MAX_OUTPUTS,
)

# Router that dispatches a query to the retriever tools picked by the selector
router = RouterRetriever(
    retriever_tools = retriever_tools,
    selector = multi_selector,
    llm = router_llm,
)

In [11]:
import nest_asyncio
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and the retrieval below drives async code -- confirm.
nest_asyncio.apply()

# Now let's try prompting our RAG to see the retrieved nodes.
# According to our settings, we should expect no more than 15 nodes
# (presumably ROUTER_RETRIEVER_MAX_OUTPUTS x SIMILARITY_TOP_K -- verify against Config).

user_query = "What is the difference between managerial and financial accounting?"
# Route the query through the selector and collect the nodes that were retrieved
retriever_nodes = router.retrieve(user_query)

2025-12-13 14:56:18,813 - INFO - AFC is enabled with max remote calls: 10.
2025-12-13 14:56:19,820 - INFO - HTTP Request: POST https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-lite:generateContent "HTTP/1.1 200 OK"
2025-12-13 14:56:19,824 - INFO - Selecting retriever 0: This choice explicitly discusses financial accounting, its purpose, and the information it provides to stakeholders. While it doesn't directly contrast it with managerial accounting, it lays the groundwork for understanding financial accounting, which is a necessary step to identify the difference..


In [23]:
# Display the text of the first node returned for the query
first_node = retriever_nodes[0]
first_node.text

'Two types of accounting exist: financial and managerial accounting.\nWhat Types of Accounting Exist?A) Financial accounting: •Prepared underexternal rules (IFRS, US-GAAP)•Audited (KPMG, EY, Deloitte, PwC, etc.)•Shareholders, creditors, tax authorities, labour unions, employees•Monetary units (e.g., Tenge, USD, etc.)•“External” accounting\n17\nChapter1\nB) Managerial accounting: •Prepared under internal guidelines•Not audited•Managers (e.g., c-suite, CEO, CFO, etc.)•Monetary and non-monetary units (e.g., hours per product)•“Internal” accounting\nFinancial statementsBudget report, etc.'