# Building Agentic RAG with LlamaIndex
source: https://learn.deeplearning.ai/courses/building-agentic-rag-with-llamaindex/lesson/1/introduction

In [None]:
import os
import nest_asyncio

from dotenv import load_dotenv
from pathlib import Path
from typing import List, Optional

from llama_index.core import Settings, SimpleDirectoryReader, SummaryIndex, VectorStoreIndex
from llama_index.core.agent import FunctionCallingAgentWorker, AgentRunner
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.objects import ObjectIndex
from llama_index.core.tools import FunctionTool, QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.llms.openai import OpenAI


from utils import display_text
from utils import filenames_in_directory

<span style="color: blue; font-size:30px;">Setup</span>

In [None]:
# Jupyter already runs an asyncio event loop; nest_asyncio patches it so that
# nested async calls made later in this notebook can run inside it.
nest_asyncio.apply()

In [None]:
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
# os.getenv returns None when the variable is unset.
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible notebook;
# presumably the OpenAI clients read the environment variable themselves — confirm.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [None]:
# Chat model used by the agent and the query engines; temperature=0 is set
# explicitly. It is also registered as the global LlamaIndex default.
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
Settings.llm = llm

# Embedding model, registered as the global LlamaIndex default.
embed_model = OpenAIEmbedding(model="text-embedding-3-large")
Settings.embed_model = embed_model

<span style="color:blue; font-size:30px">LlamaIndex RAG functions</span>

In [None]:
def load_documents(file_path: str):
    """Read one file into LlamaIndex documents.

    Args:
        file_path (str): Path of the single file to read.

    Returns:
        Whatever ``SimpleDirectoryReader.load_data()`` produces for that file.
    """
    reader = SimpleDirectoryReader(input_files=[file_path])
    return reader.load_data()

def create_nodes(documents):
    """Split documents into nodes with a sentence splitter (chunk_size=1024).

    Args:
        documents: Documents as returned by ``load_documents``.

    Returns:
        The nodes produced by ``SentenceSplitter.get_nodes_from_documents``.
    """
    return SentenceSplitter(chunk_size=1024).get_nodes_from_documents(documents)

def create_vector_index(nodes):
    """Build a ``VectorStoreIndex`` over the given nodes and return it."""
    return VectorStoreIndex(nodes)

def vector_query(
    query: str,
    vector_index: VectorStoreIndex,
    page_numbers: Optional[List[str]] = None,
) -> str:
    """Answer a specific question over one paper via vector search.

    Leave ``page_numbers`` as None unless the search should be restricted to
    particular pages.

    Args:
        query (str): The question; it is embedded and matched against the index.
        vector_index (VectorStoreIndex): Index built over the paper's nodes.
        page_numbers (Optional[List[str]]): Page labels to restrict the search
            to. None or an empty list means no page filter is applied.

    Returns:
        The object returned by the query engine's ``query`` call, passed
        through unchanged.
        NOTE(review): the signature says ``str``, but this looks like a
        response object rather than a plain string — confirm.
    """
    # One filter per requested page, OR-ed together so a node matches when its
    # "page_label" metadata equals any of them.
    page_filters = MetadataFilters.from_dicts(
        [{"key": "page_label", "value": page} for page in (page_numbers or [])],
        condition=FilterCondition.OR,
    )
    engine = vector_index.as_query_engine(
        similarity_top_k=2,
        filters=page_filters,
    )
    return engine.query(query)

def create_vector_query_tool(name: str, vector_index: VectorStoreIndex):
    """Create a function tool that runs ``vector_query`` against one paper.

    The tool wraps a typed inner function instead of a bare lambda. A lambda
    has no annotations and no docstring, so the tool would be built without
    parameter types or usage guidance for the LLM.

    Args:
        name (str): Identifier of the paper; used in the tool name and
            description.
        vector_index (VectorStoreIndex): Index built over the paper's nodes.

    Returns:
        A ``FunctionTool`` named ``vector_tool_{name}`` that accepts ``query``
        and an optional ``page_numbers`` list.
    """

    def vector_query_for_paper(
        query: str,
        page_numbers: Optional[List[str]] = None,
    ):
        # Bind this paper's index; the LLM only supplies query/page_numbers.
        return vector_query(query, vector_index, page_numbers)

    return FunctionTool.from_defaults(
        name=f"vector_tool_{name}",
        fn=vector_query_for_paper,
        description=(
            f"Use to answer specific questions over the paper {name}. "
            "Always leave page_numbers as None UNLESS there is a specific "
            "page you want to search for; otherwise pass the list of page "
            "labels (as strings) to restrict the search to."
        ),
    )

def create_summary_index(nodes):
    """Build a ``SummaryIndex`` over the given nodes and return it."""
    return SummaryIndex(nodes)

def create_summary_tool(name: str, summary_index: SummaryIndex):
    """Create a query-engine tool that summarizes one paper.

    Args:
        name (str): Identifier of the paper; used in the tool name and
            description.
        summary_index (SummaryIndex): Summary index built over the paper's
            nodes.

    Returns:
        A ``QueryEngineTool`` named ``summary_tool_{name}`` backed by a
        ``tree_summarize`` query engine created with ``use_async=True``.
    """
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    return QueryEngineTool.from_defaults(
        name=f"summary_tool_{name}",
        query_engine=summary_query_engine,
        # Must be an f-string: without the prefix the literal text "{name}"
        # is sent, and the description no longer says which paper it covers.
        description=(
            f"Use ONLY IF you want to get a holistic summary related to {name} "
        ),
    )

<span style="color:blue; font-size:30px">Load Data into RAG</span>

In [None]:
# Directory that holds the input papers.
file_path = "./input_docs"
# filenames_in_directory is a helper from the local utils module.
# NOTE(review): presumably it returns bare filenames; process_papers joins each
# entry onto file_path — verify against utils.
papers = filenames_in_directory(file_path)
# Bare expression so the notebook displays the discovered entries.
papers

In [None]:
def process_papers(papers, file_path):
    """
    Build a vector-query tool and a summary tool for every paper.

    Each paper is loaded, split into nodes, and indexed twice (vector index
    and summary index); one tool is created per index.

    Args:
        papers (list): Paper filenames, relative to ``file_path``.
        file_path (str): Directory that contains the papers.

    Returns:
        dict: Maps each entry of ``papers`` (unchanged, as the key) to the
        list ``[vector_query_tool, summary_tool]`` for that paper.
    """
    tools_by_paper = {}

    for paper in papers:
        paper_path = Path(paper)
        # The filename without its extension names the paper's tools.
        name = paper_path.stem
        print(f"Getting tools for paper: {paper}")

        # Both indexes are built from the same nodes.
        nodes = create_nodes(load_documents(f"{file_path}/{paper_path}"))
        vector_tool = create_vector_query_tool(name, create_vector_index(nodes))
        summary_tool = create_summary_tool(name, create_summary_index(nodes))

        tools_by_paper[paper] = [vector_tool, summary_tool]

    return tools_by_paper

# Build the tools for every discovered paper, keyed by the entries of `papers`.
paper_to_tools_dict = process_papers(papers, file_path)


In [None]:
# Collect every paper's tools, in `papers` order, into one flat list.
all_tools = [
    tool
    for paper_key in papers
    for tool in paper_to_tools_dict[paper_key]
]
# Sanity check: expect two tools (vector + summary) per paper.
print(f"number of tools: {len(all_tools)}")

In [None]:
# Index the tools themselves in a vector-store-backed object index, so the
# agent can retrieve tools per query instead of receiving all of them at once.
obj_index = ObjectIndex.from_objects(
    all_tools,
    index_cls=VectorStoreIndex,
)

# Retriever over the tool index, configured with similarity_top_k=3.
obj_retriever = obj_index.as_retriever(similarity_top_k=3)

In [None]:
# Build the function-calling agent worker. Tools are supplied through the
# retriever defined above rather than as a fixed list.
# verbose=True prints the agent's actions as it runs.

agent_worker = FunctionCallingAgentWorker.from_tools(
    tool_retriever=obj_retriever,
    llm=llm, 
    system_prompt=""" \
You are an agent designed to answer queries over a set of given papers.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

""",
    verbose=True
)
# The runner wraps the worker and exposes the query interface used below.
agent = AgentRunner(agent_worker)

<span style="color:blue; font-size:30px">Try it</span>

<span style="color:blue; font-size:20px">example</span>

In [None]:
# Ask the agent a question; replace the placeholder text with a real query
# about the loaded papers.
response = agent.query(
    "Question goes here",
)

In [None]:
# Print the agent's answer text after passing it through display_text, a helper
# from the local utils module (its behavior is not visible in this notebook).
print(display_text(response.response))

In [None]:
# Number of source nodes attached to the response.
len(response.source_nodes)

In [None]:
# Print the first source node's content, with metadata_mode="all".
# NOTE(review): presumably fails if the response has no source nodes.
print(response.source_nodes[0].get_content(metadata_mode="all"))

In [None]:
# Print the second source node's content, with metadata_mode="all".
# NOTE(review): presumably fails when fewer than two source nodes were
# returned — check the count first.
print(response.source_nodes[1].get_content(metadata_mode="all"))

In [None]:
# Reset the agent before running the next example.
# NOTE(review): the original author was not certain that this reset works — verify.
agent.reset()

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>

<span style="color:blue; font-size:20px">example</span>