In [3]:
!pip install llama-index

Collecting llama-index
  Downloading llama_index-0.14.8-py3-none-any.whl.metadata (13 kB)
Collecting llama-index-core<0.15.0,>=0.14.8 (from llama-index)
  Downloading llama_index_core-0.14.8-py3-none-any.whl.metadata (2.5 kB)
Collecting llama-index-embeddings-openai<0.6,>=0.5.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.5.1-py3-none-any.whl.metadata (400 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.9.4-py3-none-any.whl.metadata (3.7 kB)
Collecting llama-index-llms-openai<0.7,>=0.6.0 (from llama-index)
  Downloading llama_index_llms_openai-0.6.9-py3-none-any.whl.metadata (3.0 kB)
Collecting llama-index-readers-file<0.6,>=0.5.0 (from llama-index)
  Downloading llama_index_readers_file-0.5.4-py3-none-any.whl.metadata (5.7 kB)
Collecting llama-index-readers-llama-parse>=0.4.0 (from llama-index)
  Downloading llama_index_readers_llama_parse-0.5.1-py3-none-any.whl.metadata (3.

In [None]:
#Setting up LLM Provider

import os
os.environ["OPENAI_API_KEY"] = "" #Add your OpenAI API key here
import nest_asyncio

nest_asyncio.apply()

## Load data

Download the transformer paper - #!wget "https://proceedings.neurips.cc/paper_files/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf" -O transformer.pdf

In [None]:
# Configure Logging in LlamaIndex
import logging
import sys

# Method 1: Configure Python logging module (recommended)
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,  # Options: DEBUG, INFO, WARNING, ERROR, CRITICAL
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)

# Method 2: Set specific logger levels for llama-index components
# Get the llama_index logger
llama_logger = logging.getLogger("llama_index")
llama_logger.setLevel(logging.DEBUG)  # Set to DEBUG for more detailed logs

# Optional: Set level for specific components
# logging.getLogger("llama_index.core.embeddings").setLevel(logging.DEBUG)
# logging.getLogger("llama_index.core.llms").setLevel(logging.DEBUG)
# logging.getLogger("llama_index.core.agent").setLevel(logging.DEBUG)

# Method 3: Enable verbose mode in components (already using this in some places)
# verbose=True parameter in agents, query engines, etc.

print("Logging configured! You'll now see detailed logs from llama-index.")


In [6]:
from llama_index.core import SimpleDirectoryReader

# load documents
documents = SimpleDirectoryReader(input_files=["/Users/tanmaydhote/Downloads/NIPS-2017-attention-is-all-you-need-Paper.pdf"]).load_data()

## Define the LLM and Embedding Model
Discuss how to plug in models from Element Gateway here.
Supported providers - https://docs.llamaindex.ai/en/stable/module_guides/models/llms/modules/

We will have to plug in models from the Element Gateway here.

In [7]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(model="gpt-4o")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-3-small")

  from .autonotebook import tqdm as notebook_tqdm


## Define Summary Index and Vector Index on the data

In [8]:
from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)


from llama_index.core import SummaryIndex, VectorStoreIndex

summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)

2025-11-18 18:54:20,551 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


# Simple RAG

In [9]:
query_engine_simple = vector_index.as_query_engine(
    similarity_top_k=5,          # tweak how many chunks come back
    # any other kwargs…
)

resp = query_engine_simple.query("Tell me what self attention is and then tell me about the training data also")
print(resp)


2025-11-18 18:54:28,593 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-11-18 18:54:34,575 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Self-attention, also known as intra-attention, is an attention mechanism that relates different positions of a single sequence to compute a representation of that sequence. It allows the model to focus on different parts of the input sequence when computing a representation, which is particularly useful for capturing dependencies regardless of their distance in the sequence.

Regarding the training data, the model was trained on the WMT 2014 English-German dataset, which consists of about 4.5 million sentence pairs. Sentences were encoded using byte-pair encoding with a shared source-target vocabulary of about 37,000 tokens. For English-French, the significantly larger WMT 2014 English-French dataset was used, consisting of 36 million sentences, with tokens split into a 32,000 word-piece vocabulary. Sentence pairs were batched together by approximate sequence length, with each training batch containing approximately 25,000 source tokens and 25,000 target tokens.


# Agentic RAG - Going beyond simple retrieval and generation

## Define Query Engine and Tools

In [10]:
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)
vector_query_engine = vector_index.as_query_engine()

from llama_index.core.tools import QueryEngineTool


summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to the Transformer paper"
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for answering specific questions from the Transformer paper."
    ),
)

## Define Router Query Engine

In [11]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector


query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[
        summary_tool,
        vector_tool,
    ],
    verbose=True
)

In [12]:
response = query_engine.query("What is the summary of the document?")
print(str(response))

2025-11-18 18:54:46,605 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-18 18:54:46,615 - INFO - Selecting query engine 0: The question asks for a summary of the document, which aligns with choice 1, as it is useful for summarization questions related to the Transformer paper..


[1;3;38;5;200mSelecting query engine 0: The question asks for a summary of the document, which aligns with choice 1, as it is useful for summarization questions related to the Transformer paper..
[0m

2025-11-18 18:54:53,479 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The document introduces the Transformer, a novel neural network architecture for sequence transduction tasks, which relies entirely on attention mechanisms, eliminating the need for recurrent or convolutional networks. The Transformer model is shown to be more efficient and parallelizable, achieving superior performance in machine translation tasks compared to previous models. It establishes new state-of-the-art BLEU scores on the WMT 2014 English-to-German and English-to-French translation tasks. The architecture consists of an encoder-decoder structure using multi-head self-attention and feed-forward layers, allowing for significant parallelization and improved learning of long-range dependencies. The document also discusses the advantages of self-attention over traditional recurrent and convolutional layers, including computational efficiency and the ability to model dependencies regardless of their distance in the input or output sequences. The paper concludes with a discussion on 

In [13]:
print(len(response.source_nodes))

11


In [20]:
response = query_engine.query(
    "What is the training data?"
)
print(str(response))

print(len(response.source_nodes))

2025-11-19 09:13:08,783 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 09:13:08,807 - INFO - Selecting query engine 1: The question 'What is the training data?' is a specific question related to the Transformer paper, making choice 2 the most relevant..


[1;3;38;5;200mSelecting query engine 1: The question 'What is the training data?' is a specific question related to the Transformer paper, making choice 2 the most relevant..
[0m

2025-11-19 09:13:09,746 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-11-19 09:13:12,197 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The training data consists of the WMT 2014 English-German dataset with about 4.5 million sentence pairs and the WMT 2014 English-French dataset with 36 million sentences. The English-German sentences were encoded using byte-pair encoding with a shared source-target vocabulary of about 37,000 tokens, while the English-French dataset used a 32,000 word-piece vocabulary.
2


## Adding Reasoning Loop to the Agent

In [16]:
from llama_index.core.agent import FunctionAgent
from llama_index.core.memory import ChatMemoryBuffer

agent = FunctionAgent(
    tools=[vector_tool, summary_tool],
    verbose=True,
)
chat_memory = ChatMemoryBuffer.from_defaults()

In [None]:
handler = agent.run(
    user_msg="Tell me what self attention is and then why is it important. Also what is the training data?",
    memory=chat_memory,
)
response = await handler

# Explore what's available in the response object
print("=" * 60)
print("RESPONSE CONTENT:")
print("=" * 60)
print(str(response))  # or response.response.content
print()

print("=" * 60)
print("RESPONSE OBJECT PROPERTIES:")
print("=" * 60)
print(f"Response type: {type(response)}")
print(f"Agent name: {response.current_agent_name}")
print(f"Response message: {response.response}")
print(f"Response content: {response.response.content}")
print("=" * 60)
print(f"Number of tool calls: {len(response.tool_calls)}")
if response.tool_calls:
    print(f"Tool calls: {response.tool_calls}")
    for i, tool_call in enumerate(response.tool_calls):
        print(f"  Tool {i+1}: {tool_call.tool_name} with args: {tool_call.tool_kwargs}")
        if hasattr(tool_call, "tool_output"):
            print(f"  Tool output: {tool_call.tool_output}")
print()

# Access the handler to see execution details
print("=" * 60)
"""
print("HANDLER INFORMATION:")
print("=" * 60)
print(f"Handler type: {type(handler)}")
print(f"Handler context available: {hasattr(handler, 'ctx')}")
print()


# You can also access memory to see the full conversation
print("=" * 60)
print("CONVERSATION HISTORY (from memory):")
print("=" * 60)
messages = await chat_memory.aget()
for i, msg in enumerate(messages):
    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")
"""

2025-11-19 12:02:06,558 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 12:02:08,600 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 12:02:09,840 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 12:02:10,741 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


RESPONSE CONTENT:
**Self-Attention:**

Self-attention in the Transformer model is an attention mechanism that relates different positions of a single sequence to compute a representation of the sequence. It allows the model to draw global dependencies between input and output without relying on sequence-aligned recurrence or convolution.

**Importance:**

This mechanism is important because it enables the model to process all positions in the sequence simultaneously, which significantly enhances parallelization and computational efficiency. Additionally, self-attention helps in learning long-range dependencies more effectively, as it connects all positions with a constant number of sequential operations, reducing the path length between dependencies. This makes it particularly advantageous for tasks like machine translation, where understanding the context across the entire sequence is crucial.

**Training Data:**

The Transformer model is trained on the WMT 2014 English-German dataset

'\n# You can also access memory to see the full conversation\nprint("=" * 60)\nprint("CONVERSATION HISTORY (from memory):")\nprint("=" * 60)\nmessages = await chat_memory.aget()\nfor i, msg in enumerate(messages):\n    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")\n'

In [33]:
handler = agent.run(
    user_msg="""Tell me what self attention is
    and then why is it important.""",
    memory=chat_memory,
    verbose=True
)
response = await handler
print(str(response))

2025-11-19 11:55:30,431 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 11:55:34,209 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 11:55:36,820 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


**Self-Attention:**

Self-attention in the Transformer model is an attention mechanism that relates different positions of a single sequence to compute a representation of the sequence. It allows the model to draw global dependencies between input and output without relying on sequence-aligned recurrence or convolution. 

**Importance:**

This mechanism is crucial because it enables the model to process all positions in the sequence simultaneously, which significantly enhances parallelization and computational efficiency. Self-attention also facilitates learning long-range dependencies by connecting all positions with a constant number of operations, making it easier to capture relationships between distant elements in the sequence. This contributes to the model's superior performance in tasks like machine translation.


In [18]:
handler = agent.run(
    user_msg="Tell me about the training data used.",
    memory=chat_memory,
)
response = await handler
print(str(response))

2025-11-18 19:02:40,132 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-18 19:02:43,183 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-18 19:02:43,689 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


The Transformer model is trained on the WMT 2014 English-German dataset, which consists of about 4.5 million sentence pairs, and the WMT 2014 English-French dataset, which consists of 36 million sentences. For the English-German dataset, byte-pair encoding is used with a shared source-target vocabulary of about 37,000 tokens, while the English-French dataset uses a 32,000 word-piece vocabulary.


In [19]:
handler = agent.run(
    user_msg="How was the batching done?",
    memory=chat_memory,
)
response = await handler
print(str(response))

2025-11-18 19:02:57,544 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-18 19:03:00,279 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-18 19:03:00,872 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In the Transformer model, batching is done by grouping sentence pairs together based on approximate sequence length. Each training batch contains a set of sentence pairs with approximately 25,000 source tokens and 25,000 target tokens. This approach helps in efficiently utilizing computational resources during training.


# Building a Multi-Document Agent

In [None]:
# TODO: abstract all of this into a function that takes in a PDF file name 

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool, QueryEngineTool
from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from typing import List, Optional


# def get_doc_tools(
#     file_path: str,
#     name: str,
# ) -> str:
#     """Get vector query and summary query tools from a document."""
    
#     # load documents
#     documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
#     splitter = SentenceSplitter(chunk_size=1024)
#     nodes = splitter.get_nodes_from_documents(documents)
#     vector_index = VectorStoreIndex(nodes)

#     def vector_query(
#         query: str, 
#         filter_key_list: List[str],
#         filter_value_list: List[str]
#     ) -> str:
#         """Perform a vector search over an index.

#         query (str): the string query to be embedded.
#         filter_key_list (List[str]): A list of metadata filter field names
#             Must specify ['page_label'] or empty list. Please leave empty
#             if there are no explicit filters to specify.
#         filter_value_list (List[str]): List of metadata filter field values 
#             (corresponding to names specified in filter_key_list) 

#         """
#         metadata_dicts = [
#             {"key": k, "value": v} for k, v in zip(filter_key_list, filter_value_list)
#         ]

#         query_engine = vector_index.as_query_engine(
#             similarity_top_k=2,
#             filters=MetadataFilters.from_dicts(metadata_dicts)
#         )
#         response = query_engine.query(query)
#         return response

#     vector_query_tool = FunctionTool.from_defaults(
#         fn=vector_query,
#         name=f"vector_query_{name}"
#     )

#     summary_index = SummaryIndex(nodes)
#     summary_query_engine = summary_index.as_query_engine(
#         response_mode="tree_summarize",
#         use_async=True,
#     )
#     summary_tool = QueryEngineTool.from_defaults(
#         query_engine=summary_query_engine,
#         name=f"summary_query_{name}",
#         description=(
#             f"Useful for summarization questions related to {name}"
#         ),
#     )
#     return vector_query_tool, summary_tool



def get_doc_tools(
    file_path: str,
    name: str,
) -> str:
    """Get vector query and summary query tools from a document."""

    # load documents
    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
    splitter = SentenceSplitter(chunk_size=1024)
    nodes = splitter.get_nodes_from_documents(documents)
    vector_index = VectorStoreIndex(nodes)
    
    def vector_query(
        query: str,
    ) -> str:
        
        query_engine = vector_index.as_query_engine(
            similarity_top_k=2,
        )
        response = query_engine.query(query)
        return response
        
    
    vector_query_tool = FunctionTool.from_defaults(
        name=f"vector_tool_{name}",
        fn=vector_query
    )
    
    summary_index = SummaryIndex(nodes)
    summary_query_engine = summary_index.as_query_engine(
        response_mode="tree_summarize",
        use_async=True,
    )
    summary_tool = QueryEngineTool.from_defaults(
        name=f"summary_tool_{name}",
        query_engine=summary_query_engine,
        description=(
            f"Useful for summarization questions related to {name}"
        ),
    )

    return vector_query_tool, summary_tool

In [43]:
papers = [
    "/Users/tanmaydhote/Downloads/6283_Self_RAG_Learning_to_Retr.pdf",
    "/Users/tanmaydhote/Downloads/1602_LongLoRA_Efficient_Fine_t.pdf",
    "/Users/tanmaydhote/Downloads/5488_MetaGPT_Meta_Programming_.pdf",
]

In [44]:
from pathlib import Path

paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    vector_tool, summary_tool = get_doc_tools(paper, Path(paper).stem)
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]

paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    vector_tool, summary_tool = get_doc_tools(paper, Path(paper).stem)
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]

Getting tools for paper: /Users/tanmaydhote/Downloads/6283_Self_RAG_Learning_to_Retr.pdf


2025-11-19 16:13:42,136 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Getting tools for paper: /Users/tanmaydhote/Downloads/1602_LongLoRA_Efficient_Fine_t.pdf


2025-11-19 16:13:45,250 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Getting tools for paper: /Users/tanmaydhote/Downloads/5488_MetaGPT_Meta_Programming_.pdf


2025-11-19 16:13:46,729 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Getting tools for paper: /Users/tanmaydhote/Downloads/6283_Self_RAG_Learning_to_Retr.pdf


2025-11-19 16:13:49,254 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Getting tools for paper: /Users/tanmaydhote/Downloads/1602_LongLoRA_Efficient_Fine_t.pdf


2025-11-19 16:13:51,461 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Getting tools for paper: /Users/tanmaydhote/Downloads/5488_MetaGPT_Meta_Programming_.pdf


2025-11-19 16:13:53,799 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [45]:
initial_tools = [t for paper in papers for t in paper_to_tools_dict[paper]]
len(initial_tools)

6

In [47]:
from llama_index.core.agent import FunctionAgent
from llama_index.core.memory import ChatMemoryBuffer

agent = FunctionAgent(
    tools=initial_tools,
    verbose=True,
)
chat_memory = ChatMemoryBuffer.from_defaults()

In [48]:
handler = agent.run(
    user_msg="Tell me about the evaluation dataset used in LongLoRA, "
    "and then tell me about the evaluation results",
    memory=chat_memory,
)
response = await handler

# Explore what's available in the response object
print("=" * 60)
print("RESPONSE CONTENT:")
print("=" * 60)
print(str(response))  # or response.response.content
print()

print("=" * 60)
print("RESPONSE OBJECT PROPERTIES:")
print("=" * 60)
print(f"Response type: {type(response)}")
print(f"Agent name: {response.current_agent_name}")
print(f"Response message: {response.response}")
print(f"Response content: {response.response.content}")
print("=" * 60)
print(f"Number of tool calls: {len(response.tool_calls)}")
if response.tool_calls:
    print(f"Tool calls: {response.tool_calls}")
    for i, tool_call in enumerate(response.tool_calls):
        print(f"  Tool {i+1}: {tool_call.tool_name} with args: {tool_call.tool_kwargs}")
        if hasattr(tool_call, "tool_output"):
            print(f"  Tool output: {tool_call.tool_output}")
print()

# Access the handler to see execution details
print("=" * 60)
"""
print("HANDLER INFORMATION:")
print("=" * 60)
print(f"Handler type: {type(handler)}")
print(f"Handler context available: {hasattr(handler, 'ctx')}")
print()


# You can also access memory to see the full conversation
print("=" * 60)
print("CONVERSATION HISTORY (from memory):")
print("=" * 60)
messages = await chat_memory.aget()
for i, msg in enumerate(messages):
    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")
"""

2025-11-19 16:20:03,573 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 16:20:14,895 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


RESPONSE CONTENT:
The evaluation dataset used in LongLoRA is the proof-pile test set.

Regarding the evaluation results, LongLoRA demonstrates strong empirical performance on various tasks using Llama2 models, ranging from 7B/13B to 70B. It effectively extends the context length of these models while maintaining their original architectures. Specifically, LongLoRA extends the Llama2 7B model from a 4k context to 100k and the Llama2 70B model to 32k on a single 8× A100 machine. It achieves this with up to 1.8× lower memory cost and improves the training speed by up to 1.8× compared to full fine-tuning. The evaluation on the proof-pile test set shows that LongLoRA closes the accuracy gap between conventional LoRA and full fine-tuning.

RESPONSE OBJECT PROPERTIES:
Response type: <class 'llama_index.core.agent.workflow.workflow_events.AgentOutput'>
Agent name: Agent
Response message: assistant: The evaluation dataset used in LongLoRA is the proof-pile test set.

Regarding the evaluation re

'\nprint("HANDLER INFORMATION:")\nprint("=" * 60)\nprint(f"Handler type: {type(handler)}")\nprint(f"Handler context available: {hasattr(handler, \'ctx\')}")\nprint()\n\n\n# You can also access memory to see the full conversation\nprint("=" * 60)\nprint("CONVERSATION HISTORY (from memory):")\nprint("=" * 60)\nmessages = await chat_memory.aget()\nfor i, msg in enumerate(messages):\n    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")\n'

In [51]:
handler = agent.run(
    user_msg="Give me a summary of both Self-RAG and LongLoRA",
    memory=chat_memory,
)
response = await handler

# Explore what's available in the response object
print("=" * 60)
print("RESPONSE CONTENT:")
print("=" * 60)
print(str(response))  # or response.response.content
print()

print("=" * 60)
print("RESPONSE OBJECT PROPERTIES:")
print("=" * 60)
print(f"Response type: {type(response)}")
print(f"Agent name: {response.current_agent_name}")
print(f"Response message: {response.response}")
print(f"Response content: {response.response.content}")
print("=" * 60)
print(f"Number of tool calls: {len(response.tool_calls)}")
if response.tool_calls:
    print(f"Tool calls: {response.tool_calls}")
    for i, tool_call in enumerate(response.tool_calls):
        print(f"  Tool {i+1}: {tool_call.tool_name} with args: {tool_call.tool_kwargs}")
        if hasattr(tool_call, "tool_output"):
            print(f"  Tool output: {tool_call.tool_output}")
print()

# Access the handler to see execution details
print("=" * 60)
"""
print("HANDLER INFORMATION:")
print("=" * 60)
print(f"Handler type: {type(handler)}")
print(f"Handler context available: {hasattr(handler, 'ctx')}")
print()


# You can also access memory to see the full conversation
print("=" * 60)
print("CONVERSATION HISTORY (from memory):")
print("=" * 60)
messages = await chat_memory.aget()
for i, msg in enumerate(messages):
    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")
"""

2025-11-19 16:22:37,104 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 16:22:42,848 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 16:22:45,357 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-11-19 16:22:46,065 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


RESPONSE CONTENT:
**Self-RAG (Self-Reflective Retrieval-Augmented Generation):**  
Self-RAG is a framework designed to enhance the quality and factual accuracy of large language models (LLMs) by incorporating retrieval and self-reflection. It trains an LLM to retrieve relevant passages on-demand and generate outputs while reflecting on the quality and factual support of these outputs using special tokens called reflection tokens. This approach allows the model to adaptively decide when retrieval is necessary and to critique its own outputs, improving overall generation quality, factuality, and citation accuracy. Self-RAG outperforms other models, including ChatGPT, in various tasks by providing more accurate and verifiable outputs.

**LongLoRA:**  
LongLoRA is an efficient fine-tuning approach designed to extend the context sizes of pre-trained large language models (LLMs) with minimal computational cost. It addresses the challenge of training LLMs with long context sizes, which is typ

'\nprint("HANDLER INFORMATION:")\nprint("=" * 60)\nprint(f"Handler type: {type(handler)}")\nprint(f"Handler context available: {hasattr(handler, \'ctx\')}")\nprint()\n\n\n# You can also access memory to see the full conversation\nprint("=" * 60)\nprint("CONVERSATION HISTORY (from memory):")\nprint("=" * 60)\nmessages = await chat_memory.aget()\nfor i, msg in enumerate(messages):\n    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")\n'

In [55]:
handler = agent.run(
    user_msg="Compare the datasets used in Self-RAG and LongLoRA. Give the answer in one sentence.",
    memory=chat_memory,
)
response = await handler

# Explore what's available in the response object
print("=" * 60)
print("RESPONSE CONTENT:")
print("=" * 60)
print(str(response))  # or response.response.content
print()

print("=" * 60)
print("RESPONSE OBJECT PROPERTIES:")
print("=" * 60)
print(f"Response type: {type(response)}")
print(f"Agent name: {response.current_agent_name}")
print(f"Response message: {response.response}")
print(f"Response content: {response.response.content}")
print("=" * 60)
print(f"Number of tool calls: {len(response.tool_calls)}")
if response.tool_calls:
    print(f"Tool calls: {response.tool_calls}")
    for i, tool_call in enumerate(response.tool_calls):
        print(f"  Tool {i+1}: {tool_call.tool_name} with args: {tool_call.tool_kwargs}")
        if hasattr(tool_call, "tool_output"):
            print(f"  Tool output: {tool_call.tool_output}")
print()

# Access the handler to see execution details
print("=" * 60)
"""
print("HANDLER INFORMATION:")
print("=" * 60)
print(f"Handler type: {type(handler)}")
print(f"Handler context available: {hasattr(handler, 'ctx')}")
print()


# You can also access memory to see the full conversation
print("=" * 60)
print("CONVERSATION HISTORY (from memory):")
print("=" * 60)
messages = await chat_memory.aget()
for i, msg in enumerate(messages):
    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")
"""

2025-11-19 16:35:43,145 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


RESPONSE CONTENT:
Self-RAG employs datasets for reasoning, long-form generation, and retrieval accuracy analysis, such as PubHealth and PopQA, whereas LongLoRA uses the LongAlpaca dataset for long instruction-following tasks in supervised fine-tuning.

RESPONSE OBJECT PROPERTIES:
Response type: <class 'llama_index.core.agent.workflow.workflow_events.AgentOutput'>
Agent name: Agent
Response message: assistant: Self-RAG employs datasets for reasoning, long-form generation, and retrieval accuracy analysis, such as PubHealth and PopQA, whereas LongLoRA uses the LongAlpaca dataset for long instruction-following tasks in supervised fine-tuning.
Response content: Self-RAG employs datasets for reasoning, long-form generation, and retrieval accuracy analysis, such as PubHealth and PopQA, whereas LongLoRA uses the LongAlpaca dataset for long instruction-following tasks in supervised fine-tuning.
Number of tool calls: 0



'\nprint("HANDLER INFORMATION:")\nprint("=" * 60)\nprint(f"Handler type: {type(handler)}")\nprint(f"Handler context available: {hasattr(handler, \'ctx\')}")\nprint()\n\n\n# You can also access memory to see the full conversation\nprint("=" * 60)\nprint("CONVERSATION HISTORY (from memory):")\nprint("=" * 60)\nmessages = await chat_memory.aget()\nfor i, msg in enumerate(messages):\n    print(f"Message {i+1} ({msg.role}): {msg.content[:100]}...")\n'