In [23]:
from helper import get_openai_api_key
OPENAI_API_KEY = get_openai_api_key()

In [24]:
import nest_asyncio
nest_asyncio.apply()

In [25]:
urls = [
    "https://openreview.net/pdf?id=VtmBAGCN7o",
    "https://openreview.net/pdf?id=6PmJoRfdaK",
    "https://openreview.net/pdf?id=hSyW5go0v8",
    "https://openreview.net/pdf?id=VTF8yNQM66",
    "https://openreview.net/pdf?id=hSyW5go0v8"
]

papers = [
    "metagpt.pdf",
    "longlora.pdf",
    "selfrag.pdf",
    "loftq.pdf",
    "swebench.pdf",
    "selfrag.pdf"
]

In [26]:
from utils import get_doc_tools
from pathlib import Path

paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    vector_tool, summary_tool = get_doc_tools(paper, Path(paper).stem)
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]

Getting tools for paper: metagpt.pdf
Getting tools for paper: longlora.pdf
Getting tools for paper: selfrag.pdf
Getting tools for paper: loftq.pdf
Getting tools for paper: swebench.pdf
Getting tools for paper: selfrag.pdf


In [27]:
all_tools = [t for paper in papers for t in paper_to_tools_dict[paper]]

In [28]:
from llama_index.llms.openai import OpenAI

llm = OpenAI(model="gpt-3.5-turbo")

In [29]:
len(all_tools)

12

In [30]:
# define an "object" index and retriever over these tools
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex

obj_index = ObjectIndex.from_objects(
    all_tools,
    index_cls=VectorStoreIndex,
)

In [31]:
obj_retriever = obj_index.as_retriever(similarity_top_k=3)

In [32]:
tools = obj_retriever.retrieve(
    "Tell me about the eval dataset used in MetaGPT and SWE-Bench"
)

In [33]:
tools[2].metadata

ToolMetadata(description='Useful for summarization questions related to selfrag', name='summary_tool_selfrag', fn_schema=<class 'llama_index.core.tools.types.DefaultToolFnSchema'>, return_direct=False)

In [34]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

agent_worker = FunctionCallingAgentWorker.from_tools(
    tool_retriever=obj_retriever,
    llm=llm, 
    system_prompt=""" \
You are an agent designed to answer queries over a set of given papers.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

""",
    verbose=True
)
agent = AgentRunner(agent_worker)

In [35]:
response = agent.query(
    "Tell me about the evaluation dataset used in LongLoRA, "
    "and then tell me about the evaluation results"
)

Added user message to memory: Tell me about the evaluation dataset used in LongLoRA, and then tell me about the evaluation results
=== Calling Function ===
Calling function: summary_tool_longlora with args: {"input": "evaluation dataset used in LongLoRA"}
=== Function Output ===
The evaluation dataset used in LongLoRA includes the book corpus dataset PG19, the cleaned Arxiv Math proof-pile dataset, the PG19 validation set from the Redpajama dataset, the PG19 test split, LongAlpaca-12k containing long-context and short question-answer pairs from the original Alpaca data, and the dataset called SAFECONV.
=== Calling Function ===
Calling function: summary_tool_longlora with args: {"input": "evaluation results"}
=== Function Output ===
The evaluation results consistently demonstrate the effectiveness and efficiency of the proposed methods in extending the context window for Llama2 models, achieving comparable or better performance compared to full attention or full fine-tuning baselines. T

In [36]:
response = agent.query(
    "Tell me about the evaluation dataset used "
    "in MetaGPT and compare it against SWE-Bench"
)

Added user message to memory: Tell me about the evaluation dataset used in MetaGPT and compare it against SWE-Bench
=== Calling Function ===
Calling function: summary_tool_metagpt with args: {"input": "evaluation dataset used in MetaGPT"}
=== Function Output ===
The evaluation dataset used in MetaGPT includes HumanEval, MBPP, and SoftwareDev.
=== Calling Function ===
Calling function: summary_tool_swebench with args: {"input": "evaluation dataset used in SWE-Bench"}
=== Function Output ===
The evaluation dataset used in SWE-Bench consists of software engineering task instances collected from real GitHub repositories, including issues and corresponding pull requests. It includes task instructions, issue text, retrieved files and documentation, example patch files, and prompts for generating patch files. The dataset is constructed by scraping pull requests from popular Python repositories, ensuring that the selected PRs are merged, resolve issues, and introduce new tests. Task instances 