In [1]:
# Fetch the OpenAI API key through the project's helper module instead of
# hard-coding it in the notebook.
# NOTE(review): OPENAI_API_KEY is not referenced again in the visible cells;
# presumably the OpenAI clients created below pick the key up from the
# environment -- confirm against helper.py.
from helper import get_openai_api_key
OPENAI_API_KEY = get_openai_api_key()


In [2]:
# Patch asyncio so an event loop can be re-entered from inside the notebook's
# already-running loop. Presumably required by the summary query engine, which
# is created with use_async=True further down.
import nest_asyncio
nest_asyncio.apply()

In [4]:
#!wget "https://openreview.net/pdf?id=VtmBAGCN7o" -O metagpt.pdf

--2024-05-11 04:14:16--  https://openreview.net/pdf?id=VtmBAGCN7o
Resolving openreview.net (openreview.net)... 35.184.86.251
Connecting to openreview.net (openreview.net)|35.184.86.251|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16911937 (16M) [application/pdf]
Saving to: ‘metagpt.pdf’


2024-05-11 04:14:18 (22.4 MB/s) - ‘metagpt.pdf’ saved [16911937/16911937]



## Load Data 

In [3]:
from llama_index.core import SimpleDirectoryReader

# Read the input PDF; the resulting `documents` feed the node splitter in the
# next cell. `file` is reused later when the router is rebuilt via utils.
# The MetaGPT paper was the earlier input and is kept here for reference.
# NOTE(review): no visible cell downloads or creates table.pdf -- presumably it
# must already sit next to the notebook; confirm.
# file = "metagpt.pdf"
file = "table.pdf"
documents = SimpleDirectoryReader(input_files=[file]).load_data()

## Split documents into nodes, then define LLM and embedding model

In [4]:
from llama_index.core.node_parser import SentenceSplitter
# Break the loaded documents into nodes; both indexes below are built from
# this same `nodes` list.
# chunk_size caps the size of each chunk (measured in tokens according to the
# SentenceSplitter docs -- confirm).
splitter = SentenceSplitter(chunk_size=1024)
nodes = splitter.get_nodes_from_documents(documents)

In [5]:
from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

# Register the chat model and the embedding model on llama_index's Settings
# object.
# NOTE(review): presumably these act as the defaults for every index, query
# engine and selector created after this cell -- confirm.
Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

## Define Summary and vector index on the same data

In [6]:
from llama_index.core import SummaryIndex, VectorStoreIndex

# Build two indexes over the very same nodes: one backs the summarization
# query engine, the other backs the retrieval query engine defined below.
# NOTE(review): constructing the vector index presumably calls the embedding
# model configured on Settings -- confirm.
summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)


## Define Query engines and set metadata

In [7]:
# Summary engine: answers by tree-summarizing over the summary index, with
# async execution enabled.
summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True)

# Vector engine: library-default query engine over the vector index.
vector_query_engine = vector_index.as_query_engine()


In [10]:
# from llama_index.core.tools import QueryEngineTool

# summary_tool = QueryEngineTool.from_defaults(
#     query_engine=summary_query_engine,
#     description = (
#         "Useful for summarization questions related to MetaGPT"
#     ),
# )
# vector_tool = QueryEngineTool.from_defaults(
#     query_engine = vector_query_engine,
#     description = (
#         "Useful for retrieving specific context from the MetaGPT paper."
#     ),
# )

In [8]:
from llama_index.core.tools import QueryEngineTool

# Wrap each query engine as a tool. The router trace printed further down
# quotes these descriptions, so they are what the selector bases its choice on;
# keep them specific to the document being queried.
summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description="Useful for summarization questions related to disability",
)
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description="Useful for retrieving specific context from the disability table.",
)

## Define Router query engine

In [9]:
from llama_index.core.query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector

# Router that sends each query to one of the two tools using an LLM-based
# single selector. Tool order matters for reading the trace: index 0 is the
# summary tool, index 1 is the vector tool. verbose=True prints the
# "Selecting query engine N: ..." line seen in the outputs.
query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[summary_tool, vector_tool],
    verbose=True,
)

In [10]:
# Broad summarization question; the trace below shows it routed to engine 0
# (the summary tool). print() applies str() to the response itself.
response = query_engine.query("What is the summary of the document?")
print(response)

[1;3;38;5;200mSelecting query engine 0: The document is likely focused on summarization questions related to disability..
[0mThe document contains an example table displaying data related to different disability categories, including the number of participants, ballots completed, incomplete/terminated ballots, results accuracy, and time taken to complete tasks for each category. The disability categories mentioned are Blind, Low Vision, Dexterity, and Mobility. The table provides specific data points for each category, such as the number of participants, ballots completed, incomplete/terminated ballots, results accuracy in percentage with corresponding sample sizes, and the time taken to complete tasks in seconds with corresponding sample sizes.


In [11]:
# Count how many source nodes backed the summary response from the previous cell.
print(len(response.source_nodes))

1


In [18]:
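# NOTE: MetaGPT-specific query kept for reference only; it is disabled for
# table.pdf, and the output shown below is left over from the earlier MetaGPT run.
# response = query_engine.query(
#     "how do agents share information with other agents?")
# print(str(response))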

[1;3;38;5;200mSelecting query engine 1: This choice is more relevant as it specifically mentions retrieving specific context from the MetaGPT paper, which would likely include information on how agents share information..
[0mAgents share information with other agents by utilizing a shared message pool where they can publish structured messages. This shared message pool allows all agents to exchange messages directly, enabling them to both publish their own messages and access messages from other agents transparently. Agents can retrieve required information directly from this shared pool, eliminating the need to inquire about other agents and wait for their responses, thus enhancing communication efficiency.


In [12]:
# Category-specific question against the hand-built router; the trace below
# shows which engine the selector picked. print() applies str() itself.
response = query_engine.query("tell me info about blind disability")
print(response)

[1;3;38;5;200mSelecting query engine 0: The first choice is most relevant as it specifically mentions summarization questions related to disability, which would likely include information about blind disability..
[0mThe blind disability category had 5 participants, with 1 completing all ballots and 4 ballots being incomplete or terminated. The results accuracy for this category was 34.5% based on one completed ballot. The average time taken to complete the tasks for this category was 1199 seconds.


## Everything together

In [19]:
# from utils import get_router_query_engine
# query_engine = get_router_query_engine("metagpt.pdf")


In [13]:
from utils import get_router_query_engine
# Rebuild the router through the utils helper for the same input file; this
# replaces the hand-built `query_engine` from the cells above.
# NOTE(review): the router trace printed by the next query refers to "the
# MetaGPT paper", which suggests the helper's tool descriptions are
# MetaGPT-specific rather than derived from `file` -- confirm in utils.py.
query_engine = get_router_query_engine(file)

In [14]:
# Same kind of question, now answered by the helper-built router.
# print() applies str() to the response itself.
response = query_engine.query("Tell me about the total number of participants")
print(response)

[1;3;38;5;200mSelecting query engine 1: The total number of participants is specific context that can be retrieved from the MetaGPT paper..
[0mThe total number of participants across all disability categories is 18.
