In [2]:

from llama_index.core import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
)
from llama_index.core import SummaryIndex
from llama_index.core.schema import IndexNode
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.callbacks import CallbackManager
import nest_asyncio
nest_asyncio.apply()
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
import os 
from pathlib import Path
from llama_index.embeddings.llamafile import LlamafileEmbedding
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.tools import FunctionTool
from llama_index.llms.ollama import Ollama
from llama_index.core.agent import ReActAgent
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import load_index_from_storage, StorageContext

In [11]:
Settings.llm = Ollama(model="llama3.1:latest", request_timeout=120.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
parser = LlamaParse(
    api_key="llx-",
    result_type="markdown",
    verbose=True,
)
node_parser = SentenceSplitter()

In [20]:
import ollama
response = ollama.chat(model='llama3.1:latest', messages=[
  {
    'role': 'user',
    'content': 'Why is the sky blue?',
  },
])
print(response['message']['content'])

The sky appears blue to us during the daytime because of a phenomenon called scattering. Here's what happens:

1.  **Sunlight and Atmosphere:** When sunlight enters Earth's atmosphere, it encounters tiny molecules of gases such as nitrogen (N2) and oxygen (O2). These molecules are much smaller than the wavelength of light.

2.  **Scattering of Light:** The shorter wavelengths of light, like blue and violet, scatter off these gas molecules in all directions. This is known as Rayleigh scattering, named after the British physicist Lord Rayleigh who first described it. Blue light is scattered more than red light because its wavelength is smaller, so there are more opportunities for scattering to occur.

3.  **Why We See a Blue Sky:** When we look up at the sky on a clear day, we see this scattered blue light coming from all directions. The Earth's atmosphere acts as a prism, dispersing the sunlight and making it appear blue. The amount of scattering that occurs depends on the wavelength of

In [19]:
print(response.text)

{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.909305694Z","message":{"role":"assistant","content":"Hello"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.920220804Z","message":{"role":"assistant","content":"!"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.927173708Z","message":{"role":"assistant","content":" It"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.934101803Z","message":{"role":"assistant","content":"'s"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.941054889Z","message":{"role":"assistant","content":" nice"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.948035346Z","message":{"role":"assistant","content":" to"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50.95489135Z","message":{"role":"assistant","content":" meet"},"done":false}
{"model":"llama3.1:latest","created_at":"2024-09-25T09:49:50

In [21]:



paper_titles = os.listdir('./data/pdf')
paper_titles = [title.split('.pdf')[0] for title in paper_titles if title.endswith('.pdf')]
city_docs = {}
for paper_title in paper_titles:
    city_docs[paper_title] = parser.load_data(f"./data/pdf/{paper_title}.pdf")

Started parsing the file under job_id 3204812d-9d35-4af8-8d94-164b3c9a2442
Started parsing the file under job_id e0e6203b-6923-45fb-bf41-1632dbcb7ab2
.Started parsing the file under job_id 81d5b2cd-1b76-4e10-9a88-e65e34318501
.Started parsing the file under job_id 1565237c-d40b-4c45-af64-6a8d92006131
Started parsing the file under job_id 832e6877-695c-4f23-b3f7-edf64b1e8d79
.Started parsing the file under job_id 205af550-8517-4705-af25-7812915da43f
Started parsing the file under job_id 8052a58d-935a-460e-b9ec-a2f322135d86
Error while parsing the file './data/pdf/ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems.pdf': 
Started parsing the file under job_id 5dddee1c-2723-49cc-991c-5737513b41d4
Started parsing the file under job_id 77741485-ca15-4653-98b4-5eba6e6ffee8
Started parsing the file under job_id 6ff5ab27-1c0f-4818-bcbf-84da3d506e8b
..........Started parsing the file under job_id b6dd4548-34a4-4e79-9203-57b104190ed1
Started parsing the file under job_id 8357df31-a10a

In [23]:
paper_index = {}
for paper_title in paper_titles:

    # build index
    paper_index[paper_title] = VectorStoreIndex.from_documents(city_docs[paper_title])

    # persist index
    paper_index[paper_title].storage_context.persist(persist_dir=f"./storage/{paper_title}")

In [24]:



# Build agents dictionary
agents = {}
query_engines = {}

# this is for the baseline
all_nodes = []


for idx, paper_title in enumerate(paper_titles):
    nodes = node_parser.get_nodes_from_documents(city_docs[paper_title])
    all_nodes.extend(nodes)

    if not os.path.exists(f"./data/{paper_title}"):
        # build vector index
        vector_index = VectorStoreIndex(nodes)
        vector_index.storage_context.persist(
            persist_dir=f"./data/{paper_title}"
        )
    else:
        vector_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=f"./data/{paper_title}"),
        )

    # build summary index
    summary_index = SummaryIndex(nodes)
    
    # define query engines
    vector_query_engine = vector_index.as_query_engine(llm=Settings.llm)
    summary_query_engine = summary_index.as_query_engine(llm=Settings.llm)

    # define tools
    query_engine_tools = [
        QueryEngineTool(
            query_engine=vector_query_engine,
            metadata=ToolMetadata(
                name="vector_tool",
                description = (
        f"Useful for answering questions about the academic paper titled '{paper_title}'. "
        "This tool can provide information on various aspects of the paper, including but not limited to:"
        "\n- The main research question or hypothesis"
        "\n- Methodology and experimental design"
        "\n- Key findings and results"
        "\n- Theoretical framework and background"
        "\n- Implications and conclusions"
        "\n- Related work and literature review"
        "\n- Limitations and future research directions"
        "\nUse a specific question about the paper as input to this tool."
                ),
            ),
        ),
        QueryEngineTool(
            query_engine=summary_query_engine,
            metadata=ToolMetadata(
                name="summary_tool",
                description=(
                    "Useful for any requests that require a holistic summary"
                    f" of EVERYTHING about {paper_title}. For questions about"
                    " more specific sections, please use the vector_tool."
                ),
            ),
        ),
    ]

    # build agent
    function_llm = Ollama(model="llama3:latest", request_timeout=120.0)
    agent = ReActAgent.from_tools(
        
        query_engine_tools,
        max_iterations=100,
        llm=function_llm,
        verbose=True,
        system_prompt=f"""\
You are a specialized agent designed to answer queries about {paper_title}.
You must ALWAYS use at least one of the tools provided when answering a question; do NOT rely on prior knowledge.\
""",
    )

    agents[paper_title] = agent
    query_engines[paper_title] = vector_index.as_query_engine(
        similarity_top_k=2
    )


In [2]:
import os
os.listdir('storage')

['lyft',
 'A Study on the Implementation Method of an Agent-Based Advanced RAG System Using Graph',
 'Exploring Advanced Large Language Models with LLMsuite',
 'Toolformer: Language Models Can Teach Themselves to Use Tools',
 'Learning Transferable Visual Models From Natural Language Supervision',
 'ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems',
 'Optimizing RAG Techniques for Automotive Industry PDF Chatbots: A Case Study with Locally Deployed Ollama Models',
 'ColPali: Efficient Document Retrieval with Vision Language Models',
 'LoRA: Low-Rank Adaptation of Large Language Models',
 'Graph Retrieval-Augmented Generation: A Survey',
 'uber',
 'Deep Time Series Models: A Comprehensive Survey and Benchmark']

In [22]:
# define tool for each document agent
all_tools = []
for paper_title in paper_titles:
    paper_summary = (
        f"This content contains paper articles about {paper_title}. Use"
        f" this tool if you want to answer any questions about {paper_title}.\n"
    )
    doc_tool = QueryEngineTool(
        query_engine=agents[paper_title],
        metadata=ToolMetadata(
            name=f"tool_{paper_title}",
            description=paper_summary,
        ),
    )
    all_tools.append(doc_tool)

In [23]:
# define an "object" index and retriever over these tools
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex

obj_index = ObjectIndex.from_objects(
    all_tools,
    index_cls=VectorStoreIndex,
)

top_agent = ReActAgent.from_tools(
    max_iterations=100,
    tool_retriever=obj_index.as_retriever(similarity_top_k=3),
    system_prompt=""" \
You are an agent designed to answer queries about a set of given paper.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

""",
    verbose=True,
)


In [24]:
response = top_agent.query("Tell me about ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems")

> Running step fef80232-14d6-49a2-b4d6-1f0c056bc84d. Step input: Tell me about ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: tool_ReDel
Action Input: {'input': 'ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems', 'num_beams': 5}
[0m[1;3;34mObservation: Error: No such tool named `tool_ReDel`.
[0m> Running step d16350ac-b131-4792-b4a1-19c7d65ee9cf. Step input: None
[1;3;38;5;200mThought: The user pointed out that there is no such tool as `tool_ReDel`. I need to use a different tool.
Action: tool_A
Action Input: {'input': 'ReDel: A Toolkit for LLM-Powered Recursive Multi-Agent Systems', 'num_beams': 5}
[0m[1;3;34mObservation: Error: No such tool named `tool_A`.
[0m> Running step 1f2c156b-b5c5-4b5f-bd51-91afe1659555. Step input: None
[1;3;38;5;200mThought: The user pointed out that there is no such tool as `tool_A`. I need 

In [18]:

base_index = load_index_from_storage(
            StorageContext.from_defaults(persist_dir=f"./storage/all_datas"),
        )

#base_index.storage_context.persist(persist_dir=f"./storage/all_datas")

In [24]:
base_query_engine = base_index.as_query_engine(similarity_top_k=4)
response = base_query_engine.query("Tell me detail about ColPali")

In [25]:
response = base_query_engine.query("I wnat to know convex hull's solution")

In [26]:
response.response

"To find the convex hull of a set of points, you first need to sort them in either clockwise or counterclockwise order. This can be done using comparison-based sorting algorithms like quicksort or mergesort. However, if you have a large number of points and memory is a concern, you might want to consider an efficient sorting algorithm that doesn't require extra space.\n\nAfter sorting the points, you'll need to iterate through them to build the convex hull. The key insight here is to keep track of the upper and lower points as you go, and to remove any point that lies below the line formed by the two most recently added points.\n\nOne efficient way to implement this step is to use a sweep line algorithm or a stack-based approach. The basic idea behind these algorithms is to maintain a list of active points (i.e., those that are part of the convex hull) as you scan through all the points in order.\n\nIn more detail, when adding a new point to the convex hull, if it's left of the current

In [10]:
docs = parser.load_data('data/pdf/筆記.pdf')

Started parsing the file under job_id f02fed19-4046-401d-a213-a4aef473d9b9


In [22]:
nodes = node_parser.get_nodes_from_documents(docs)

In [23]:
base_index.insert_nodes(nodes)

In [3]:
from llama_index.core import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    StorageContext,
    load_index_from_storage,
)

Settings.llm = Ollama(model="llama3:latest", request_timeout=120.0)
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

from llama_index.core.tools import QueryEngineTool, ToolMetadata
try:
    storage_context = StorageContext.from_defaults(
        persist_dir="./storage/lyft"
    )
    lyft_index = load_index_from_storage(storage_context)

    storage_context = StorageContext.from_defaults(
        persist_dir="./storage/uber"
    )
    uber_index = load_index_from_storage(storage_context)

    index_loaded = True
except:
    index_loaded = False

if not index_loaded:
    # load data
    lyft_docs = SimpleDirectoryReader(
        input_files=["./data/10k/lyft_2021.pdf"]
    ).load_data()
    uber_docs = SimpleDirectoryReader(
        input_files=["./data/10k/uber_2021.pdf"]
    ).load_data()

    # build index
    lyft_index = VectorStoreIndex.from_documents(lyft_docs)
    uber_index = VectorStoreIndex.from_documents(uber_docs)

    # persist index
    lyft_index.storage_context.persist(persist_dir="./storage/lyft")
    uber_index.storage_context.persist(persist_dir="./storage/uber")

invalid pdf header: b'\xfd7zXZ'
incorrect startxref pointer(1)
invalid pdf header: b'\xfd7zXZ'
incorrect startxref pointer(1)


In [20]:
lyft_engine = lyft_index.as_query_engine(similarity_top_k=3)
uber_engine = uber_index.as_query_engine(similarity_top_k=3)
query_engine_tools = [
    QueryEngineTool(
        query_engine=lyft_engine,
        metadata=ToolMetadata(
            name="lyft_10k",
            description=(
                "Provides information about Lyft financials for year 2021. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=uber_engine,
        metadata=ToolMetadata(
            name="uber_10k",
            description=(
                "Provides information about Uber financials for year 2021. "
                "Use a detailed plain text question as input to the tool."
            ),
        ),
    ),
]

In [21]:

from llama_index.core.agent import ReActAgent

agent = ReActAgent.from_tools(
    query_engine_tools,
    llm=Settings.llm,
    verbose=True,
    # context=context
)

In [22]:

response = agent.chat("What was Lyft's revenue growth in 2021?")
print(str(response))

> Running step e1b5553b-b1df-4aeb-a358-d8bba1264489. Step input: What was Lyft's revenue growth in 2021?
[1;3;38;5;200mThought: The current language of the user is English. I need to use a tool to help me answer the question.
Action: lyft_10k
Action Input: {'input': "Lyft's revenue growth in 2021"}
[0m[1;3;34mObservation: Lyft's revenue reached an all-time high in the three months ended December 31, 2021, increasing compared to the previous quarter. This was driven by an increase in ride frequency and a shift toward higher revenue rides, as well as revenues from licensing and data access agreements.
[0m> Running step 116365d0-e809-4e65-aee9-4b2b14b9b925. Step input: None
[1;3;38;5;200mThought: I can answer without using any more tools. I'll use the user's language to answer
Answer: Lyft experienced significant revenue growth in 2021, specifically in the last quarter of the year, due to increased ride frequency and a shift towards higher-revenue rides, as well as revenues from lice

In [20]:

paper_titles = os.listdir('./storage')
vector_index = None
for paper_title in paper_titles:
    temp_vector_index = load_index_from_storage(
        StorageContext.from_defaults(persist_dir=f"./storage/{paper_title}"),
    )
    temp_vector_index.node
    print(paper_title)
    print(temp_vector_index)
    if vector_index is None:
        vector_index = temp_vector_index
    else:
        vector_index.insert(temp_vector_index)


lyft
<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x7451306659c0>
A Study on the Implementation Method of an Agent-Based Advanced RAG System Using Graph
<llama_index.core.indices.vector_store.base.VectorStoreIndex object at 0x745130665de0>


AttributeError: 'VectorStoreIndex' object has no attribute 'id_'

In [15]:
print(vector_index)

None
