# Install Packages and Setup Variables


In [1]:
!pip install -q google-generativeai==0.8.5 google-genai==1.27.0 llama-index-llms-google-genai==0.3.0 llama-index==0.13.0 openai==1.92.0 jedi==0.19.2

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/753.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m753.4/753.4 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/41.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.2/313.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m47.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
import os
import time
from IPython.display import Markdown, display

# Export the API keys as environment variables so they can be picked up later.
# OpenAI is used for the embedding model and Gemini-2.5-flash as our LLM.
# Replace the placeholders with your own keys; never commit real keys with the notebook.

os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_KEY>"
os.environ["GOOGLE_API_KEY"] = "<YOUR_API_KEY>"

# Google Colab alternative: uncomment the lines below to read the keys from
# Colab's `userdata` store instead of hardcoding them above.
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')
# os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

# Load Dataset


## Download


The dataset includes a subset of the documentation from the Llama-index library.


In [3]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0   1034      0 --:--:-- --:--:-- --:--:--  1026
100  570k  100  570k    0     0  2809k      0 --:--:-- --:--:-- --:--:-- 2809k


## Read File and create LlamaIndex Documents


In [4]:
from llama_index.core import Document
import json


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSONL file.

    Each non-blank line of ``input_file`` is parsed as one JSON object. Records
    that lack any of the required keys are reported with ``print`` and skipped.

    Args:
        input_file: Path to a JSON Lines file whose records provide the keys
            ``doc_id``, ``content``, ``url``, ``name``, ``tokens`` and ``source``.

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Explicit UTF-8 so parsing does not depend on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys excluded from the text shown to the LLM.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys excluded from the text that gets embedded.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL file into Document objects and report how many were loaded.
documents = create_docs("llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


# Generate Embedding


In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Build the vector index: each document is first split by SentenceSplitter
# (chunk_size=512, chunk_overlap=128), then every resulting chunk is embedded
# with OpenAI's text-embedding-3-small model. show_progress=True displays
# progress bars for both the parsing and the embedding steps.
index = VectorStoreIndex.from_documents(
    documents,
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    transformations=[SentenceSplitter(chunk_size=512, chunk_overlap=128)],
    show_progress=True,
)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/447 [00:00<?, ?it/s]

# Query Dataset


In [6]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using an LLM to formulate the final answer.

from llama_index.llms.google_genai import GoogleGenAI

# NOTE(review): temperature=1 presumably makes answers vary between runs;
# consider a lower value if reproducible outputs matter.
llm = GoogleGenAI(model="models/gemini-2.5-flash", temperature=1)

# similarity_top_k=5: pass the 5 most similar chunks to the LLM for each query.
query_engine = index.as_query_engine(llm=llm, similarity_top_k=5)

In [7]:
# Time one RAG query end to end (retrieval + LLM answer).
# time.perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup a query engine in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

A query engine can be set up in several ways:

1.  **From an existing index**: The simplest method is to create a query engine directly from an index using `index.as_query_engine()`.
2.  **For SQL tables**: If you know the specific tables you want to query, you can use `NLSQLTableQueryEngine` by providing an `SQLDatabase` object and a list of table names.
3.  **For unknown tables (Table Index)**: When the tables are not known beforehand or the schema is too large, you can build an `ObjectIndex` (e.g., a `VectorStoreIndex` of `SQLTableSchema` objects) and then construct a `SQLTableRetrieverQueryEngine` using a retriever from this object index and the `SQLDatabase`.
4.  **With custom stages (RetrieverQueryEngine)**: For granular control, you can assemble a `RetrieverQueryEngine` by configuring its components: a `retriever` (e.g., `VectorIndexRetriever`), a `response_synthesizer`, and optional `node_postprocessors` like `SimilarityPostprocessor`.
5.  **For multi-document queries (SubQuestionQueryEngine)**: To handle queries across multiple data sources or documents, you can define an index for each source, wrap them with `QueryEngineTool` objects, and then create a `SubQuestionQueryEngine` using these tools.

time taken:  3.817213773727417


In [8]:
# Time a second RAG query end to end (retrieval + LLM answer).
# time.perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup an agent in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

To set up an agent in code, begin by importing the necessary components such as `ReActAgent`, your chosen Large Language Model (LLM) like `OpenAI`, and `FunctionTool`. Ensure environment variables are loaded if needed.

Next, define the tools the agent will use. These are typically Python functions, like those for multiplication or addition, that are then wrapped into `FunctionTool` objects. The docstrings of these functions serve as metadata, informing the agent about each tool's purpose.

After defining tools, initialize your LLM. For example, you can use `OpenAI(model="gpt-3.5-turbo", temperature=0)`. Other models accessible via API or local models like Mixtral via Ollama can also be used.

Finally, initialize the agent itself. A `ReActAgent` can be created by providing it with an array of the defined tools and the initialized LLM, optionally setting `verbose=True` to observe its internal process.

time taken:  3.2998409271240234


# Setup Long Context Caching


For this section, we will use the Gemini API directly (through the `google-genai` SDK).


In [9]:
from google import genai
from google.genai import types

# Read the key from the environment variable set in the setup cell. The previous
# `userdata.get(...)` call raised NameError on a fresh kernel because the
# `google.colab.userdata` import is commented out.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

In [10]:
import json


def create_text_file(input_file: str, output_file: str) -> None:
    """Concatenate the ``content`` field of every JSONL record into a text file.

    Each record's content is written followed by a blank line, so documents are
    separated by two newlines. Blank lines in the input are skipped.

    Args:
        input_file: Path to a JSON Lines file; every record must have a
            ``content`` key.
        output_file: Path of the text file to create (overwritten if it exists).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``content`` key.
    """
    # Explicit UTF-8 on both sides so the result does not depend on the
    # platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f, open(
        output_file, "w", encoding="utf-8"
    ) as out:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which would otherwise make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            out.write(data["content"] + "\n\n")  # Add two newlines between documents

    print(f"Contents saved to {output_file}")


# Flatten the downloaded dataset into a single text file.
create_text_file("llama_index_150k.jsonl", "llama_index_contents.txt")

Contents saved to llama_index_contents.txt


In [12]:
import datetime

# Time-to-live for the new cache, expressed as a seconds string ("600s" = 10 minutes).

ttl = f"{int(datetime.timedelta(minutes=10).total_seconds())}s"

# Upload the concatenated documentation file so it can be used as cache content.
document = client.files.upload(file="llama_index_contents.txt")

# The same model name is used below both to create the cache and to generate from it.
model_name = "models/gemini-2.0-flash-001"

# Create the cached content: the uploaded file plus a system instruction,
# kept for the duration given by `ttl`.
cache = client.caches.create(
    model=model_name,
    config=types.CreateCachedContentConfig(
        contents=[document],
        system_instruction="You answer questions about the LlamaIndex framework.",
        ttl=ttl,
    ),
)

# # To update the cache

# # Instead of a relative ttl, you can set an absolute expire_time on the cache.
# # Provide the expire_time in valid RFC 3339 format (UTC with a "Z" suffix)

# expire_time = (
#     (
#         datetime.datetime.now(datetime.timezone.utc)
#         + datetime.timedelta(minutes=15)
#     )
#     .isoformat()
#     .replace("+00:00", "Z")
# )

# client.caches.update(
#              name=cache.name,
#             config=types.UpdateCachedContentConfig(expire_time=expire_time),
# )

# # To delete cache

# client.caches.delete(name=cache.name)

In [17]:
# Use the cache for generation: the request references the cached content by
# name instead of re-sending the documentation in the prompt.
import time

# time.perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="What is LlamaParse, How to setup? Explain detail",
    config=types.GenerateContentConfig(cached_content=cache.name),
)

end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

LlamaParse is a state-of-the-art document parsing solution developed by the LlamaIndex team. It excels at accurately extracting text and data from complex documents, including PDFs with intricate layouts like tables and multi-column text.

Here's a detailed explanation and setup guide:

**What is LlamaParse?**

*   **Document Parsing Excellence:** LlamaParse is designed to overcome the challenges of parsing documents, especially PDFs. It handles complex layouts, tables, and other formatting intricacies to provide a clean and structured representation of your document's content.

*   **Powered by LlamaCloud:** LlamaParse is offered as part of LlamaCloud, an end-to-end managed service for data parsing, ingestion, indexing, and retrieval. This allows for production-quality data extraction.

*   **Self-Serve API:** Besides being part of LlamaCloud, LlamaParse is also available as a self-serve API, offering flexibility for different use cases.

**How to Set Up LlamaParse**

There are two primary ways to set up and use LlamaParse: through LlamaCloud or as a self-serve API.

**1. Using LlamaCloud (Managed Service)**

   *   **Sign Up:** Register for a LlamaCloud account at [https://cloud.llamaindex.ai/](https://cloud.llamaindex.ai/).
   *   **API Key:** Upon signing up, you'll receive a LlamaCloud API key. Store this key securely.
   *   **Integration:**  Use the LlamaCloud API key to access LlamaParse within the LlamaIndex framework.

**2. Using the Self-Serve API**

   *   **Sign Up:**  Register for a LlamaCloud account at [https://cloud.llamaindex.ai/](https://cloud.llamaindex.ai/). This is needed to obtain an API Key.
   *   **API Key:** Store the API key.
   *   **Install `llama-parse`:** Make sure you have the `llama-parse` python package installed.

        ```bash
        pip install llama-parse
        ```

   *   **Code Example:** Use the `LlamaParse` class directly in your code:

        ```python
        from llama_parse import LlamaParse

        # Initialize LlamaParse with your LlamaCloud API key
        parser = LlamaParse(api_key="YOUR_LLAMA_CLOUD_API_KEY", result_type="markdown")

        # Load data from a PDF file
        documents = parser.load_data("./path/to/your/document.pdf")

        # The documents variable now contains the parsed content in Markdown format.
        ```

**Important Considerations:**

*   **API Key Security:** Protect your LlamaCloud API key. Do not expose it in public repositories or client-side code. Store it as an environment variable.
*   **Free Usage:** LlamaCloud offers a free tier that allows parsing up to 1000 pages per day. For higher usage, you may need to add a credit card to your account.
*   **Asynchronous Parsing:**  Parsing large documents can take time. LlamaParse might operate asynchronously, meaning you'll submit the document and retrieve the results later.

**Example in context of general LlamaIndex RAG pipeline:**

```python
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader, Settings
from llama_index.llms.openai import OpenAI
from llama_parse import LlamaParse

# 1. Configure LLM
Settings.llm = OpenAI(model="gpt-3.5-turbo", temperature=0)

# 2. Load documents with LlamaParse
documents = LlamaParse(result_type="markdown").load_data(
    "./data/2023_canadian_budget.pdf"
)

# 3. Build index
index = VectorStoreIndex.from_documents(documents)

# 4. Setup query engine
query_engine = index.as_query_engine()

# 5. Query index
response = query_engine.query(
    "How much exactly was allocated to a tax credit to promote investment in green technologies in the 2023 Canadian federal budget?"
)
print(response)
```

time taken:  8.745552778244019


In [18]:
# Inspect token accounting for the previous request; `cached_content_token_count`
# reports how many of the prompt tokens came from the cache.
response.usage_metadata

GenerateContentResponseUsageMetadata(
  cache_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=212635
    ),
  ],
  cached_content_token_count=212635,
  candidates_token_count=908,
  candidates_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=908
    ),
  ],
  prompt_token_count=212646,
  prompt_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=212646
    ),
  ],
  total_token_count=213554
)

## First token response time in Streaming

In [16]:
# Approximate time-to-first-token: this is a regular (non-streaming) request
# capped at a single output token, so the elapsed time is dominated by prompt
# processing rather than by generating the answer.
# time.perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() is wall-clock time and can jump if the system clock is adjusted.
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="How to setup a Router query engine?",
    config=types.GenerateContentConfig(
        cached_content=cache.name,
        max_output_tokens=1,
    ),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

Based

time taken:  3.675197124481201
