<a href="https://colab.research.google.com/github/towardsai/ai-tutor-rag-system/blob/main/notebooks/Long_Context_Caching_vs_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install Packages and Setup Variables


In [None]:
!pip install -q google-genai==1.35.0 llama-index-llms-google-genai==0.5.0 \
                llama-index==0.14.0 openai==1.107.0 jedi==0.19.2

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.9/11.9 MB[0m [31m101.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.1/56.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.9/63.9 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import time
from IPython.display import Markdown, display

# The API keys below must be present in the process environment; they are used later.
# OpenAI provides the embedding model and Gemini-2.5-flash is our LLM.

# Alternative: set the keys directly.
# os.environ["OPENAI_API_KEY"] = "<YOUR_OPENAI_KEY>"
# os.environ["GOOGLE_API_KEY"] = "<YOUR_API_KEY>"

from google.colab import userdata

# Copy each key from the Colab secrets store into the environment.
for secret_name in ("OPENAI_API_KEY", "GOOGLE_API_KEY"):
    os.environ[secret_name] = userdata.get(secret_name)

In [None]:
import nest_asyncio

# Patch asyncio so an event loop can be started while the notebook's own loop
# is already running. NOTE(review): presumably needed by async calls made by
# LlamaIndex further down — confirm before removing.
nest_asyncio.apply()

# Load Dataset


## Download


The dataset includes a subset of the documentation from the LlamaIndex library.


In [None]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    495      0 --:--:-- --:--:-- --:--:--   497
100  570k  100  570k    0     0  1291k      0 --:--:-- --:--:-- --:--:-- 1291k


## Read File and create LlamaIndex Documents


In [None]:
from llama_index.core import Document
import json


def create_docs(input_file: str) -> list[Document]:
    """Build one LlamaIndex ``Document`` per record of a JSON Lines file.

    Args:
        input_file: Path to a JSONL file. Every record must provide the keys
            ``doc_id``, ``content``, ``url``, ``name``, ``tokens`` and
            ``source``.

    Returns:
        The documents in file order. Blank lines are ignored; records that
        lack a required key are reported on stdout and skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # A stray or trailing blank line is not a record; json.loads would
            # raise on it instead of skipping it.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys listed here are excluded from the text
                    # given to the LLM; only "url" stays visible to it.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys listed here are excluded from the text
                    # that gets embedded; only "title" stays visible there.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL dump into Document objects and report the count.
documents = create_docs(input_file="llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


# Generate Embedding


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Split every document into overlapping chunks, then embed each chunk with
# OpenAI's embedding model to build the vector index.
embed_model = OpenAIEmbedding(model="text-embedding-3-small")
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[sentence_splitter],
    embed_model=embed_model,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/447 [00:00<?, ?it/s]

# Query Dataset


In [None]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.

from llama_index.llms.google_genai import GoogleGenAI

llm = GoogleGenAI(
    model="models/gemini-2.5-flash",
    temperature=1,
)

# Each question retrieves its 5 most similar chunks before the LLM answers.
query_engine = index.as_query_engine(
    similarity_top_k=5,
    llm=llm,
)

In [None]:
# Time one end-to-end RAG query (retrieval plus answer generation).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup a query engine in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

A query engine can be set up in several ways, depending on the complexity and specific needs of the application.

The simplest approach is to have an index create a query engine directly:
```python
query_engine = index.as_query_engine()
```

For more granular control over the querying process, including retrieval, postprocessing, and response synthesis, a `RetrieverQueryEngine` can be assembled:
```python
from llama_index.core import VectorStoreIndex, get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

# build index (assuming 'documents' are already loaded)
index = VectorStoreIndex.from_documents(documents)

# configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=10,
)

# configure response synthesizer
response_synthesizer = get_response_synthesizer()

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.7)],
)
```

For natural language SQL queries against known tables, the `NLSQLTableQueryEngine` can be used:
```python
from llama_index import SQLDatabase
from llama_index.indices.struct_store.sql_query import NLSQLTableQueryEngine

sql_database = SQLDatabase(engine) # 'engine' needs to be defined
query_engine = NLSQLTableQueryEngine(
    sql_database=sql_database,
    tables=["github_issues", "github_comments", "github_users"],
)
```

For multi-document queries or those requiring sub-questions against different data sources, the `SubQuestionQueryEngine` can be defined with a list of `QueryEngineTool` objects:
```python
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.query_engine import SubQuestionQueryEngine

# Assuming sept_engine, june_engine, march_engine are pre-defined query engines
query_engine_tools = [
    QueryEngineTool(
        query_engine=sept_engine,
        metadata=ToolMetadata(name="sept_22", description="..."),
    ),
    QueryEngineTool(
        query_engine=june_engine,
        metadata=ToolMetadata(name="june_22", description="..."),
    ),
    QueryEngineTool(
        query_engine=march_engine,
        metadata=ToolMetadata(name="march_22", description="..."),
    ),
]

query_engine = SubQuestionQueryEngine.from_defaults(
    query_engine_tools=query_engine_tools
)
```

Additionally, when dealing with structured data and not knowing which table to use beforehand, a `SQLTableRetrieverQueryEngine` can be constructed by passing a `SQLDatabase` and a retriever built from an `ObjectIndex` that stores table schemas.

time taken:  8.35205078125


In [None]:
# Time a second end-to-end RAG query (retrieval plus answer generation).
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup an agent in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

To set up an agent, begin by installing the necessary libraries, `llama-index` and `python-dotenv`. You will also need to configure an API key for your chosen language model; for OpenAI, this involves creating a `.env` file in your project's root directory containing `OPENAI_API_KEY=sk-proj-xxxx`.

Next, import the required components in your Python code: `load_dotenv` to load environment variables, `ReActAgent` for the agent itself, `OpenAI` for the language model (or `Ollama` if using a local model), and `FunctionTool` for defining tools.

Define your custom tools by creating Python functions (e.g., `multiply`, `add`) and then wrapping them using `FunctionTool.from_defaults`. These tools are standard Python functions, where docstrings provide metadata for the agent to understand their purpose.

After defining tools, initialize your Large Language Model (LLM). For example, with OpenAI, you would create an instance like `llm = OpenAI(model="gpt-3.5-turbo", temperature=0)`. If opting for local models, you would use `Ollama` after installing `llama-index-llms-ollama` and running a model like `mixtral:8x7b`.

Finally, initialize the agent by instantiating `ReActAgent.from_tools`, passing it an array of your created tools and the initialized LLM. Setting `verbose=True` can help in observing the agent's internal thought process. An example setup would be `agent = ReActAgent.from_tools([multiply_tool, add_tool], llm=llm, verbose=True)`.

time taken:  3.91007661819458


# Setup Long Context Caching


For this section, we will use the Gemini API directly (through the `google-genai` SDK) to cache the full documentation as context.


In [None]:
from google import genai
from google.genai import types

# Read the key from the environment (populated in the setup cell) instead of
# calling Colab's `userdata` a second time, so this cell also works when the
# keys were set manually through os.environ.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

In [None]:
import json


def create_text_file(input_file: str, output_file: str) -> None:
    """Concatenate the ``content`` field of every JSONL record into a text file.

    Args:
        input_file: Path to a JSON Lines file whose records contain a
            ``content`` string.
        output_file: Path of the text file to create (overwritten if present).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``content`` key.
    """
    # Use UTF-8 on both sides so non-ASCII documentation text survives the
    # round trip regardless of the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f, open(
        output_file, "w", encoding="utf-8"
    ) as out:
        for line in f:
            # Blank lines are not records; json.loads would raise on them.
            if not line.strip():
                continue
            data = json.loads(line)
            out.write(data["content"] + "\n\n")  # Add two newlines between documents

    print(f"Contents saved to {output_file}")


# Flatten the dataset into a single text file that can be uploaded for caching.
create_text_file(
    input_file="llama_index_150k.jsonl",
    output_file="llama_index_contents.txt",
)

Contents saved to llama_index_contents.txt


In [None]:
import datetime

# Time-to-live for the cache: 10 minutes, passed as a seconds string ("600s").
ttl_seconds = int(datetime.timedelta(minutes=10).total_seconds())
ttl = f"{ttl_seconds}s"

# Upload the concatenated documentation so the cache can reference it.
document = client.files.upload(file="llama_index_contents.txt")

model_name = "models/gemini-2.0-flash-001"

# The cache stores the uploaded file together with the system instruction.
cache_config = types.CreateCachedContentConfig(
    contents=[document],
    system_instruction="You answer questions about the LlamaIndex framework.",
    ttl=ttl,
)
cache = client.caches.create(model=model_name, config=cache_config)

# # To update the cache

# # Alternatively, update the expire_time directly,
# # using a valid RFC 3339 timestamp (UTC with a "Z" suffix)

# expire_time = (
#     (
#         datetime.datetime.now(datetime.timezone.utc)
#         + datetime.timedelta(minutes=15)
#     )
#     .isoformat()
#     .replace("+00:00", "Z")
# )

# client.caches.update(
#              name=cache.name,
#             config=types.UpdateCachedContentConfig(expire_time=expire_time),
# )

# # To delete the cache

# client.caches.delete(name=cache.name)

In [None]:
# Use the cache for generation and time the full (non-streaming) request.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
import time
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="What is LlamaParse, How to setup? Explain detail",
    config=types.GenerateContentConfig(cached_content=cache.name),
)

end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

Okay, I can help you understand LlamaParse and how to set it up.

**What is LlamaParse?**

LlamaParse is a state-of-the-art document parsing solution developed by LlamaIndex. It's designed to reliably extract text and structure from documents, especially those with complex layouts like PDFs. It's offered as part of LlamaCloud and also as a self-serve API. The goal is to provide production-quality data for your LLM applications.
***Key Features and Benefits:***

1.  **High Accuracy:** LlamaParse excels at accurately extracting content from documents, even those with complex layouts (multi-column layouts, tables, figures, etc.) where other parsers often fail.
2.  **Structure Preservation:** It attempts to preserve the structural elements of the document, not just the raw text. This means it can help you understand headings, tables, and other layout features.
3.  **Production-Ready Data:** The parsing quality makes it suitable for use in production LLM applications, where the accuracy of the data is crucial.
4.  **Easy Integration:** LlamaParse is designed to integrate seamlessly with the LlamaIndex framework.

**How to set up LlamaParse:**
***Option 1: Using LlamaCloud (Hosted Service)***

This is the easiest way to get started, especially for enterprise developers.

**Steps:**

1.  **Sign Up:**  Go to the LlamaCloud website ([https://cloud.llamaindex.ai/](https://cloud.llamaindex.ai/)) and sign up for an account.
2.  **Get an API Key:** Once you're logged in, you'll find your LlamaCloud API key in your account dashboard.
3.  **Set the API Key in your Environment:**  In your Python environment, set the `LLAMA_CLOUD_API_KEY` environment variable:

```bash
export LLAMA_CLOUD_API_KEY=llx-your_api_key
```

4.  **Install `llama-parse` Package:**
```bash
pip install llama-parse
```
5.  **Using the LlamaParse Loader**
```python
from llama_parse import LlamaParse
documents = LlamaParse(result_type="markdown").load_data(
    "./data/2023_canadian_budget.pdf"
)
```
***Option 2: Managed services***

If you're an enterprise developer, check out [**LlamaCloud**](https://llamaindex.ai/enterprise). It is an end-to-end managed service for data parsing, ingestion, indexing, and retrieval, allowing you to get production-quality data for your production LLM application. It's available both hosted on our servers or as a self-hosted solution.

***LlamaParse***

LlamaParse is our state-of-the-art document parsing solution. It's available as part of LlamaCloud and also available as a self-serve API. You can [sign up](https://cloud.llamaindex.ai/) and parse up to 1000 pages/day for free, or enter a credit card for unlimited parsing. [Learn more](https://llamaindex.ai/enterprise).

**Example Code (Python)**

Here's how you would typically use LlamaParse within a LlamaIndex pipeline:

```python
import os
from llama_index.core import VectorStoreIndex
from llama_index.core import SimpleDirectoryReader, Settings

# Set the API key (if you haven't already)
os.environ["LLAMA_CLOUD_API_KEY"] = "llx-your_api_key" #replace with your actual API key

from llama_parse import LlamaParse

# Load data using LlamaParse
documents = LlamaParse(result_type="markdown").load_data(
    "./data/your_document.pdf"  # Replace with your PDF file
)

# Create an index from the parsed documents
index = VectorStoreIndex.from_documents(documents)

# Now you can query the index as usual
query_engine = index.as_query_engine()
response = query_engine.query("Your question about the document")
print(response)
```

**Important Notes:**

*   **Cost:**  Be aware of the pricing for LlamaParse, especially if you're using the self-serve API for a large volume of documents.
*   **Alternatives:**  If LlamaParse doesn't meet your needs (e.g., due to cost or specific document formats), explore other data connectors/readers in LlamaHub ([https://llamahub.ai/](https://llamahub.ai/)). There are many options for different data sources and parsing methods.
*   **Error Handling:** Implement robust error handling in your code to catch potential issues during the parsing process.

Let me know if you have other questions.


time taken:  6.857192277908325


In [None]:
# Inspect token accounting for the cached request: compare
# cached_content_token_count with prompt_token_count to see how much of the
# prompt came from the cache.
response.usage_metadata

GenerateContentResponseUsageMetadata(
  cache_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=212635
    ),
  ],
  cached_content_token_count=212635,
  candidates_token_count=1044,
  candidates_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=1044
    ),
  ],
  prompt_token_count=212646,
  prompt_tokens_details=[
    ModalityTokenCount(
      modality=<MediaModality.TEXT: 'TEXT'>,
      token_count=212646
    ),
  ],
  total_token_count=213690
)

## First token response time in Streaming

In [None]:
# Approximate the time to first token without actually streaming: the response
# is capped at a single output token, so the measured time is roughly the
# latency until the first token is produced.
# perf_counter() is a monotonic clock intended for measuring intervals.
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="How to setup a Router query engine?",
    config=types.GenerateContentConfig(
        cached_content=cache.name,
        max_output_tokens=1,
    ),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

```

time taken:  3.200701951980591
