# Install Packages and Setup Variables


In [1]:
!pip install -q google-generativeai==0.5.4 llama-index-llms-gemini==0.3.7 llama-index==0.11.23 openai==1.59.6

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.8/454.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.1/679.1 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import time
from IPython.display import Markdown, display

# Make the following API keys available in the Python environment. Will be used later.
# We use OpenAI for the embedding model and Gemini-1.5-flash as our LLM.
#
# A key that is already set in the environment is kept as-is; a missing key is
# requested through a hidden prompt, so no secret is ever written into the
# notebook and a valid key is never overwritten by a placeholder string.
from getpass import getpass

for key_name in ("OPENAI_API_KEY", "GOOGLE_API_KEY"):
    if not os.environ.get(key_name):
        os.environ[key_name] = getpass(f"Enter {key_name}: ")

# Alternatively, on Google Colab, read the keys from the notebook secrets:
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
# os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

# Load Dataset


## Download


The dataset includes a subset of the documentation from the LlamaIndex library.


In [3]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    487      0 --:--:-- --:--:-- --:--:--   489
100  570k  100  570k    0     0  1508k      0 --:--:-- --:--:-- --:--:-- 1508k


## Read File and create LlamaIndex Documents


In [4]:
from llama_index.core import Document
import json


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSON Lines file.

    Every non-blank line of ``input_file`` is parsed as one JSON record.
    Records that lack any of the required keys are reported (with their
    1-based line number) and skipped.

    Args:
        input_file: Path to the ``.jsonl`` file to read.

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Hoisted out of the loop: the set of required keys never changes.
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys listed here are excluded from the LLM view.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys listed here are excluded from the embedding view.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL file into Document objects and report how many were created.
documents = create_docs(input_file="llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


# Generate Embedding


In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Split the documents with SentenceSplitter, embed the resulting nodes with the
# OpenAI embedding model, and collect everything in a vector store index.
index = VectorStoreIndex.from_documents(
    documents=documents,
    transformations=[SentenceSplitter(chunk_overlap=128, chunk_size=512)],
    embed_model=OpenAIEmbedding(model="text-embedding-3-small"),
    show_progress=True,
)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/447 [00:00<?, ?it/s]

# Query Dataset


In [6]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.

from llama_index.llms.gemini import Gemini

# Gemini model that formulates the final answer from the retrieved text.
llm = Gemini(
    max_tokens=1000,
    temperature=1,
    model="models/gemini-1.5-flash",
)

# Query engine over the index; similarity_top_k=10 is passed to the retrieval step.
query_engine = index.as_query_engine(similarity_top_k=10, llm=llm)

In [7]:
# Measure how long one query takes end to end.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup a query engine in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

There are several ways to set up a query engine, depending on your data and needs.  If you know which tables you'll query beforehand, and the combined size of the table schema and prompt fits within your context window, use a query engine directly.  Otherwise, if the table schema size exceeds your context window, store it in an index using `SQLTableNodeMapping` and `ObjectIndex`, then build a `SQLTableRetrieverQueryEngine`.  For simpler cases, use `index.as_query_engine()`.  More complex scenarios involving multiple documents or data sources can utilize a `SubQuestionQueryEngine` or `RouterQueryEngine`, combining individual query engines into tools and defining the overall engine over these tools.  Customizing retrieval, post-processing, and response synthesis is also possible using a low-level composition API, allowing granular control over the querying process.


time taken:  4.233275890350342


In [8]:
# Measure how long one query takes end to end.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

response = query_engine.query("How to setup an agent in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

To create an agent, you'll need to import necessary components from LlamaIndex, load environment variables (if using a `.env` file), and define tools as Python functions wrapped in `FunctionTool` objects.  Then, initialize the LLM (e.g., using `OpenAI` for OpenAI models or `Ollama` for local models) and create the agent using `ReActAgent.from_tools()`, providing the tools and LLM.  Finally, you can interact with the agent using `.chat()`.


time taken:  5.146414756774902


# Setup Long Context Caching


For this section, we will be using the Gemini API


Note: You might encounter dependency issues, which may require restarting the session (delete the runtime and reinstall the packages). Please reinstall google-generativeai to the latest version. To use long-context caching in google-generativeai, ensure you have version 0.7.2 or higher.

In [9]:
!pip install -q google-generativeai==0.8.3 llama-index==0.12.12 llama-index-llms-gemini==0.4.4

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/160.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.8/160.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m760.0/760.0 kB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.1/599.1 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.2/129.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [22]:
import os
from getpass import getpass

# Keep a GOOGLE_API_KEY that is already set in the environment; otherwise ask
# for it through a hidden prompt so the secret is never stored in the notebook
# and a valid key is never overwritten by a placeholder string.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter GOOGLE_API_KEY: ")

# Alternatively, on Google Colab, read the key from the notebook secrets:
# from google.colab import userdata
# os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

In [23]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    565      0 --:--:-- --:--:-- --:--:--   563
100  570k  100  570k    0     0  1663k      0 --:--:-- --:--:-- --:--:-- 1663k


In [24]:
import time
import json
from llama_index.core import Document
from IPython.display import Markdown, display


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSON Lines file.

    Every non-blank line of ``input_file`` is parsed as one JSON record.
    Records that lack any of the required keys are reported (with their
    1-based line number) and skipped.

    Args:
        input_file: Path to the ``.jsonl`` file to read.

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Hoisted out of the loop: the set of required keys never changes.
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys listed here are excluded from the LLM view.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys listed here are excluded from the embedding view.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL file into Document objects and report how many were created.
documents = create_docs(input_file="llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


In [25]:
from google import genai
from google.genai import types

# Create the genai client, authenticating with the key stored in GOOGLE_API_KEY.
client = genai.Client(api_key=os.environ["GOOGLE_API_KEY"])

In [26]:
import json


def create_text_file(input_file: str, output_file: str) -> None:
    """Concatenate the ``content`` field of every JSONL record into a text file.

    Each non-blank line of ``input_file`` is parsed as a JSON object and its
    ``"content"`` value is written to ``output_file`` followed by two newlines.
    A confirmation message is printed when the file has been written.

    Args:
        input_file: Path to the ``.jsonl`` file to read.
        output_file: Path of the text file to create (overwritten if present).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"content"`` key.
    """
    # Use UTF-8 explicitly on both sides so reading and writing do not depend
    # on the platform's default text encoding.
    with open(input_file, "r", encoding="utf-8") as f, open(
        output_file, "w", encoding="utf-8"
    ) as out:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            data = json.loads(line)
            out.write(data["content"] + "\n\n")  # Add two newlines between documents

    print(f"Contents saved to {output_file}")


# Write the content of every dataset record into a single text file.
create_text_file(
    input_file="llama_index_150k.jsonl",
    output_file="llama_index_contents.txt",
)

Contents saved to llama_index_contents.txt


In [32]:
# Model used both to create the cache and to generate from it later on.
model_name = "gemini-1.5-flash-001"

# Upload the combined documentation text file.
document = client.files.upload(file="llama_index_contents.txt")

# Create cached content holding the uploaded file and the system instruction.
cache = client.caches.create(
    config=types.CreateCachedContentConfig(
        system_instruction="You answer questions about the LlamaIndex framework.",
        contents=[document],
    ),
    model=model_name,
)

# # To Update the cache

# import datetime

# # Update the cache's time-to-live (ttl)

# ttl = f"{int(datetime.timedelta(hours=2).total_seconds())}s"
# client.caches.update(
#     name=cache.name, config=types.UpdateCachedContentConfig(ttl=ttl)
# )

# print(f"After update:\n {cache}")

# # Alternatively, update the expire_time directly
# # Update the expire_time directly in valid RFC 3339 format (UTC with a "Z" suffix)

# expire_time = (
#     (
#         datetime.datetime.now(datetime.timezone.utc)
#         + datetime.timedelta(minutes=15)
#     )
#     .isoformat()
#     .replace("+00:00", "Z")
# )

# client.caches.update(
#              name=cache.name,
#             config=types.UpdateCachedContentConfig(expire_time=expire_time),
# )

# # To delete cache

# client.caches.delete(name=cache.name)

In [35]:
# Use the cache for generation
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="What is LlamaParse, How to setup?",
    config=types.GenerateContentConfig(
        cached_content=cache.name,  # reference the cache created earlier by name
        max_output_tokens=1000,
    ),
)

end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

LlamaParse is a state-of-the-art document parsing solution developed by the LlamaIndex team. It's designed to extract structured information from various document formats, making it easier to analyze and utilize content from sources like PDFs, Word documents, and more. 

Here's a breakdown of LlamaParse:

**Key Features:**

* **Powerful Parsing:**  LlamaParse leverages advanced parsing techniques to handle complex document layouts, tables, images, and even nested structures.
* **Accurate Extraction:** It aims to extract meaningful text and data elements from documents, including tables, lists, headings, and other important components. 
* **Customizability:** You can tailor the parsing process to suit your specific needs by configuring options for formatting, extraction rules, and more.

**Setup:**

1. **Sign Up for LlamaCloud:** LlamaParse is part of the LlamaCloud managed service. You can sign up for a free trial on the LlamaCloud website.
2. **Create an API Key:** Once signed up, you'll receive an API key. This key is essential for accessing the LlamaParse service.
3. **Environment Variable:** Set the `LLAMA_CLOUD_API_KEY` environment variable with the API key you received. 
4. **Install the LlamaParse library:** 
   ```bash
   pip install llama-parse 
   ```

**Example Usage:**

```python
from llama_parse import LlamaParse

# Load a PDF
documents = LlamaParse(result_type="markdown").load_data("./data/my_document.pdf")

# Parse a Word document (docx)
documents = LlamaParse(result_type="text").load_data("./data/my_document.docx")

# Parse a webpage
documents = LlamaParse(result_type="json").load_data("https://www.example.com")
```

**Key Points:**

* **`result_type`:** Specifies the output format you want. Options include:
    * `markdown`: For nicely formatted output.
    * `text`:  For plain text output.
    * `json`: For structured JSON output.
* **Free Tier:** You can parse up to 1,000 pages per day for free on LlamaCloud. 

**Benefits of Using LlamaParse:**

* **Increased Accuracy:**  Improved parsing leads to more accurate data extraction.
* **Simplified Workflow:**  It automates a complex process, allowing you to focus on higher-level tasks.
* **Diverse Formats:**  Supports a wide range of document formats. 
* **Scalability:**  Built for handling large volumes of documents efficiently. 

Let me know if you have any other questions. 


time taken:  5.435086727142334


In [30]:
# Show the token usage reported for the last response, including the cached-content token count.
response.usage_metadata

GenerateContentResponseUsageMetadata(cache_tokens_details=[ModalityTokenCount(modality=<MediaModality.TEXT: 'TEXT'>, token_count=212088)], cached_content_token_count=212088, candidates_token_count=844, candidates_tokens_details=None, prompt_token_count=212098, prompt_tokens_details=[ModalityTokenCount(modality=<MediaModality.TEXT: 'TEXT'>, token_count=212098)], thoughts_token_count=None, tool_use_prompt_token_count=None, tool_use_prompt_tokens_details=None, total_token_count=212942)

## First token response time in Streaming

In [31]:
# Approximate the time to first token by capping the answer at a single output token.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump when the system clock is adjusted.
start = time.perf_counter()

response = client.models.generate_content(
    model=model_name,
    contents="How to setup a Router query engine?",
    config=types.GenerateContentConfig(
        cached_content=cache.name,  # reference the cache created earlier by name
        max_output_tokens=1,
    ),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

Here

time taken:  1.902991771697998
