# Install Packages and Setup Variables


In [1]:
!pip install -q google-generativeai==0.5.4 llama-index-llms-gemini==0.3.7 llama-index==0.11.23 openai==1.59.6

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/150.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m454.8/454.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.1/679.1 kB[0m [31m23.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.0/241.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import time
from IPython.display import Markdown, display

# Publish the API keys through the process environment; they are used later.
# OpenAI provides the embedding model and Gemini-1.5-flash is the LLM.
os.environ.update(
    {
        "OPENAI_API_KEY": "<YOUR_OPENAI_KEY>",
        "GOOGLE_API_KEY": "<YOUR_API_KEY>",
    }
)

# Alternative for Google Colab: fetch the keys through `userdata` instead.
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
# os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

# Load Dataset


## Download


The dataset includes a subset of the documentation of the LlamaIndex library.


In [3]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    350      0 --:--:-- --:--:-- --:--:--   351
100  570k  100  570k    0     0  1258k      0 --:--:-- --:--:-- --:--:-- 1258k


## Read File and create LlamaIndex Documents


In [4]:
from llama_index.core import Document
import json


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSON Lines file.

    Every non-blank line is parsed as one JSON object. A record that lacks
    any of the required keys is reported with ``print`` and skipped.

    Args:
        input_file: Path of the ``.jsonl`` file to read.

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # A blank line (for instance a trailing one) is not a record and
            # would make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Convert the texts to Document objects.
documents = create_docs("llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


# Generate Embedding


In [5]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Embeddings come from OpenAI's text-embedding-3-small model.
embedding_model = OpenAIEmbedding(model="text-embedding-3-small")
# The splitter is configured with chunk_size=512 and chunk_overlap=128.
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

# Build the vector index (and with it the embeddings) from the documents.
index = VectorStoreIndex.from_documents(
    documents,
    transformations=[sentence_splitter],
    embed_model=embedding_model,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/447 [00:00<?, ?it/s]

# Query Dataset


In [6]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.

from llama_index.llms.gemini import Gemini

# Gemini 1.5 Flash is the LLM that formulates the final answer.
llm = Gemini(
    model="models/gemini-1.5-flash",
    temperature=1,
    max_tokens=1000,
)

# The query engine is derived from the index and configured with
# similarity_top_k=10.
query_engine = index.as_query_engine(similarity_top_k=10, llm=llm)

In [7]:
# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock,
# which can be adjusted while the query runs.
start = time.perf_counter()

response = query_engine.query("How to setup a query engine in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

There are several ways to set up a query engine, depending on your needs and data.  If you know which tables to query beforehand and the combined size of the schema and prompt fits within your context window, use a query engine directly.  If the table schema is too large for the context window, create a table schema index using  `SQLTableNodeMapping` and `ObjectIndex`, incorporating a `VectorStoreIndex`.  For natural language SQL queries, use `NLSQLTableQueryEngine`, specifying the relevant tables.  For more general querying, use the `index.as_query_engine()` method.  More complex scenarios might involve constructing a `RetrieverQueryEngine` with custom retrievers, postprocessors (like `SimilarityPostprocessor`), and response synthesizers.  Alternatively, a `SubQuestionQueryEngine` can be used for multi-document queries,  creating a `QueryEngineTool` for each index and using them to generate sub-queries.  Finally, for routing queries to different data sources, use a `RouterQueryEngine` with appropriate `QueryEngineTool`s for each sub-index.


time taken:  3.9134440422058105


In [8]:
# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock,
# which can be adjusted while the query runs.
start = time.perf_counter()

response = query_engine.query("How to setup an agent in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

To create an agent, begin by importing necessary components from LlamaIndex and loading environment variables from a `.env` file.  Then, define basic tools such as functions for multiplication and addition, creating `FunctionTool` objects from them.  Next, initialize the large language model (LLM), for example, using `OpenAI(model="gpt-3.5-turbo", temperature=0)`.  Finally, create the agent using `ReActAgent.from_tools([multiply_tool, add_tool], llm=llm, verbose=True)`, providing the tools and LLM.  For local models, install Ollama and use `Ollama(model="mixtral:8x7b", request_timeout=120.0)` instead of OpenAI.


time taken:  2.789015293121338


# Setup Long Context Caching


For this section, we will be using the Gemini API.


Note: You might encounter dependency issues, which may require restarting the session (delete the runtime and reinstall the packages). Long-context caching requires google-generativeai version 0.7.2 or higher, so please reinstall google-generativeai at a newer version; the install cell below pins 0.8.3.

In [1]:
!pip install -q google-generativeai==0.8.3 llama-index==0.12.12 llama-index-llms-gemini==0.4.4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m241.0/241.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os

# Publish the Gemini API key through the process environment.
os.environ.update({"GOOGLE_API_KEY": "<YOUR_API_KEY>"})

# Alternative for Google Colab: fetch the key through `userdata` instead.
# from google.colab import userdata
# os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

In [3]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    425      0 --:--:-- --:--:-- --:--:--   425
100  570k  100  570k    0     0  1324k      0 --:--:-- --:--:-- --:--:-- 1324k


In [4]:
import time
import json
from llama_index.core import Document
from IPython.display import Markdown, display


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSON Lines file.

    Every non-blank line is parsed as one JSON object. A record that lacks
    any of the required keys is reported with ``print`` and skipped.

    Args:
        input_file: Path of the ``.jsonl`` file to read.

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []

    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # default text encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # A blank line (for instance a trailing one) is not a record and
            # would make json.loads raise.
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Convert the texts to Document objects.
documents = create_docs("llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


In [5]:
import os
import google.generativeai as genai
from google.generativeai import caching
from google.generativeai import GenerationConfig

# Pass the key stored in the environment to the Gemini SDK.
google_api_key = os.environ["GOOGLE_API_KEY"]
genai.configure(api_key=google_api_key)

Convert the JSONL file to a plain-text file that can be uploaded to the Gemini API.

In [6]:
import json


def create_text_file(input_file: str, output_file: str) -> None:
    """Write the ``content`` field of every JSONL record to a text file.

    Each non-blank line of ``input_file`` is parsed as a JSON object and its
    ``"content"`` value is written to ``output_file`` followed by a blank
    line. A confirmation message is printed once the file is written.

    Args:
        input_file: Path of the ``.jsonl`` file to read.
        output_file: Path of the text file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"content"`` key.
    """
    # Both files are opened as UTF-8 explicitly so the result does not depend
    # on the platform's default text encoding.
    with open(input_file, "r", encoding="utf-8") as f, open(
        output_file, "w", encoding="utf-8"
    ) as out:
        for line in f:
            # A blank line (for instance a trailing one) is not a record and
            # would make json.loads raise.
            if not line.strip():
                continue
            data = json.loads(line)
            out.write(data["content"] + "\n\n")  # Add two newlines between documents

    print(f"Contents saved to {output_file}")


# Flatten the downloaded JSONL dataset into a single text file.
create_text_file(
    input_file="llama_index_150k.jsonl",
    output_file="llama_index_contents.txt",
)

Contents saved to llama_index_contents.txt


In [7]:
model_name = "gemini-1.5-flash-001"

# Upload the flattened documentation file through the Gemini SDK.
document = genai.upload_file(path="llama_index_contents.txt")

# Create the cached content from the uploaded file and the system instruction.
cache = caching.CachedContent.create(
    contents=[document],
    system_instruction="You answer questions about the LlamaIndex framework.",
    model=model_name,
)

In [8]:
import time
# Create the GenerativeModel from the CachedContent object `cache`.
model = genai.GenerativeModel.from_cached_content(cache)

## Response Generation

In [9]:
# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock,
# which can be adjusted while the request runs.
start = time.perf_counter()
response = model.generate_content(
    "What is LlamaParse, How to setup?",
    generation_config=GenerationConfig(max_output_tokens=1000),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

LlamaParse is a document parsing engine that is part of LlamaCloud. It offers state-of-the-art document parsing capabilities, specifically designed to extract structured data from various document formats like PDFs, HTML, and others.  

Here's a breakdown of LlamaParse and its setup:

**What LlamaParse Does**

* **Document Extraction:** It can extract text, tables, and other relevant information from documents, even if they have complex formatting or layouts. This is important for making the information inside of documents accessible for LLMs.
* **Structured Output:** It can output the extracted information in a standardized and structured format, such as JSON, Markdown, or CSV. This makes it easier to integrate the parsed data with your LLM applications. 
* **Handling Complex Documents:** It can handle various document types, including PDFs with complex layouts, nested tables, and other challenging structures.
* **Advanced Parsing Options:** It provides additional features like keyword extraction, topic extraction, and sentiment analysis for enriching the parsed data.

**Setting Up LlamaParse**

1. **Sign up for LlamaCloud:** You need a LlamaCloud account to access LlamaParse. Sign up for free at [https://cloud.llamaindex.ai/](https://cloud.llamaindex.ai/).  
2. **Obtain Your API Key:** After signing up, you will receive a LlamaCloud API key. You'll need this key to access the LlamaParse API.
3. **Install the Library:**  Use pip to install the LlamaParse library in your Python environment:

   ```bash
   pip install llama-parse
   ```
4. **Set Environment Variable:** Configure your environment to use the LlamaCloud API key: 

   ```bash
   export LLAMA_CLOUD_API_KEY="your_llamacloud_api_key"
   ```
5. **Start Using LlamaParse:** 

   ```python
   from llama_parse import LlamaParse

   # Parse a PDF into Markdown
   documents = LlamaParse(result_type="markdown").load_data("./path/to/your/document.pdf")

   # Parse a PDF into JSON
   documents = LlamaParse(result_type="json").load_data("./path/to/your/document.pdf")

   # You can also provide additional options for parsing
   documents = LlamaParse(result_type="markdown", language="french").load_data("./path/to/your/document.pdf")
   ```

**Important Considerations**

* **Free Tier:** LlamaCloud has a free tier that includes a certain amount of parsing credits per day. For larger-scale parsing, you can upgrade to a paid plan. 
* **Self-Hosted Option:** If you require higher levels of privacy or custom deployment, LlamaParse can also be self-hosted. 
* **LlamaHub Integration:** You can easily integrate LlamaParse with LlamaHub to access a wide variety of data connectors and extend the functionality of your LLM applications. 

Let me know if you have any other questions! 


time taken:  6.802876234054565


In [10]:
# Show the token counts reported for the previous response (prompt, cached
# content, candidates, total). As the cell's last expression it is rendered
# as the cell output.
response.usage_metadata

prompt_token_count: 212097
cached_content_token_count: 212087
candidates_token_count: 640
total_token_count: 212737

## First token response time in Streaming

In [11]:
# max_output_tokens=1 limits the answer to a single token, so the elapsed time
# approximates how long the first token takes to arrive.
# perf_counter() is the clock intended for measuring elapsed time: it is
# monotonic and high-resolution, whereas time.time() follows the wall clock.
start = time.perf_counter()
response = model.generate_content(
    "How to setup a Router query engine?",
    generation_config=GenerationConfig(max_output_tokens=1),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

Let

time taken:  2.8402929306030273
