# Install Packages and Setup Variables


In [None]:
!pip install -q google-generativeai==0.5.4 llama-index-llms-gemini==0.3.7 llama-index openai

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/150.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.1/679.1 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import os
import time
from IPython.display import Markdown, display

# Export the API keys that the rest of the notebook reads from the environment:
#   - OPENAI_API_KEY -> OpenAI embedding model
#   - GOOGLE_API_KEY -> Gemini-1.5-flash LLM
# Replace the placeholder values with your own keys.
os.environ.update(
    {
        "OPENAI_API_KEY": "<YOUR_OPENAI_KEY>",
        "GOOGLE_API_KEY": "<YOUR_API_KEY>",
    }
)

# Alternative for Google Colab: read the keys from the notebook's stored user data.
# from google.colab import userdata
# os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')
# os.environ["GOOGLE_API_KEY"] = userdata.get('Google_api_key')

# Load Dataset


## Download


The dataset contains a subset of the LlamaIndex library documentation, stored as one JSON record per line (JSONL).


In [None]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    472      0 --:--:-- --:--:-- --:--:--   473
100  570k  100  570k    0     0   922k      0 --:--:-- --:--:-- --:--:--  922k


## Read File and create LlamaIndex Documents


In [None]:
from llama_index.core import Document
import json


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSONL file.

    Every non-blank line of ``input_file`` is parsed as one JSON record.
    Records that lack any of the required keys (``doc_id``, ``content``,
    ``url``, ``name``, ``tokens``, ``source``) are reported on stdout and
    skipped.

    Args:
        input_file: Path to the JSONL file (one JSON object per line).

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys excluded from the LLM-facing text.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys excluded from the text that gets embedded.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL file into Document objects and report the count.
documents = create_docs(input_file="llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


# Generate Embedding


In [None]:
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.embeddings.openai import OpenAIEmbedding


# Split the documents with SentenceSplitter(chunk_size=512, chunk_overlap=128),
# embed the resulting nodes with OpenAI's text-embedding-3-small model, and
# collect them in a VectorStoreIndex.
embedding_model = OpenAIEmbedding(model="text-embedding-3-small")
sentence_splitter = SentenceSplitter(chunk_size=512, chunk_overlap=128)

index = VectorStoreIndex.from_documents(
    documents,
    embed_model=embedding_model,
    transformations=[sentence_splitter],
    show_progress=True,
)

Parsing nodes:   0%|          | 0/56 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/447 [00:00<?, ?it/s]

# Query Dataset


In [None]:
# Define a query engine that is responsible for retrieving related pieces of text,
# and using a LLM to formulate the final answer.

from llama_index.llms.gemini import Gemini

# Gemini writes the final answer (at most 1000 output tokens).
llm = Gemini(
    model="models/gemini-1.5-flash",
    temperature=1,
    max_tokens=1000,
)

# The engine retrieves the 10 most similar chunks per query and passes them to the LLM.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=10,
)

In [None]:
# Time the full query (retrieval + answer generation). perf_counter() is a
# monotonic clock, so the interval cannot be skewed by system clock changes.
start = time.perf_counter()

response = query_engine.query("How to setup a query engine in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

To create a query engine, first create an index and then use the `as_query_engine()` method. 


time taken:  3.605879545211792


In [None]:
# Time the full query (retrieval + answer generation). perf_counter() is a
# monotonic clock, so the interval cannot be skewed by system clock changes.
start = time.perf_counter()

response = query_engine.query("How to setup an agent in code?")

end = time.perf_counter()

display(Markdown(response.response))
print("time taken: ", end - start)

You can create an agent by using the `ReActAgent` class.  First, you should define a list of tools that the agent can use. Next, create an `OpenAI` object to act as the language model, specifying a model such as "gpt-3.5-turbo" with a temperature of 0. Finally, initialize the `ReActAgent` with the list of tools and the LLM, setting the `verbose` parameter to `True` to see the agent's thought process. 


time taken:  2.861610174179077


# Setup Long Context Caching


For this section, we will use the Gemini API directly through the `google-generativeai` SDK.


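Note: Long-context caching requires google-generativeai version 0.7.2 or higher, so the next cell reinstalls the package at a newer version. You might encounter dependency issues after the upgrade, which may require restarting the session. For that reason, the cells below repeat the imports, the API key setup, and the dataset download so that this section can run on its own.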

In [None]:
!pip install -q google-generativeai==0.8.3 llama-index

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.0/189.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.5/49.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os

# Export the Gemini API key; replace the placeholder value with your own key.
os.environ.update({"GOOGLE_API_KEY": "<YOUR_API_KEY>"})

In [3]:
!curl -L -o ./llama_index_150k.jsonl https://huggingface.co/datasets/towardsai-buster/llama-index-docs/raw/main/llama_index_data_150k.jsonl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   115  100   115    0     0    153      0 --:--:-- --:--:-- --:--:--   153
100  570k  100  570k    0     0   340k      0  0:00:01  0:00:01 --:--:--  340k


In [None]:
import time
import json
from llama_index.core import Document
from IPython.display import Markdown, display


def create_docs(input_file: str) -> list[Document]:
    """Build LlamaIndex ``Document`` objects from a JSONL file.

    Every non-blank line of ``input_file`` is parsed as one JSON record.
    Records that lack any of the required keys (``doc_id``, ``content``,
    ``url``, ``name``, ``tokens``, ``source``) are reported on stdout and
    skipped.

    Args:
        input_file: Path to the JSONL file (one JSON object per line).

    Returns:
        One ``Document`` per valid record, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    required_keys = {"doc_id", "content", "url", "name", "tokens", "source"}
    documents = []
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        for idx, line in enumerate(f, start=1):
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue

            data = json.loads(line)

            if not required_keys.issubset(data):
                print(f"Missing keys in line {idx}: {required_keys - set(data)}")
                continue

            documents.append(
                Document(
                    doc_id=data["doc_id"],
                    text=data["content"],
                    metadata={  # type: ignore
                        "url": data["url"],
                        "title": data["name"],
                        "tokens": data["tokens"],
                        "source": data["source"],
                    },
                    # Metadata keys excluded from the LLM-facing text.
                    excluded_llm_metadata_keys=[
                        "title",
                        "tokens",
                        "source",
                    ],
                    # Metadata keys excluded from the text that gets embedded.
                    excluded_embed_metadata_keys=[
                        "url",
                        "tokens",
                        "source",
                    ],
                )
            )

    return documents


# Parse the downloaded JSONL file into Document objects and report the count.
documents = create_docs(input_file="llama_index_150k.jsonl")
print(f"Number of documents: {len(documents)}")


Number of documents: 56


In [5]:
import os
import google.generativeai as genai
from google.generativeai import caching
from google.generativeai import GenerationConfig

# Hand the API key from the environment to the google-generativeai SDK.
# Raises KeyError if GOOGLE_API_KEY has not been set.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

Convert the JSONL file to a single plain-text file that can be uploaded to the Gemini API.

In [None]:
import json


def create_text_file(input_file: str, output_file: str) -> None:
    """Concatenate the ``content`` field of every JSONL record into a text file.

    Each non-blank line of ``input_file`` is parsed as a JSON object and its
    ``content`` value is written to ``output_file`` followed by a blank line.
    A confirmation message is printed when the file has been written.

    Args:
        input_file: Path to the JSONL file (one JSON object per line).
        output_file: Path of the text file to create (overwritten if present).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``content`` key.
    """
    # JSON text is UTF-8; an explicit encoding on both files keeps the result
    # independent of the platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f, open(
        output_file, "w", encoding="utf-8"
    ) as out:
        for line in f:
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            out.write(data["content"] + "\n\n")  # Add two newlines between documents

    print(f"Contents saved to {output_file}")


# Flatten the dataset into one text file for upload.
create_text_file(
    input_file="llama_index_150k.jsonl",
    output_file="llama_index_contents.txt",
)

Contents saved to llama_index_contents.txt


In [7]:
# Upload the concatenated documentation, then create a CachedContent entry that
# bundles the uploaded file with the system instruction for the chosen model.
model_name = "gemini-1.5-flash-001"
document = genai.upload_file(path="llama_index_contents.txt")

cache = caching.CachedContent.create(
    model=model_name,
    contents=[document],
    system_instruction="You answer questions about the LlamaIndex framework.",
)

In [8]:
import time
# Build the GenerativeModel from the CachedContent object (`cache`), so the
# requests below are made against the cached content.
model = genai.GenerativeModel.from_cached_content(cache)

## Response Generation

In [15]:
# Time one complete (non-streaming) generation against the cached context.
# perf_counter() is a monotonic clock, so the interval cannot be skewed by
# system clock changes.
start = time.perf_counter()
response = model.generate_content(
    "What is LlamaParse, How to setup?",
    generation_config=GenerationConfig(max_output_tokens=1000),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

LlamaParse is a state-of-the-art document parsing solution developed by LlamaIndex. It is a powerful tool for extracting structured information from unstructured documents like PDFs, Word files, and HTML pages. 

Here's a breakdown of LlamaParse and how to set it up:

**What is LlamaParse?**

* **Document Understanding:** LlamaParse goes beyond simple text extraction. It leverages advanced techniques like natural language processing (NLP) and machine learning to understand the document's structure, identify key entities, and extract relevant information. 
* **Structured Data Extraction:** LlamaParse aims to convert unstructured documents into structured data, making it easier to process, analyze, and query. This can include things like tables, lists, headings, and even text formatting.
* **Data Integration:** The extracted structured data from LlamaParse can be easily integrated into other systems or databases for further analysis or use in LLM applications.

**How to Setup LlamaParse**

LlamaParse is currently offered as a managed service through **LlamaCloud**. You can use it either as a self-hosted solution or utilize their hosted service.

**1. Sign Up for LlamaCloud:**
   - Visit the LlamaCloud website: [https://cloud.llamaindex.ai/](https://cloud.llamaindex.ai/)
   - Sign up for a free account. You get a certain number of free parsing pages per day.

**2. Generate API Key:**
   - Once you are logged in, navigate to the "API Keys" section.
   - Generate a new API key.

**3. Install the LlamaParse library:**
   - Use pip to install the LlamaParse library:
     ```bash
     pip install llama-parse
     ```

**4. Usage Example:**

   ```python
   from llama_parse import LlamaParse

   # Replace with your LlamaCloud API key
   os.environ["LLAMA_CLOUD_API_KEY"] = "your_api_key"

   # Load a PDF file
   documents = LlamaParse(result_type="markdown").load_data(
       "./your_pdf_file.pdf"
   )

   # Process the extracted data
   for document in documents:
       print(document.text)
   ```

**Key Points:**

* **Result Type:** LlamaParse offers various `result_type` options:
    - `markdown`: Provides a formatted markdown version of the document.
    - `json`:  Returns a JSON representation of the structured data.
    - `text`: Extracts the plain text content of the document.
    - `structured`:  Provides a complex structured representation of the document. 
* **Customization:** You can customize LlamaParse to tailor its behavior using various parameters like `result_type`, `parser`, `table_parser`, and more.  Refer to the LlamaParse [documentation](https://docs.cloud.llamaindex.ai/llamaparse/getting_started) for a complete overview.

Let me know if you have any more questions about LlamaParse.


time taken:  10.533137798309326


In [16]:
# Show the token accounting of the last response; compare
# cached_content_token_count with prompt_token_count to see how much of the
# prompt was served from the cache.
response.usage_metadata

prompt_token_count: 212097
cached_content_token_count: 212087
candidates_token_count: 646
total_token_count: 212743

## First token response time in Streaming

In [11]:
# Approximate the time to first token without the streaming API:
# max_output_tokens=1 caps the output at a single token, so the elapsed time
# is close to what a streaming client would wait before seeing output.
# perf_counter() is a monotonic clock, so the interval cannot be skewed by
# system clock changes.
start = time.perf_counter()
response = model.generate_content(
    "How to setup a Router query engine?",
    generation_config=GenerationConfig(max_output_tokens=1),
)
end = time.perf_counter()
display(Markdown(response.text))
print("time taken: ", end - start)

A

time taken:  5.295310974121094
