In [13]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Latency benchmark for Context Caching with the Gemini API

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/tvaroska/samples/blob/main/notebook/timing_caching.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Ftvaroska%2Fsamples%2Fmain%2Fnotebooks%2Ftiming_caching.ipynb">
      <img width="32px" src="https://cloud.google.com/ml-engine/images/colab-enterprise-logo-32px.png" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>    
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/tvaroska/samples/notebooks/timing_caching.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/tvaroska/samples/blob/main/notebooks/timing_caching.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

| | |
|-|-|
|Author(s) | [Boris Tvaroska](https://github.com/tvaroska)|

## Overview

### Context Caching

The Gemini API provides the context caching feature for developers to store frequently used input tokens in a dedicated cache and reference them for subsequent requests, eliminating the need to repeatedly pass the same set of tokens to a model. This feature can help reduce the number of tokens sent to the model, thereby lowering the cost of requests that contain repeat content with high input token counts.

### Objectives

In this we will showcase latency improvement of caching large files before the requests.

## Get started

### Install Vertex AI SDK and other required packages


In [2]:
%pip install --upgrade --user --quiet google-cloud-aiplatform

[31mERROR: Can not perform a '--user' install. User site-packages are not visible in this virtualenv.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


### Restart runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.

The restart might take a minute or longer. After it's restarted, continue to the next step.

In [3]:
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [4]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Set Google Cloud project information and initialize Vertex AI SDK

To get started using Vertex AI, you must have an existing Google Cloud project and [enable the Vertex AI API](https://console.cloud.google.com/flows/enableapi?apiid=aiplatform.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

In [5]:
PROJECT_ID = "[your-project]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

import vertexai

vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries

In [1]:
import datetime

from timeit import default_timer as timer
from tqdm.notebook import tqdm

import vertexai
from vertexai.generative_models import Part
from vertexai.preview import caching
from vertexai.preview.generative_models import GenerativeModel

In [2]:
MODEL_ID = "gemini-1.5-pro-001"  # @param {type:"string"}

RUNS = 10 # How many requests to test with

In [3]:
system_instruction = """
You are an expert researcher who has years of experience in conducting systematic literature surveys and meta-analyses of different topics.
You pride yourself on incredible accuracy and attention to detail. You always stick to the facts in the sources provided, and never make up new facts.
Now look at the research paper below, and answer the following questions in 1-2 sentences.
"""

prompt = Part.from_text("What is the research goal shared by these research papers?")

contents = [
    Part.from_uri(
        "gs://cloud-samples-data/generative-ai/pdf/2312.11805v3.pdf",
        mime_type="application/pdf",
    ),
    Part.from_uri(
        "gs://cloud-samples-data/generative-ai/pdf/2403.05530.pdf",
        mime_type="application/pdf",
    ),
]

In [4]:
no_cache_model = GenerativeModel(model_name=MODEL_ID, system_instruction=system_instruction)

In [5]:
no_cache_generate = []

for _ in tqdm(range(RUNS)):
    start = timer()
    no_cache_model.generate_content(contents = [prompt] + contents)
    end = timer()

    no_cache_generate.append(end - start)

avg_no_cache = float(sum(no_cache_generate) / len(no_cache_generate))
print(f"Non-cached version generation time: {avg_no_cache}") # Time in seconds, e.g. 5.38091952400282

  0%|          | 0/10 [00:00<?, ?it/s]

Non-cached version generation time: 30.159632882406004


In [6]:
start = timer()
cached_content = caching.CachedContent.create(
    model_name=MODEL_ID,
    system_instruction=system_instruction,
    contents=contents,
    ttl=datetime.timedelta(minutes=60),
)

end = timer()

cache_storing = end - start

print(f"Cached version store time: {cache_storing}") # Time in seconds, e.g. 5.38091952400282

Cached version store time: 24.736746133014094


In [7]:
cached_model = GenerativeModel.from_cached_content(cached_content=cached_content)

Caching is usefull for multiple interactions with content - we will do 2 questions to the same documents

In [8]:
cache_generate = []

for _ in tqdm(range(RUNS)):
    start = timer()

    response = cached_model.generate_content(
        "What is the research goal shared by these research papers?"
    )

    end = timer()
    cache_generate.append(end - start)

avg_cache = float(sum(cache_generate) / len(cache_generate))
print(f"Cached version generation time: {cache_generate[0]}") # Time in seconds, e.g. 5.38091952400282

  0%|          | 0/10 [00:00<?, ?it/s]

Cached version generation time: 28.236969568999484


In [10]:
start = timer()

cached_content.delete()

end = timer()
cache_delete = end - start
print(f"Delete cache time: {cache_delete}") # Time in seconds, e.g. 5.38091952400282

Deleting CachedContent : projects/745535691203/locations/us-central1/cachedContents/6965414958053982208
Delete cache time: 0.12102052301634103


## Summary

In [12]:
print(f"Time to generate answer without cache : {no_cache_generate}")
print("--------------------------------------------------")
print(f"Time to submit documents to cache     : {cache_storing}")
print(f"Time to generate answer with cache    : {cache_generate}")
print(f"Time to delete cache                  : {cache_delete}")

Time to generate answer without cache : [30.08364355802769, 28.209958852035925, 33.597342441033106, 39.158240033022594, 26.773604198999237, 26.96953611599747, 26.357575650967192, 30.852880140999332, 32.19091235694941, 27.40263547602808]
--------------------------------------------------
Time to submit documents to cache     : 24.736746133014094
Time to generate answer with cache    : [28.236969568999484, 28.84433871298097, 39.46096307603875, 23.559182040044107, 22.769824632967357, 30.726974983001128, 26.36174183798721, 30.48225156898843, 42.67605299997376, 22.163112573034596]
Time to delete cache                  : 0.12102052301634103
