In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Quick start with Model Garden - MedGemma

<table><tbody><tr>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2Fgoogle-health%2Fmedgemma%2Fmain%2Fnotebooks%2Fquick_start_with_model_garden.ipynb">
      <img alt="Google Cloud Colab Enterprise logo" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" width="32px"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/google-health/medgemma/blob/main/notebooks/quick_start_with_model_garden.ipynb">
      <img alt="GitHub logo" src="https://github.githubassets.com/assets/GitHub-Mark-ea2971cee799.png" width="32px"><br> View on GitHub
    </a>
  </td>
</tr></tbody></table>

## Overview

This notebook demonstrates how to use MedGemma in Vertex AI to generate responses from medical text and images using two methods for getting predictions:

* **Online predictions** are synchronous requests that are made to the endpoint deployed from Model Garden and are served with low latency. Online predictions are useful if the responses are being used in production. The cost for online prediction is based on the time a virtual machine spends waiting in an active state (an endpoint with a deployed model) to handle prediction requests.

* **Batch predictions** are asynchronous requests that are run on a set number of inputs specified in a single job. They are made directly to an uploaded model and do not use an endpoint deployed from Model Garden. Batch predictions are useful if you want to generate responses for a large number of inputs for use in training and don't require low latency. The cost for batch prediction is based on the time a virtual machine spends running your prediction job.

Vertex AI makes it easy to serve your model and make it accessible to the world. Learn more about [Vertex AI](https://cloud.google.com/vertex-ai/docs/start/introduction-unified-platform).

### Objectives

- Deploy MedGemma to a Vertex AI Endpoint and get online predictions.
- Upload MedGemma to Vertex AI Model Registry and get batch predictions.

### Costs

This tutorial uses billable components of Google Cloud:

* Vertex AI
* Cloud Storage

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing), [Cloud Storage pricing](https://cloud.google.com/storage/pricing), and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Before you begin

In [None]:
# @title Import packages and define common functions

import datetime
import importlib
import json
import os
import uuid

import google.auth
import google.auth.transport.requests
import openai
from google.cloud import aiplatform, storage
from IPython.display import Image as IPImage, display, Markdown

# Clone the Vertex AI samples repository once; the directory check makes this
# cell safe to re-run without cloning again.
if not os.path.isdir("vertex-ai-samples"):
    ! git clone https://github.com/GoogleCloudPlatform/vertex-ai-samples.git

# The repository path contains hyphens, so it cannot be written in a regular
# `import` statement; import_module takes the dotted path as a string instead.
common_util = importlib.import_module(
    "vertex-ai-samples.community-content.vertex_model_garden.model_oss.notebook_util.common_util"
)

# Registries of the Vertex AI resources used by this notebook. The clean-up
# cell iterates over both to delete them. Re-running this cell resets both to
# empty dictionaries.
models, endpoints = {}, {}

In [None]:
# @title Set up Google Cloud environment

# @markdown #### Prerequisites

# @markdown 1. Make sure that [billing is enabled](https://cloud.google.com/billing/docs/how-to/modify-project) for your project.

# @markdown 2. Make sure that either the Compute Engine API is enabled or that you have the [Service Usage Admin](https://cloud.google.com/iam/docs/understanding-roles#serviceusage.serviceUsageAdmin) (`roles/serviceusage.serviceUsageAdmin`) role to enable the API.

# @markdown This section sets the default Google Cloud project and region, enables the Compute Engine API (if not already enabled), and initializes the Vertex AI API.

# Get the default project ID.
# Raises KeyError if GOOGLE_CLOUD_PROJECT is not set in the notebook runtime.
PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]

# Get the default region for launching jobs.
# Raises KeyError if GOOGLE_CLOUD_REGION is not set in the notebook runtime.
REGION = os.environ["GOOGLE_CLOUD_REGION"]

# Enable the Compute Engine API, if not already.
print("Enabling Compute Engine API.")
! gcloud services enable compute.googleapis.com

# Initialize Vertex AI API.
# Sets the default project and location used by later `aiplatform` calls that
# do not pass them explicitly.
print("Initializing Vertex AI API.")
aiplatform.init(project=PROJECT_ID, location=REGION)

## Get online predictions

In [None]:
# @title Import deployed model

# @markdown To get [online predictions](https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions), you will need a MedGemma [Vertex AI Endpoint](https://cloud.google.com/vertex-ai/docs/general/deployment) that has been deployed from Model Garden. If you have not already done so, go to the [MedGemma model card](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/medgemma) and click "Deploy options > Vertex AI" to deploy the model.

# @markdown **Note:** The examples in this notebook are intended to be used with instruction-tuned variants. Make sure to use an instruction-tuned model variant to run this notebook.

# @markdown This section gets the Vertex AI Endpoint resource that you deployed from Model Garden to use for online predictions.

# @markdown Fill in the endpoint ID and region below. You can find your deployed endpoint on the [Vertex AI online prediction page](https://console.cloud.google.com/vertex-ai/online-prediction/endpoints).

ENDPOINT_ID = ""  # @param {type: "string", placeholder:"e.g. 123456789"}
ENDPOINT_REGION = ""  # @param {type: "string", placeholder:"e.g. us-central1"}

# @markdown Set `use_dedicated_endpoint` if you are using a [dedicated endpoint](https://cloud.google.com/vertex-ai/docs/predictions/choose-endpoint-type) (`True` by default for Model Garden deployments). Uncheck this option for all other endpoint types.

use_dedicated_endpoint = True  # @param {type: "boolean"}

# @markdown Set `is_thinking` to `True` to turn on thinking mode. **Note:** Thinking is supported for the 27B variants only.
is_thinking = False  # @param {type: "boolean"}

# Look up the already-deployed endpoint and register it under the "endpoint"
# key so the prediction cells and the clean-up cell can find it. The endpoint
# is looked up in ENDPOINT_REGION, which may differ from the notebook REGION.
endpoints["endpoint"] = aiplatform.Endpoint(
    endpoint_name=ENDPOINT_ID,
    project=PROJECT_ID,
    location=ENDPOINT_REGION,
)

# Use the endpoint name to check that you are using an appropriate model variant.
# These checks are based on the default endpoint name from the Model Garden
# deployment settings.
# NOTE(review): this is a plain substring test, so any display name that
# contains the letters "pt" (not only pre-trained "-pt" variants) raises here.
# Confirm against the naming convention of your deployments.
ENDPOINT_NAME = endpoints["endpoint"].display_name
if "pt" in ENDPOINT_NAME:
    raise ValueError(
        "The examples in this notebook are intended to be used with "
        "instruction-tuned variants. Please use an instruction-tuned model."
    )

### Run inference on images and text

This section demonstrates running inference on image-based tasks using multimodal variants.

**Note:** Proceed to [Run inference on text only](#scrollTo=lRQDWe2znWn7) if you have selected the 27B text-only variant.

In [None]:
# @title #### Specify image and text inputs

# Check that you are using a multimodal variant
if "text" in ENDPOINT_NAME:
    raise ValueError(
        "You are using a text-only variant which does not support multimodal"
        " inputs. Please proceed to the 'Run inference on text only' section."
    )

system_instruction = "You are an expert radiologist."
prompt = "Describe this X-ray" # @param {type:"string"}
image_url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png" # @param {type:"string"}
! wget -nc -q {image_url}
image_filename = os.path.basename(image_url)

In [None]:
# @title #### Format conversation

# Build the system text in its own variable instead of overwriting
# `system_instruction`. Overwriting it made this cell non-idempotent: every
# re-run prepended the thinking prefix again, and the prefix stayed in place
# after `is_thinking` was switched off.
if "27b" in ENDPOINT_NAME and is_thinking:
    conversation_system_instruction = f"SYSTEM INSTRUCTION: think silently if needed. {system_instruction}"
    # Larger token budget when thinking is enabled.
    max_tokens = 1500
else:
    conversation_system_instruction = system_instruction
    max_tokens = 500

# Chat-completions style conversation: a system turn followed by a user turn
# that carries both the text prompt and the image URL.
messages = [
    {
        "role": "system",
        "content": [{"type": "text", "text": conversation_system_instruction}]
    },
    {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt},
            {"type": "image_url", "image_url": {"url": image_url}}
        ]
    }
]

In [None]:
# @title #### Generate responses using Vertex AI prediction

# @markdown This section shows how to send [chat completions](https://platform.openai.com/docs/api-reference/chat) requests to the endpoint using Vertex AI [prediction](https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions).

# @markdown Click "Show Code" to see more details.

display(Markdown(f"---\n\n**[ User ]**\n\n{prompt}"))
display(IPImage(filename=image_filename, height=300))

# A single chat-completions request; `temperature` 0 is used for the demo.
instances = [
    {
        "@requestFormat": "chatCompletions",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": 0
    },
]

# Extract the text of the first choice from the prediction payload.
response = endpoints["endpoint"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
).predictions["choices"][0]["message"]["content"]

if "27b" in ENDPOINT_NAME and is_thinking:
    # The thinking trace is expected before the "<unused95>" marker and the
    # answer after it. `partition` is used instead of unpacking `split` so a
    # response without the marker (or with more than one) does not raise
    # ValueError; without a marker the whole response is shown as the answer.
    thought, thought_end_marker, answer = response.partition("<unused95>")
    if thought_end_marker:
        thought = thought.replace("<unused94>thought\n", "")
        response = answer
        display(Markdown(f"---\n\n**[ MedGemma thinking ]**\n\n{thought}"))
display(Markdown(f"---\n\n**[ MedGemma ]**\n\n{response}\n\n---"))

In [None]:
# @title #### Generate responses using OpenAI SDK

# @markdown This section shows how to send [chat completions](https://platform.openai.com/docs/api-reference/chat) requests to the endpoint using the OpenAI SDK.

# @markdown Click "Show Code" to see more details.

display(Markdown(f"---\n\n**[ User ]**\n\n{prompt}"))
display(IPImage(filename=image_filename, height=300))

# Refresh the application default credentials to obtain an access token; the
# token is handed to the OpenAI client as its API key below.
# `google.auth.transport.requests` is imported explicitly in the imports cell.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

ENDPOINT_RESOURCE_NAME = endpoints["endpoint"].resource_name

# Dedicated endpoints are addressed through their own DNS name; other endpoint
# types go through the regional Vertex AI host.
if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["endpoint"].gca_resource.dedicated_endpoint_dns
    BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
else:
    BASE_URL = f"https://{ENDPOINT_REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

# NOTE(review): `model` is left empty; presumably the endpoint in BASE_URL
# already determines the model. Confirm against the serving documentation.
model_response = client.chat.completions.create(
    model="",
    messages=messages,
    max_completion_tokens=max_tokens,
    temperature=0,
)
response = model_response.choices[0].message.content

if "27b" in ENDPOINT_NAME and is_thinking:
    # The thinking trace is expected before the "<unused95>" marker and the
    # answer after it. `partition` is used instead of unpacking `split` so a
    # response without the marker (or with more than one) does not raise
    # ValueError; without a marker the whole response is shown as the answer.
    thought, thought_end_marker, answer = response.partition("<unused95>")
    if thought_end_marker:
        thought = thought.replace("<unused94>thought\n", "")
        response = answer
        display(Markdown(f"---\n\n**[ MedGemma thinking ]**\n\n{thought}"))
display(Markdown(f"---\n\n**[ MedGemma ]**\n\n{response}\n\n---"))

### Run inference on text only

This section demonstrates running inference on text-based tasks.

In [None]:
# @title #### Specify text prompt

system_instruction = "You are a helpful medical assistant."
prompt = "How do you differentiate bacterial from viral pneumonia?"  # @param {type:"string"}

# Default token budget; raised below when thinking is enabled on a 27B variant,
# in which case the thinking prefix is also prepended to the system text.
max_tokens = 1000
if "27b" in ENDPOINT_NAME and is_thinking:
    max_tokens = 2000
    system_instruction = "SYSTEM INSTRUCTION: think silently if needed. " + system_instruction

# Text-only conversation: plain string content for both turns.
messages = [
    {"role": "system", "content": system_instruction},
    {"role": "user", "content": prompt},
]

In [None]:
# @title #### Generate responses using Vertex AI prediction

# @markdown This section shows how to send [chat completions](https://platform.openai.com/docs/api-reference/chat) requests to the endpoint using Vertex AI [prediction](https://cloud.google.com/vertex-ai/docs/predictions/get-online-predictions).

# @markdown Click "Show Code" to see more details.

display(Markdown(f"---\n\n**[ User ]**\n\n{prompt}\n\n---"))

# A single chat-completions request; `temperature` 0 is used for the demo.
instances = [
    {
        "@requestFormat": "chatCompletions",
        "messages": messages,
        "max_tokens": max_tokens,
        "temperature": 0
    },
]

# Extract the text of the first choice from the prediction payload.
response = endpoints["endpoint"].predict(
    instances=instances, use_dedicated_endpoint=use_dedicated_endpoint
).predictions["choices"][0]["message"]["content"]

if "27b" in ENDPOINT_NAME and is_thinking:
    # The thinking trace is expected before the "<unused95>" marker and the
    # answer after it. `partition` is used instead of unpacking `split` so a
    # response without the marker (or with more than one) does not raise
    # ValueError; without a marker the whole response is shown as the answer.
    thought, thought_end_marker, answer = response.partition("<unused95>")
    if thought_end_marker:
        thought = thought.replace("<unused94>thought\n", "")
        response = answer
        display(Markdown(f"**[ MedGemma thinking ]**\n\n{thought}\n\n---"))
display(Markdown(f"**[ MedGemma ]**\n\n{response}\n\n---"))

In [None]:
# @title #### Generate responses using OpenAI SDK

# @markdown This section shows how to send [chat completions](https://platform.openai.com/docs/api-reference/chat) requests to the endpoint using the OpenAI SDK.

# @markdown Click "Show Code" to see more details.

display(Markdown(f"---\n\n**[ User ]**\n\n{prompt}\n\n---"))

# Refresh the application default credentials to obtain an access token; the
# token is handed to the OpenAI client as its API key below.
# `google.auth.transport.requests` is imported explicitly in the imports cell.
creds, project = google.auth.default()
auth_req = google.auth.transport.requests.Request()
creds.refresh(auth_req)

ENDPOINT_RESOURCE_NAME = endpoints["endpoint"].resource_name

# Dedicated endpoints are addressed through their own DNS name; other endpoint
# types go through the regional Vertex AI host.
if use_dedicated_endpoint:
    DEDICATED_ENDPOINT_DNS = endpoints["endpoint"].gca_resource.dedicated_endpoint_dns
    BASE_URL = f"https://{DEDICATED_ENDPOINT_DNS}/v1beta1/{ENDPOINT_RESOURCE_NAME}"
else:
    BASE_URL = f"https://{ENDPOINT_REGION}-aiplatform.googleapis.com/v1beta1/{ENDPOINT_RESOURCE_NAME}"

client = openai.OpenAI(base_url=BASE_URL, api_key=creds.token)

# NOTE(review): `model` is left empty; presumably the endpoint in BASE_URL
# already determines the model. Confirm against the serving documentation.
model_response = client.chat.completions.create(
    model="",
    messages=messages,
    max_completion_tokens=max_tokens,
    temperature=0,
)
response = model_response.choices[0].message.content

if "27b" in ENDPOINT_NAME and is_thinking:
    # The thinking trace is expected before the "<unused95>" marker and the
    # answer after it. `partition` is used instead of unpacking `split` so a
    # response without the marker (or with more than one) does not raise
    # ValueError; without a marker the whole response is shown as the answer.
    thought, thought_end_marker, answer = response.partition("<unused95>")
    if thought_end_marker:
        thought = thought.replace("<unused94>thought\n", "")
        response = answer
        display(Markdown(f"**[ MedGemma thinking ]**\n\n{thought}\n\n---"))
display(Markdown(f"**[ MedGemma ]**\n\n{response}\n\n---"))

## Get batch predictions

In [None]:
# @title Get access to MedGemma

# @markdown The prediction container directly loads the model from Hugging Face Hub.

# @markdown To enable access to the MedGemma models, you must provide a Hugging Face User Access Token. You can follow the [Hugging Face documentation](https://huggingface.co/docs/hub/en/security-tokens) to create a **read** access token and specify it in the `HF_TOKEN` field below.

# Forwarded to the serving container as the `HF_TOKEN` environment variable
# when the model is uploaded. Avoid saving or sharing the notebook with a real
# token filled in.
HF_TOKEN = ""  # @param {type:"string", placeholder:"Hugging Face User Access Token"}


In [None]:
# @title Upload model to Vertex AI Model Registry

# @markdown To get [batch predictions](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions), you must first upload the prebuilt MedGemma model to [Vertex AI Model Registry](https://cloud.google.com/vertex-ai/docs/model-registry/introduction). Batch prediction requests are made directly to a model in Model Registry without deploying to an endpoint.

# Note: Batch predictions may not work for the 27B variants so they are not included in the dropdown.
MODEL_VARIANT = "4b-it"  # @param ["4b-it"]

MODEL_ID = "medgemma-" + MODEL_VARIANT

# The pre-built serving docker image.
SERVE_DOCKER_URI = "us-docker.pkg.dev/vertex-ai/vertex-vision-model-garden-dockers/pytorch-vllm-serve:20250430_0916_RC00_maas"

# This notebook uses Nvidia L4 GPUs for demonstration.
# See https://cloud.google.com/vertex-ai/docs/predictions/configure-compute#batch_prediction
# for details on configuring compute for Vertex AI batch predictions.
# Each entry maps a model-size tag to (accelerator type, machine type,
# accelerator count). The first tag found in MODEL_ID wins, in this order.
for _size_tag, _machine_settings in (
    ("4b", ("NVIDIA_L4", "g2-standard-24", 2)),
    ("27b", ("NVIDIA_L4", "g2-standard-48", 4)),
):
    if _size_tag in MODEL_ID:
        accelerator_type, machine_type, accelerator_count = _machine_settings
        break
else:
    # No known size tag matched the model ID.
    raise ValueError(
        f"Recommended machine settings not found for model: {MODEL_ID}."
    )

def upload_model(
    model_name: str,
    model_id: str,
    accelerator_count: int = 1,
    gpu_memory_utilization: float = 0.95,
    max_model_len: int = 32768,
    max_num_seqs: int = 16,
    max_images: int = 16,
) -> aiplatform.Model:
    """Upload a vLLM-served model to Vertex AI Model Registry.

    The serving image is taken from the notebook-level `SERVE_DOCKER_URI` and
    the Hugging Face token from the notebook-level `HF_TOKEN`.

    Args:
        model_name: Display name given to the uploaded model.
        model_id: Model identifier passed to vLLM via `--model` and exposed to
            the container as the `MODEL_ID` environment variable.
        accelerator_count: Value for `--tensor-parallel-size`.
        gpu_memory_utilization: Value for `--gpu-memory-utilization`.
        max_model_len: Value for `--max-model-len`.
        max_num_seqs: Value for `--max-num-seqs`.
        max_images: Per-prompt image limit; only applied when `model_id` does
            not contain "text".

    Returns:
        The `aiplatform.Model` returned by `aiplatform.Model.upload`.
    """
    # Flags that only apply to models accepting image inputs.
    multimodal_flags = [
        f"--limit_mm_per_prompt='image={max_images}'",
        "--mm-processor-kwargs='{\"do_pan_and_scan\": true}'",
    ]

    # Command line used to start the vLLM API server inside the container.
    serving_args = [
        "python",
        "-m",
        "vllm.entrypoints.api_server",
        "--host=0.0.0.0",
        "--port=8080",
        f"--model={model_id}",
        f"--tensor-parallel-size={accelerator_count}",
        "--swap-space=16",
        f"--gpu-memory-utilization={gpu_memory_utilization}",
        f"--max-model-len={max_model_len}",
        f"--max-num-seqs={max_num_seqs}",
        "--enable-chunked-prefill",
        "--disable-log-stats",
    ]
    if "text" not in model_id:
        serving_args += multimodal_flags

    container_env = {
        "MODEL_ID": model_id,
        "DEPLOY_SOURCE": "notebook",
        "VLLM_USE_V1": "0",
        "HF_TOKEN": HF_TOKEN,
    }

    # The container port and routes match the `--port=8080` server started above.
    return aiplatform.Model.upload(
        display_name=model_name,
        serving_container_image_uri=SERVE_DOCKER_URI,
        serving_container_args=serving_args,
        serving_container_ports=[8080],
        serving_container_predict_route="/generate",
        serving_container_health_route="/ping",
        serving_container_environment_variables=container_env,
    )


# Upload the selected variant and register it under the "model" key so the
# batch prediction cells and the clean-up cell can find it. The Hugging Face
# repository ID is formed by prefixing MODEL_ID with "google/".
models["model"] = upload_model(
    model_name=common_util.get_job_name_with_datetime(prefix=MODEL_ID),
    model_id=f"google/{MODEL_ID}",
    accelerator_count=accelerator_count,
)

In [None]:
# @title Set up Google Cloud resources

# @markdown This section sets up a [Cloud Storage bucket](https://cloud.google.com/storage/docs/creating-buckets) for storing batch prediction inputs and outputs and gets the [Compute Engine default service account](https://cloud.google.com/compute/docs/access/service-accounts#default_service_account) which will be used to run the batch prediction jobs.

# @markdown 1. Make sure that you have the following required roles:
# @markdown - [Storage Admin](https://cloud.google.com/iam/docs/understanding-roles#storage.admin) (`roles/storage.admin`) to create and use Cloud Storage buckets
# @markdown - [Service Account User](https://cloud.google.com/iam/docs/understanding-roles#iam.serviceAccountUser) (`roles/iam.serviceAccountUser`) on either the project or the Compute Engine default service account

# @markdown 2. Set up a Cloud Storage bucket.
# @markdown - A new bucket will automatically be created for you.
# @markdown - [Optional] To use an existing bucket, specify the `gs://` bucket URI. The specified Cloud Storage bucket should be located in the same region as where the notebook was launched. Note that a multi-region bucket (e.g. "us") is not considered a match for a single region (e.g. "us-central1") covered by the multi-region range.

BUCKET_URI = ""  # @param {type:"string", placeholder:"[Optional] Cloud Storage bucket URI"}

# Cloud Storage bucket for storing batch prediction artifacts.
# A unique bucket will be created for the purpose of this notebook. If you
# prefer using your own GCS bucket, change the value of BUCKET_URI above.
if BUCKET_URI is None or BUCKET_URI.strip() == "":
    now = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    BUCKET_URI = f"gs://{PROJECT_ID}-tmp-{now}-{str(uuid.uuid4())[:4]}"
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    ! gcloud storage buckets create --location {REGION} {BUCKET_URI}
else:
    assert BUCKET_URI.startswith("gs://"), "BUCKET_URI must start with `gs://`."
    BUCKET_NAME = "/".join(BUCKET_URI.split("/")[:3])
    shell_output = ! gcloud storage buckets describe {BUCKET_NAME} | grep "location:" | sed "s/location://"
    bucket_region = shell_output[0].strip().lower()
    if bucket_region != REGION:
        raise ValueError(
            f"Bucket region {bucket_region} is different from notebook region {REGION}"
        )
print(f"Using this Cloud Storage Bucket: {BUCKET_URI}")

# Service account used for running the prediction container.
# Gets the Compute Engine default service account. If you prefer using your own
# custom service account, change the value of SERVICE_ACCOUNT below.
shell_output = ! gcloud projects describe $PROJECT_ID
project_number = shell_output[-1].split(":")[1].strip().replace("'", "")
SERVICE_ACCOUNT = f"{project_number}-compute@developer.gserviceaccount.com"
print("Using this service account:", SERVICE_ACCOUNT)

### Predict

You can send [batch prediction requests](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#request_a_batch_prediction) to the model using a [JSON Lines](https://jsonlines.org/) file to specify a list of input instances with text prompts and images to generate output. For more details on configuring batch prediction jobs, see how to [format your input data](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#input_data_requirements) and [choose compute settings](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#choose_machine_type_and_replica_count).

In [None]:
# @title Generate responses in batch from images and text

# @markdown This section demonstrates running inference in batch on image-based tasks using multimodal variants.

# @markdown **Note:** Proceed to [Generate responses in batch from text only](#scrollTo=sbMkoiJ161hO) if you have selected the 27B text-only variant.

# @markdown Click "Show Code" to see more details.

# Check that you are using a multimodal variant
if "text" in MODEL_VARIANT:
    raise ValueError(
        "You are using a text-only variant which does not support multimodal "
        "inputs. Please proceed to the 'Generate responses in batch from text "
        "only' section."
    )

batch_predict_instances = [
    {
        "@requestFormat": "chatCompletions",
        "messages": [
            {
                "role": "system",
                "content": [{"type": "text", "text": "You are an expert radiologist."}]
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Describe this X-ray"
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/c/c8/Chest_Xray_PA_3-8-2010.png"}
                    }
                ]
            }
        ],
        "max_tokens": 200
    }
]

# Write instances to JSON Lines file
os.makedirs("batch_predict_input", exist_ok=True)
instances_filename = "multimodal_instances.jsonl"
with open(f"batch_predict_input/{instances_filename}", "w") as f:
  for line in batch_predict_instances:
    json_str = json.dumps(line)
    f.write(json_str)
    f.write("\n")

# Copy the file to Cloud Storage
batch_predict_prefix = f"batch-predict-{MODEL_ID}"
! gcloud storage cp ./batch_predict_input/{instances_filename} {BUCKET_URI}/{batch_predict_prefix}/input/{instances_filename}

batch_predict_job_name = common_util.get_job_name_with_datetime(
    prefix=f"batch-predict-{MODEL_ID}"
)

multimodal_batch_predict_job = models["model"].batch_predict(
    job_display_name=batch_predict_job_name,
    gcs_source=os.path.join(
        BUCKET_URI, batch_predict_prefix, f"input/{instances_filename}"
    ),
    gcs_destination_prefix=os.path.join(
        BUCKET_URI, batch_predict_prefix, "output"
    ),
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    service_account=SERVICE_ACCOUNT,
)

multimodal_batch_predict_job.wait()

print(multimodal_batch_predict_job.display_name)
print(multimodal_batch_predict_job.resource_name)
print(multimodal_batch_predict_job.state)

In [None]:
# @title Generate responses in batch from text only

# @markdown This section demonstrates running inference in batch on text-based tasks.

# @markdown Click "Show Code" to see more details.

batch_predict_instances = [
    {
        "@requestFormat": "chatCompletions",
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful medical assistant."
            },
            {
                "role": "user",
                "content": "How do you differentiate bacterial from viral pneumonia?"
            }
        ],
        "max_tokens": 200
    }
]

# Write instances to JSON Lines file
os.makedirs("batch_predict_input", exist_ok=True)
instances_filename = "text_instances.jsonl"
with open(f"batch_predict_input/{instances_filename}", "w") as f:
  for line in batch_predict_instances:
    json_str = json.dumps(line)
    f.write(json_str)
    f.write("\n")

# Copy the file to Cloud Storage
batch_predict_prefix = f"batch-predict-{MODEL_ID}"
! gcloud storage cp ./batch_predict_input/{instances_filename} {BUCKET_URI}/{batch_predict_prefix}/input/{instances_filename}

batch_predict_job_name = common_util.get_job_name_with_datetime(
    prefix=f"batch-predict-{MODEL_ID}"
)

text_batch_predict_job = models["model"].batch_predict(
    job_display_name=batch_predict_job_name,
    gcs_source=os.path.join(
        BUCKET_URI, batch_predict_prefix, f"input/{instances_filename}"
    ),
    gcs_destination_prefix=os.path.join(
        BUCKET_URI, batch_predict_prefix, "output"
    ),
    machine_type=machine_type,
    accelerator_type=accelerator_type,
    accelerator_count=accelerator_count,
    service_account=SERVICE_ACCOUNT,
)

text_batch_predict_job.wait()

print(text_batch_predict_job.display_name)
print(text_batch_predict_job.resource_name)
print(text_batch_predict_job.state)

In [None]:
# @title #### Get prediction results

# @markdown This section shows an example of [retrieving batch prediction results](https://cloud.google.com/vertex-ai/docs/predictions/get-batch-predictions#retrieve_batch_prediction_results) from the JSON Lines file(s) in the output Cloud Storage location.

# @markdown Click "Show Code" to see more details.

def download_gcs_files_as_json(gcs_files_prefix):
    """Read every blob under a prefix and parse each of its lines as JSON.

    Blobs are listed from the bucket identified by the notebook-level
    `BUCKET_NAME`.

    Args:
        gcs_files_prefix: Object-name prefix passed to `list_blobs`.

    Returns:
        A list holding one parsed JSON value per line, across all matching
        blobs in listing order.
    """
    gcs_client = storage.Client()
    source_bucket = storage.bucket.Bucket.from_string(BUCKET_NAME, gcs_client)
    records = []
    for blob in source_bucket.list_blobs(prefix=gcs_files_prefix):
        # Each blob is read as text, one JSON document per line.
        with blob.open("r") as reader:
            records.extend(json.loads(raw_line) for raw_line in reader)
    return records


# Get results from the first batch prediction job (with multimodal inputs)
# You can replace this variable to get results from another batch prediction job
batch_predict_job = multimodal_batch_predict_job
batch_predict_output_dir = batch_predict_job.output_info.gcs_output_directory
# Strip the `gs://<bucket>/` root so the remainder can be used as an object
# prefix when listing blobs.
batch_predict_output_files_prefix = os.path.join(
    batch_predict_output_dir.replace(f"{BUCKET_NAME}/", ""),
    "prediction.results"
)
batch_predict_results = download_gcs_files_as_json(
    gcs_files_prefix=batch_predict_output_files_prefix
)

# Fail with an actionable message instead of a bare IndexError when the job
# produced no result lines.
if not batch_predict_results:
    raise ValueError(
        "No prediction results were found under "
        f"{batch_predict_output_dir}. Check the batch prediction job for errors."
    )

# Display first batch prediction result
line = batch_predict_results[0]
prediction = line["prediction"]["predictions"]["choices"][0]["message"]["content"]
display(Markdown(prediction))


## Next steps

Explore the other [notebooks](https://github.com/google-health/medgemma/blob/main/notebooks) to learn what else you can do with the model.


## Clean up resources

In [None]:
# @markdown  Delete the experiment models and endpoints to recycle the resources
# @markdown  and avoid unnecessary continuous charges that may incur.

# Undeploy model and delete endpoint.
for endpoint in endpoints.values():
    endpoint.delete(force=True)

# Delete models.
for model in models.values():
    model.delete()

delete_bucket = False  # @param {type:"boolean"}
if delete_bucket:
    ! gsutil -m rm -r $BUCKET_NAME