In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini: An Overview of Multimodal Use Cases

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fintro_multimodal_use_cases.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            


| | |
|-|-|
|Author(s) | [Saeed Aghabozorgi](https://github.com/saeedaghabozorgi) |

## Overview

In this notebook, you will explore a variety of different use cases enabled by multimodality with Gemini 1.5 Flash.

### Gemini

Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini 1.0 Pro Vision, Gemini 1.0 Pro, Gemini 1.5 Pro and Gemini 1.5 Flash models.

### Gemini API in Vertex AI

The Gemini API in Vertex AI provides a unified interface for interacting with Gemini models. There are currently four models available in the Gemini API:

- **Gemini 1.0 Pro model** (`gemini-1.0-pro`): Designed to handle natural language tasks, multiturn text and code chat, and code generation.
- **Gemini 1.0 Pro Vision model** (`gemini-1.0-pro-vision`): Supports multimodal prompts. You can include text, images, and video in your prompt requests and get text or code responses.
- **Gemini 1.5 Pro model** (`gemini-1.5-pro`): A foundation model that performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video.
- **Gemini 1.5 Flash model** (`gemini-1.5-flash`): A purpose-built multimodal model that provides speed and efficiency for high-volume, quality, cost-effective apps.

For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation.


### Objectives

This notebook demonstrates a variety of multimodal use cases that Gemini can be used for.

#### Multimodal use cases

Compared to text-only LLMs, Gemini 1.5's multimodality can be used for many new use cases:

Example use cases with **text and image(s)** as input:

- Detecting objects in photos
- Understanding screens and interfaces
- Understanding of drawing and abstraction
- Understanding charts and diagrams
- Recommendation of images based on user preferences
- Comparing images for similarities, anomalies, or differences

Example use cases with **text and video** as input:

- Generating a video description
- Extracting tags of objects throughout a video
- Extracting highlights/messaging of a video


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Vertex AI SDK for Python


In [None]:
# Install (or upgrade to) the latest Vertex AI SDK for Python for the current user.
# The runtime has to be restarted afterwards so the new version can be imported.
%pip install --upgrade --user google-cloud-aiplatform

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# The packages installed above only become importable after the kernel restarts.
import IPython

# Ask the running kernel application to shut down with restart=True;
# all notebook state is discarded when the kernel comes back up.
IPython.Application.instance().kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).


In [None]:
import sys

# Only Google Colab needs this extra step. Colab is detected by checking whether
# the `google.colab` package is already loaded in this interpreter.
if "google.colab" in sys.modules:
    # Imported inside the branch because the module is only available on Colab.
    from google.colab import auth

    # Authenticate the current user to Google Cloud.
    auth.authenticate_user()

### Define Google Cloud project information and initialize Vertex AI

Initialize the Vertex AI SDK for Python for your project:

In [1]:
# Vertex AI SDK for Python.
import vertexai

# Project and region used for every Vertex AI call made from this notebook.
# The "@param" markers let Colab render these two values as editable form fields.

# Replace "[your project here]" with your actual Google Cloud project ID.
PROJECT_ID = "[your project here]"  # @param {type:"string"}

# Replace "[your location here]" with the region to use (e.g., "us-central1").
LOCATION = "[your location here]"  # @param {type:"string"}

# Point the SDK at the project and region chosen above.
vertexai.init(project=PROJECT_ID, location=LOCATION)

### Import libraries


In [2]:
# Import necessary classes from the vertexai.generative_models module.
from vertexai.generative_models import (
    GenerationConfig,  # Used to configure the generation process (e.g., temperature, top_k).
    GenerativeModel,   # Represents a generative model (e.g., Gemini Pro, Gemini Pro Vision).
    Image,             # Represents an image, used for multimodal models.
    Part              # Represents a part of a multimodal input (can be text or image).
)

## Use the Gemini 1.5 Flash model

Gemini 1.5 Flash (`gemini-1.5-flash`) is a multimodal model that supports multimodal prompts. You can include text, image(s), and video in your prompt requests and get text or code responses.


### Load Gemini 1.5 Flash model


In [3]:
# Load Gemini 1.5 Flash by its model identifier.
# `multimodal_model` is the single model instance that the example cells in this
# notebook call, so this cell has to run before any of them.
multimodal_model = GenerativeModel("gemini-1.5-flash")

### Define helper functions


In [4]:
import http.client  # Provides classes for working with HTTP connections (used for downloading image data).
import typing  # Used for type hinting, which helps with code readability and error checking.
import urllib.request  # Used for fetching data from URLs.

import IPython.display  # Used for displaying rich content (images, videos, etc.) in Jupyter environments.
from PIL import Image as PIL_Image  # PIL (Pillow) is a library for image manipulation. We import Image as PIL_Image to avoid naming conflicts.
from PIL import ImageOps as PIL_ImageOps  # ImageOps provides helpful image operations like resizing while maintaining aspect ratio.


def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    """
    Renders each image inline, shrinking any image that exceeds the size limits.

    Args:
        images: Vertex AI Image objects to show, in display order.
        max_width: Widest an image may be (in pixels) before it is scaled down.
        max_height: Tallest an image may be (in pixels) before it is scaled down.
    """
    bounding_box = (max_width, max_height)

    for vertex_image in images:
        # The Vertex AI Image wraps a PIL image; it is reached through a private
        # attribute, hence the cast to give it a concrete type.
        picture = typing.cast(PIL_Image.Image, vertex_image._pil_image)

        # Normalize every other colour mode (palette, RGBA, greyscale, ...) to RGB.
        if picture.mode != "RGB":
            picture = picture.convert("RGB")

        width, height = picture.size
        fits_in_box = width <= max_width and height <= max_height
        if not fits_in_box:
            # contain() scales to fit inside the box while keeping the aspect ratio.
            picture = PIL_ImageOps.contain(picture, bounding_box)

        IPython.display.display(picture)


def get_image_bytes_from_url(image_url: str) -> bytes:
    """
    Fetches the resource at a URL and returns its raw content.

    Args:
        image_url: The URL of the image to download.

    Returns:
        The complete response body as bytes.
    """
    with urllib.request.urlopen(image_url) as raw_response:
        # The cast only informs type checkers; it does nothing at runtime.
        http_response = typing.cast(http.client.HTTPResponse, raw_response)
        # Read the whole body while the connection is still open.
        return http_response.read()


def load_image_from_url(image_url: str) -> Image:
    """
    Downloads an image and wraps it in a Vertex AI Image object.

    Args:
        image_url: The URL of the image to load.

    Returns:
        A Vertex AI Image built from the downloaded bytes.
    """
    # Download with the helper defined earlier, then hand the bytes to the SDK.
    return Image.from_bytes(get_image_bytes_from_url(image_url))


def display_content_as_image(content: str | Image | Part) -> bool:
    """
    Displays the content inline when it is a Vertex AI Image.

    Args:
        content: A prompt item: a string, a Vertex AI Image, or a Vertex AI Part.

    Returns:
        True if the content was a Vertex AI Image (and was displayed),
        False for anything else (nothing is displayed).
    """
    if isinstance(content, Image):
        display_images([content])
        return True
    return False


def display_content_as_video(content: str | Image | Part) -> bool:
    """
    Displays the content as an inline video if it is a Vertex AI Part that references a video file.

    Args:
        content: The content to check and potentially display.
                It can be either a string, a Vertex AI Image, or a Vertex AI Part.

    Returns:
        True if the content was a Part referencing a video file and a player was displayed.
        False if the content is not a Part, if the Part carries no file URI
        (e.g. a text Part), or if its MIME type is set to something other than "video/*".
    """
    if not isinstance(content, Part):
        return False

    file_data = content.file_data
    file_uri = file_data.file_uri
    mime_type = file_data.mime_type

    # A Part built from text or inline bytes is expected to expose a blank
    # file_uri here (TODO confirm against the SDK); without a URI there is
    # nothing to play, so let the caller fall back to printing the Part.
    if not file_uri:
        return False

    # Parts can reference non-video files too (images, PDFs, audio); do not
    # render those in a video player. An unset MIME type is given the benefit
    # of the doubt so that previously working inputs keep working.
    if mime_type and not mime_type.startswith("video/"):
        return False

    if file_uri.startswith("gs://"):
        # "gs://bucket/path/video.mp4" -> "https://storage.googleapis.com/bucket/path/video.mp4"
        # NOTE(review): this URL is only playable when the object is publicly readable.
        video_url = "https://storage.googleapis.com/" + file_uri.removeprefix("gs://")
    else:
        # Already a web URL (or another scheme): use it unchanged instead of
        # prepending the Cloud Storage host to it.
        video_url = file_uri

    IPython.display.display(IPython.display.Video(video_url, width=600))
    return True


def print_multimodal_prompt(contents: list[str | Image | Part]):
    """
    Shows every item of a multimodal prompt in the form that suits it best.

    Args:
        contents: The prompt items sent to the Gemini model, in order.
                Each item can be a string (text), a Vertex AI Image, or a Vertex AI Part.
    """
    for item in contents:
        # Try the image renderer first, then the video renderer; `or` stops at
        # the first one that reports having displayed the item.
        rendered = display_content_as_image(item) or display_content_as_video(item)
        if not rendered:
            # Neither renderer handled it, so treat the item as plain text.
            print(item)

## Image understanding across multiple images

One of the capabilities of Gemini is being able to reason across multiple images.

This is an example of using Gemini to calculate the total cost of groceries using an image of fruits and a price list:


In [None]:
# Sample images hosted on Cloud Storage: a photo of fruits and a photo of a price list.
image_grocery_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/banana-apple.jpg"
image_prices_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pricelist.jpg"

# Download both images and wrap them as Vertex AI Image objects.
image_grocery = load_image_from_url(image_grocery_url)
image_prices = load_image_from_url(image_prices_url)

# Text segments of the prompt. prompt2 spells out the reasoning steps
# (identify, count, look up the price, subtotal, total) the model should follow.
instructions = "Instructions: Consider the following image that contains fruits:"
prompt1 = "How much should I pay for the fruits given the following price list?"
prompt2 = """
Answer the question through these steps:
Step 1: Identify what kind of fruits there are in the first image.
Step 2: Count the quantity of each fruit.
Step 3: For each grocery in first image, check the price of the grocery in the price list.
Step 4: Calculate the subtotal price for each type of fruit.
Step 5: Calculate the total price of fruits using the subtotals.

Answer and describe the steps taken:
"""

# Interleave text and images so that each image directly follows the text that introduces it.
contents = [
    instructions,
    image_grocery,  # Photo of the fruits
    prompt1,
    image_prices,  # Photo of the price list
    prompt2,  # Step-by-step answering instructions
]

# stream=True yields the answer as a sequence of partial responses
# instead of a single response at the end.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: text items are printed, images are rendered inline.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Understanding Screens and Interfaces

Gemini can also extract information from appliance screens, UIs, screenshots, icons, and layouts.

For example, if you input an image of a stove, you can ask Gemini to provide instructions to help a user navigate the UI and respond in different languages:


In [None]:
# Sample photo of a stove control panel, hosted on Cloud Storage.
image_stove_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/stove.jpg"

# Download the photo and wrap it as a Vertex AI Image object.
image_stove = load_image_from_url(image_stove_url)

# One request combines three tasks: reading the appliance UI, answering in two
# languages (English and French), and describing where the buttons are located.
prompt = """Help me to reset the clock on this appliance?
Provide the instructions in English and French.
If instructions include buttons, also explain where those buttons are physically located.
"""

# Here the image comes first and the question about it second.
contents = [image_stove, prompt]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: the image is rendered inline, the text is printed.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

Note: The response may not be completely accurate, as the model may hallucinate; however, the model is able to identify the location of buttons and translate in a single query. To mitigate hallucinations, one approach is to ground the LLM with retrieval-augmented generation, which is outside the scope of this notebook.


## Understanding entity relationships in technical diagrams

Gemini has multimodal capabilities that enable it to understand diagrams and take actionable steps, such as optimization or code generation. This example demonstrates how Gemini can decipher an entity relationship (ER) diagram, understand the relationships between tables, identify requirements for optimization in a specific environment like BigQuery, and even generate corresponding code.


In [None]:
# Sample entity-relationship (ER) diagram, hosted on Cloud Storage.
image_er_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/er.png"

# Download the diagram and wrap it as a Vertex AI Image object.
image_er = load_image_from_url(image_er_url)

# Ask the model to describe the entities and relationships drawn in the diagram.
prompt = "Document the entities and relationships in this ER diagram."

# Here the instruction comes first and the diagram second.
contents = [prompt, image_er]

# Sampling settings chosen to keep the documentation output close to deterministic.
generation_config = GenerationConfig(
    temperature=0.1,  # Low randomness: the most likely tokens are strongly preferred.
    top_p=0.8,  # Sample only from the most likely tokens whose probabilities add up to 0.8.
    top_k=40,  # Sample only from the 40 most likely tokens at each step.
    candidate_count=1,  # Generate a single response candidate.
    max_output_tokens=2048,  # Upper bound on the length of the generated response.
)

# Send the request with the custom generation settings.
responses = multimodal_model.generate_content(
    contents,
    generation_config=generation_config,
    stream=True,  # Yield the answer as a sequence of partial responses.
)

# Echo the prompt: the text is printed, the diagram is rendered inline.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Recommendations based on multiple images

Gemini is capable of image comparison and providing recommendations. This may be useful in industries like e-commerce and retail.

Below is an example of choosing which pair of glasses would be better suited to an oval-shaped face:


In [None]:
# Sample product photos of two pairs of glasses, hosted on Cloud Storage.
image_glasses1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses1.jpg"
image_glasses2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/glasses2.jpg"

# Download both photos and wrap them as Vertex AI Image objects.
image_glasses1 = load_image_from_url(image_glasses1_url)
image_glasses2 = load_image_from_url(image_glasses2_url)

# The prompt is split into three text segments so that each photo can be placed
# right after the label ("Glasses 1:" / "Glasses 2:") that refers to it.
# prompt3 asks for the recommendation and its reasoning in JSON format.
prompt1 = """
Which of these glasses you recommend for me based on the shape of my face?
I have an oval shape face.
----
Glasses 1:
"""
prompt2 = """
----
Glasses 2:
"""
prompt3 = """
----
Explain how you reach out to this decision.
Provide your recommendation based on my face shape, and reasoning for each in JSON format.
"""

# Order matters: every label is immediately followed by the image it names.
contents = [prompt1, image_glasses1, prompt2, image_glasses2, prompt3]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: text items are printed, images are rendered inline.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Similarity/Differences

Gemini can compare images and identify similarities or differences between objects.

The following example shows two scenes from Marienplatz in Munich, Germany that are slightly different. Gemini can compare the images and find similarities/differences:


In [None]:
# Two sample photos of the same landmark, hosted on Cloud Storage.
image_landmark1_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark1.jpg"
image_landmark2_url = "https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/landmark2.jpg"

# Download both photos and wrap them as Vertex AI Image objects.
image_landmark1 = load_image_from_url(image_landmark1_url)
image_landmark2 = load_image_from_url(image_landmark2_url)

# The prompt is split into three text segments so that each photo can be placed
# right after its label ("Image 1:" / "Image 2:"). prompt3 holds the questions:
# identify the scene in Image 1, then list similarities and differences.
prompt1 = """
Consider the following two images:
Image 1:
"""
prompt2 = """
Image 2:
"""
prompt3 = """
1. What is shown in Image 1? Where is it?
2. What is similar between the two images?
3. What is difference between Image 1 and Image 2 in terms of the contents or people shown?
"""

# Order matters: every label is immediately followed by the image it names.
contents = [prompt1, image_landmark1, prompt2, image_landmark2, prompt3]

# Sampling settings for a factual comparison: temperature 0.0 removes as much
# randomness as the setting allows.
generation_config = GenerationConfig(
    temperature=0.0,  # Lowest randomness: favours the most likely token at each step.
    top_p=0.8,  # Sample only from the most likely tokens whose probabilities add up to 0.8.
    top_k=40,  # Sample only from the 40 most likely tokens at each step.
    candidate_count=1,  # Generate a single response candidate.
    max_output_tokens=2048,  # Upper bound on the length of the generated response.
)

# Send the request with the custom generation settings.
responses = multimodal_model.generate_content(
    contents,
    generation_config=generation_config,
    stream=True,  # Yield the answer as a sequence of partial responses.
)

# Echo the prompt: text items are printed, images are rendered inline.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Generating a video description

Gemini can also generate a description of what is happening in a video:

> Video: https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4

In [None]:
# Three questions about the video: what it shows, where to go to see it,
# and which other places in the world look similar.
prompt = """
What is shown in this video?
Where should I go to see it?
What are the top 5 places in the world that look like this?
"""

# Reference the video by its Cloud Storage URI; the file is not downloaded
# into the notebook.
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4",  # Cloud Storage URI of the video.
    mime_type="video/mp4",  # Declares the referenced file as an MP4 video.
)

# Here the questions come first and the video second.
contents = [prompt, video]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: the text is printed and the video is shown in an inline player.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

> You can confirm that the location is indeed Antalya, Turkey by visiting the Wikipedia page: https://en.wikipedia.org/wiki/Antalya


## Extracting tags of objects throughout the video

Gemini can also extract tags throughout a video.

> Video: https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/photography.mp4


In [None]:
# Restrict the model to what is visible in the video, then ask for the content,
# the action taking place, and ten tags.
prompt = """
Answer the following questions using the video only:
- What is in the video?
- What is the action in the video?
- Provide 10 best tags for this video?
"""

# Reference the video by its Cloud Storage URI; the file is not downloaded
# into the notebook.
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/photography.mp4",  # Cloud Storage URI of the video.
    mime_type="video/mp4",  # Declares the referenced file as an MP4 video.
)

# Here the questions come first and the video second.
contents = [prompt, video]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: the text is printed and the video is shown in an inline player.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Asking more questions about a video

Below is another example of using Gemini to ask questions about the video and return a JSON response.

> Video: https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4  
> _Note: Although this video contains audio, Gemini does not currently support audio input and will only answer based on the video._

In [None]:
# Restrict the model to what is visible in the video, ask three specific
# questions, and request the answer as JSON.
prompt = """
Answer the following questions using the video only:
What is the profession of the main person?
What are the main features of the phone highlighted?
Which city was this recorded in?
Provide the answer JSON.
"""

# Reference the video by its Cloud Storage URI; the file is not downloaded
# into the notebook.
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/pixel8.mp4",  # Cloud Storage URI of the video.
    mime_type="video/mp4",  # Declares the referenced file as an MP4 video.
)

# Here the questions come first and the video second.
contents = [prompt, video]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: the text is printed and the video is shown in an inline player.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

## Retrieving extra information beyond the video


> Video: https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/ottawatrain3.mp4


In [None]:
# These questions (destination, list of stops) are not restricted to the video,
# so the model has to recognise the train line and answer from its own knowledge.
prompt = """
Which line is this?
where does it go?
What are the stations/stops of this line?
"""

# Reference the video by its Cloud Storage URI; the file is not downloaded
# into the notebook.
video = Part.from_uri(
    uri="gs://github-repo/img/gemini/multimodality_usecases_overview/ottawatrain3.mp4",  # Cloud Storage URI of the video.
    mime_type="video/mp4",  # Declares the referenced file as an MP4 video.
)

# Here the questions come first and the video second.
contents = [prompt, video]

# stream=True yields the answer as a sequence of partial responses.
responses = multimodal_model.generate_content(contents, stream=True)

# Echo the prompt: the text is printed and the video is shown in an inline player.
print("-------Prompt--------")
print_multimodal_prompt(contents)

# Print each chunk as it arrives; end="" keeps the chunks on one continuous text flow.
print("\n-------Response--------")
for response in responses:
    print(response.text, end="")

> You can confirm that this is indeed the Confederation Line on Wikipedia here: https://en.wikipedia.org/wiki/Confederation_Line
