### Install Vertex AI SDK for Python


In [None]:
! pip3 install --upgrade --user google-cloud-aiplatform requests

After installing the packages, restart the session so that the runtime kernel is restarted and loads them.

### Import libraries


In [None]:
from vertexai.generative_models import (
    GenerativeModel,
    Image,
    Part,
)

### Load the Gemini 1.5 Pro model


In [None]:
# Handle to the Gemini model; the cells below call model.generate_content on it.
# NOTE(review): no vertexai.init(project=..., location=...) call is visible in
# this notebook — presumably the runtime supplies a default project and
# credentials; confirm before running elsewhere.
model = GenerativeModel("gemini-1.5-pro")

### Generate text from image prompts

Retrieve an image and send it to the model using its streaming interface.

In [None]:
import requests
import IPython.display

# Retrieve image of an AWS invoice
url = "https://thefengs.com/wuchang/courses/cs410g/files/invoice.jpg"
# A timeout keeps the cell from hanging indefinitely on an unresponsive server.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx reply instead of handing an error page to Image.from_bytes.
response.raise_for_status()
image = Image.from_bytes(response.content)
IPython.display.display(image)

# Prompt for a JSON formatted output of the data in invoice.
# Package image and prompt together and send it to the model
prompt = "Convert invoice data into json format with appropriate json tags as required for the data in the image"
contents = [image, prompt]
responses = model.generate_content(contents, stream=True)

# Print responses as they are returned.
# Note: the loop rebinds `response`, replacing the HTTP response fetched above.
print("\n-------Response--------")
for response in responses:
    print(response.text)


#### Interactive interface

We can use simple UI elements from `ipywidgets` to build an interactive interface. Use it to experiment with queries against the model.


In [None]:
def get_image_from_url(image_url, timeout=30):
    """Download an image over HTTP(S) and wrap it as a Vertex AI ``Image``.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The ``Image`` built from the response body with ``Image.from_bytes``.

    Raises:
        requests.exceptions.HTTPError: If the server replies with a 4xx/5xx
            status.
        requests.exceptions.RequestException: For other request failures,
            including timeouts and connection errors.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on an error status rather than passing an error page's bytes to
    # Image.from_bytes, where the failure would be much harder to diagnose.
    response.raise_for_status()
    return Image.from_bytes(response.content)

def query(url, prompt):
    """Show an image and a prompt, then stream the model's answer to stdout.

    Args:
        url: URL of the image to download and send to the model.
        prompt: Text prompt sent to the model together with the image.

    Returns:
        None. The image, the prompt and each streamed response chunk are
        displayed/printed as a side effect.
    """
    image = get_image_from_url(url)
    print("-------Image--------")
    IPython.display.display(image)
    print("-------Prompt--------")
    print(prompt)
    # Send the very Image object that was displayed, so the URL is downloaded
    # only once and the model sees exactly what the user sees.
    contents = [image, prompt]
    responses = model.generate_content(contents, stream=True)
    # Print responses as they are returned
    print("\n-------Response--------")
    for response in responses:
        print(response.text)

from ipywidgets import widgets, Output

# Output area that captures everything printed while handling a click
output = Output()


def on_button_click(b):
    """Button callback: run ``query`` with the current widget values.

    Args:
        b: The object passed by the button's ``on_click`` hook (unused).
    """
    with output:
        # Drop whatever the previous click left in the output area.
        output.clear_output()
        prompt, url = prompt_input.value, url_input.value
        print(f"[+] Query: Prompt is {prompt}\n URL is {url}")
        query(url, prompt)

# Text boxes for the prompt and the image URL, pre-filled with example values
prompt_input = widgets.Text(
    description="Prompt:",
    placeholder="Enter prompt",
    value="What is this?",
    disabled=False,
)
url_input = widgets.Text(
    description="URL:",
    placeholder="Enter image URL",
    value="https://thefengs.com/wuchang/courses/cs430/go-programming-language-turns-two_gophers.jpg",
    disabled=False,
)

# "Run" button wired to the click handler
button = widgets.Button(description="Run")
button.on_click(on_button_click)

# Render the inputs and the button, followed by the output area
display(prompt_input, url_input, button, output)

### Video processing

Models can also be used to analyze and generate video.  In this example, a short video of a city street is stored in a Google Cloud Storage bucket. Run the cell to examine it.

In [None]:
# The same video (bucket "cs430", object "chicago.mp4") addressed two ways:
# - video_url: HTTPS address, used just below for inline playback.
# - video_uri: gs:// address, passed to Part.from_uri in the next code cell.
video_url = "https://storage.googleapis.com/cs430/chicago.mp4"
video_uri = "gs://cs430/chicago.mp4"

# Must remain the last expression of the cell so the notebook renders the player.
IPython.display.Video(video_url, width=450)

We can ask the vision model to analyze the video for us and to generate a response in a specific output format (JavaScript Object Notation, JSON).

In [None]:
# Questions about the video; the last line of the prompt asks for a JSON answer.
prompt = """
Answer the following questions using the video only:
What are the cars in this video?
What is the red sculpture of?
Which city was this recorded in?
Provide the answer JSON.
"""

# Reference the video by its gs:// URI (video_uri is set in the previous cell).
video = Part.from_uri(video_uri, mime_type="video/mp4")
contents = [prompt, video]

# stream=True: the result is iterated chunk by chunk below.
responses = model.generate_content(contents, stream=True)

# end="" joins the streamed chunks into one continuous block of text.
for response in responses:
    print(response.text, end="")