In [None]:
import importlib

from dotenv import load_dotenv

if not importlib.util.find_spec("utils"):
    !pip install -qqq git+https://github.com/xtreamsrl/genai-for-engineers-class

load_dotenv()

# Veo 3.1 Video Generation


Veo 3.1 by Google is an advanced video generation model available on Vertex AI. It enables developers to create high-quality, realistic videos from text and image prompts, supporting a wide range of visual styles and including features like dialogue and audio generation. Veo 3.1 excels in producing videos with detailed visuals and lifelike physics, making it suitable for cinematic and creative applications.

Main features:
- Generates videos from text and image prompts
- Supports various visual styles and cinematic effects
- Includes audio and dialogue generation
- Offers prompt enhancement for improved video quality
- Allows customization of camera angles, movements, and lens effects
- Supports different aspect ratios, resolutions, and durations

In this notebook, we use the Google Gen AI SDK and APIs to interact with Veo 3.1, demonstrating its capabilities through practical examples and exploring its features programmatically.

### Import libraries

In [None]:
import os
import time
from pathlib import Path

import requests
from IPython.display import Markdown, Video, display
from google import genai

### Define a helper function to display media

In [None]:
def create_video(video: bytes, video_tmp_path: Path = None) -> Video:
    """Write raw video bytes to disk and wrap the file in an embedded player.

    Args:
        video: Encoded video content to persist.
        video_tmp_path: Destination file. When falsy, ``temp_video.mp4`` in the
            current working directory is used (and overwritten).

    Returns:
        An ``IPython.display.Video`` pointing at the written file, embedded
        in the notebook at 600x400.
    """
    if video_tmp_path:
        destination = video_tmp_path
    else:
        destination = Path("temp_video.mp4")

    destination.write_bytes(video)
    return Video(destination, embed=True, width=600, height=400)

### Load the video generation model

In [None]:
# Model identifiers used by the cells below.
gemini_model = "gemini-2.5-pro"
image_model = "gemini-2.5-flash-image"
# Use "veo-3.1-generate-preview" for higher quality
video_model = "veo-3.1-fast-generate-preview"

# Shared SDK client; created without explicit arguments, so it presumably picks
# up its credentials from the environment prepared by load_dotenv() - verify.
client = genai.Client()

## Generate videos
Now, you'll generate videos from text and/or image prompts. You can get started with your own prompts or complete the section below to optimize your prompts with some established best practices.

Video generation with Veo 3.1 is significantly slower than generating text or images because it requires more computational resources and time to produce high-quality, realistic videos. As a result, the Google APIs do not return the final video immediately. Instead, the API returns a long-running operation, and you must poll it at intervals to check whether the video generation is complete. The helper below does exactly that: it sleeps between checks (so the cell stays busy while waiting) and downloads the video as soon as the operation reports that it is done.


In [None]:
def wait_for_video_completion(
    operation: genai.types.GenerateVideosOperation,
    wait_interval: int = 10,
) -> Video:
    """Poll a video generation operation until it is done, then download the result.

    Args:
        operation: Operation returned by ``client.models.generate_videos``.
        wait_interval: Seconds to sleep between two consecutive status checks.

    Returns:
        An embedded ``Video`` built from the first generated video.

    Raises:
        RuntimeError: If the operation ended with an error, returned no video,
            or no API key is available to download the result.
        requests.HTTPError: If the download request returns an error status.
    """
    waited_time = 0
    while not operation.done:
        time.sleep(wait_interval)
        waited_time += wait_interval
        print(f"Waited {waited_time} seconds. Checking operation...")
        operation = client.operations.get(operation)
        print(operation)

    # A finished operation is not necessarily a successful one: surface the
    # failure explicitly instead of crashing on a missing `response` attribute.
    if operation.error:
        raise RuntimeError(f"Video generation failed: {operation.error}")
    generated_videos = operation.response.generated_videos if operation.response else None
    if not generated_videos:
        raise RuntimeError("Video generation finished without returning any video.")

    # The SDK client accepts either variable, so mirror that for the download.
    api_key = os.getenv("GOOGLE_API_KEY") or os.getenv("GEMINI_API_KEY")
    if not api_key:
        raise RuntimeError("Set GOOGLE_API_KEY (or GEMINI_API_KEY) to download the generated video.")

    # Passing the key through `params` lets requests merge it correctly whether
    # or not the URI already carries a query string.
    video_response = requests.get(
        generated_videos[0].video.uri,
        params={"key": api_key},
        timeout=60,
    )
    # Fail loudly rather than saving an HTTP error body as if it were a video.
    video_response.raise_for_status()
    return create_video(video_response.content)

In [None]:
# Plain text-to-video request: describe the scene in a single sentence.
prompt = (
    "A realistic video of a cat shooting with a baseball bat in a stadium, "
    "cinematic lighting, high detail"
)

# generate_videos returns an operation handle that is polled by the helper.
response = client.models.generate_videos(model=video_model, prompt=prompt)

# Last expression of the cell, so the returned Video is rendered as its output.
wait_for_video_completion(response)

## Optimize your prompt: Text-to-video

By considering the following options in your prompt, you can use Veo to create higher quality videos that more closely resemble your desired outcome. Learn more about advanced [prompting techniques for Veo 3](https://medium.com/google-cloud/veo-3-a-detailed-prompting-guide-867985b46018). To get started, specify the following, or leave them as `None` if they don't align with your specific goals.
- **Subject:** The "who" or "what" of your video
- **Action:** Describe movements, interactions, etc.
- **Scene:** The "where" and "when" of your video
- **Camera angles:** The shot's viewpoint
- **Camera movements:** For a more cinematic/dynamic experience
- **Lens effects:** How the camera "sees" the world
- **Style:** The video's artistic filter
- **Temporal elements:** To imply changes in time
- **Audio:** Various sound effects or dialogue that guides the visuals through sound

In [None]:
# Prompt building blocks. The trailing `# @param` annotations look like Colab
# form-field markers (free text or a list of choices) - keep them on the same
# line as the assignment. The string "None" is the "not set" choice.

# Core content: who/what, what happens, and where/when.
subject = "a detective"  # @param {type: 'string'}
action = "interrogating a rubber duck"  # @param {type: 'string'}
scene = "in a dark interview room"  # @param {type: 'string'}

# Cinematography and look of the shot.
camera_angle = "Over-the-Shoulder Shot"  # @param ["None", "Eye-Level Shot", "Low-Angle Shot", "High-Angle Shot", "Bird's-Eye View", "Top-Down Shot", "Worm's-Eye View", "Dutch Angle", "Canted Angle", "Close-Up", "Extreme Close-Up", "Medium Shot", "Full Shot", "Long Shot", "Wide Shot", "Establishing Shot", "Over-the-Shoulder Shot", "Point-of-View (POV) Shot"]
camera_movement = "Zoom (In)"  # @param ["None", "Static Shot (or fixed)", "Pan (left)", "Pan (right)", "Tilt (up)", "Tilt (down)", "Dolly (In)", "Dolly (Out)", "Zoom (In)", "Zoom (Out)", "Truck (Left)", "Truck (Right)", "Pedestal (Up)", "Pedestal (Down)", "Crane Shot", "Aerial Shot", "Drone Shot", "Handheld", "Shaky Cam", "Whip Pan", "Arc Shot"]
lens_effects = "None"  # @param ["None", "Wide-Angle Lens (e.g., 24mm)", "Telephoto Lens (e.g., 85mm)", "Shallow Depth of Field", "Bokeh", "Deep Depth of Field", "Lens Flare", "Rack Focus", "Fisheye Lens Effect", "Vertigo Effect (Dolly Zoom)"]
style = "Cinematic"  # @param ["None", "Photorealistic", "Cinematic", "Vintage", "Japanese anime style", "Claymation style", "Stop-motion animation", "In the style of Van Gogh", "Surrealist painting", "Monochromatic black and white", "Vibrant and saturated", "Film noir style", "High-key lighting", "Low-key lighting", "Golden hour glow", "Volumetric lighting", "Backlighting to create a silhouette"]
temporal_elements = "None"  # @param ["None", "Slow-motion", "Fast-paced action", "Time-lapse", "Hyperlapse", "Pulsating light", "Rhythmic movement"]

# Audio cues: ambient sound plus an optional spoken line (empty string = no dialogue).
sound_effects = "Ticking clock"  # @param ["None", "Sound of a phone ringing", "Water splashing", "Soft house sounds", "Ticking clock", "City traffic and sirens", "Waves crashing", "Quiet office hum"]
dialogue = "Where were you last night?"  # @param {type: 'string'}

Now, you'll use Gemini to take all of these keywords and combine them into a detailed Veo prompt.

In [None]:
# Candidate keywords, in the order they should appear in the Gemini request.
maybe_keywords = [
    subject,
    action,
    scene,
    camera_angle,
    camera_movement,
    lens_effects,
    style,
    temporal_elements,
    sound_effects,
]
# Drop unset options: the "None" form choice, but also a literal Python None or
# an empty string, either of which would otherwise break or pollute the join.
keywords = [k for k in maybe_keywords if k and k != "None"]

if dialogue:
    keywords.append(dialogue)

gemini_prompt = f"""
You are an expert video prompt engineer for Google's Veo model.
Your task is to construct the most effective prompt using the following keywords.
Every single keyword MUST be included.
Synthesize them into a single, cohesive, and cinematic instruction.
Output ONLY the final prompt string, without any introduction or explanation.

Keywords: {",".join(keywords)}
"""
response = client.models.generate_content(
    model=gemini_model,
    contents=gemini_prompt,
)

# Gemini may come back without any text; stop here rather than sending an
# empty prompt to the (slow, billable) video model.
if not response.text:
    raise RuntimeError("Gemini returned no text; cannot build a Veo prompt.")

# Set Gemini's response in a prompt variable
prompt = response.text.strip()
display(Markdown(prompt))

response = client.models.generate_videos(
    model=video_model,
    prompt=prompt,
)

# Last expression of the cell, so the returned Video is rendered as its output.
wait_for_video_completion(response)

### Generate videos from an image


#### Generate the starting image

You can also generate a video by starting with an input image.

In this example, you'll first generate an image using Gemini 2.5 Flash Image, and then use that image as the starting point for a video generation with Veo 3.1.

In [None]:
from io import BytesIO
from PIL import Image

prompt = "Panning wide shot of a calico kitten sleeping in the sunshine"

# Step 1: Generate an image with Nano Banana (the model configured in `image_model`).
image_response = client.models.generate_content(
    model=image_model,
    contents=prompt,
)

# Print any text the model returned and keep the first inline image only, so a
# multi-image answer does not trigger several video generations.
image_bytes = None
image_mime_type = None
for part in image_response.candidates[0].content.parts:
    if part.text is not None:
        print(part.text)
    elif part.inline_data is not None and image_bytes is None:
        image_bytes = part.inline_data.data
        # Forward the MIME type reported by the model; fall back to PNG if absent.
        image_mime_type = part.inline_data.mime_type or "image/png"
        image = Image.open(BytesIO(image_bytes))
        image.save("generated_image.png")

if image_bytes is None:
    raise RuntimeError("The image model returned no image; cannot generate a video from it.")

# Step 2: Generate a video with Veo 3.1 using the image from Step 1 as input.
response = client.models.generate_videos(
    model=video_model,
    source=genai.types.GenerateVideosSource(
        prompt=prompt,
        image=genai.types.Image(image_bytes=image_bytes, mime_type=image_mime_type),
    ),
)

# Display explicitly so the video is rendered regardless of where this call sits.
display(wait_for_video_completion(response))