In [None]:
# Text-to-Visuals: Generating Videos from Text with Text2Video-Zero
# Using the Text2Video-Zero pipeline to generate video clips from text prompts.

**Imports**

In [2]:
import torch
from diffusers import TextToVideoZeroPipeline
from IPython.display import Video
import imageio
import numpy as np

**Setup and Pipeline Initialization**

In [3]:
# Build the Text2Video-Zero pipeline from the Stable Diffusion v1.5 checkpoint,
# requesting float16 weights, then move the whole pipeline to the CUDA device.
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
pipe = TextToVideoZeroPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/492M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.72k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/806 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/547 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

**Video Generation**

In [4]:
# Custom video generation prompt
prompt = (
    "A cinematic slow-motion shot of a luxury wristwatch resting on a reflective marble surface, "
    "with elegant lighting and rotating camera movement. The watch face illuminates, revealing intricate gears. "
    "Floating text highlights features: 'Swiss Made', 'Sapphire Crystal', 'Water Resistant'. High-end commercial feel."
)

# Seed a dedicated generator so that re-running this cell reproduces the same clip
# instead of drawing fresh random noise on every run.
generator = torch.Generator(device="cuda").manual_seed(0)

# Generate frames from the prompt
result = pipe(prompt=prompt, generator=generator).images

# Convert and save video: scale every frame by 255, cast it to 8-bit, write at 4 fps
result = [(frame * 255).astype("uint8") for frame in result]
imageio.mimsave("video1.mp4", result, fps=4)

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


  0%|          | 0/48 [00:00<?, ?it/s]

**Display of the created video**

In [5]:
# Show the clip written to video1.mp4 as this cell's output; embed=True asks
# IPython to embed the video data in the output instead of linking to the file.
Video("video1.mp4", embed=True)

# **Longer videos**

**Setup for Extended Generation**

In [21]:
# Pipeline: float16 weights to reduce memory use, placed on the GPU
model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
pipe = TextToVideoZeroPipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")

# Generation settings
seed = 0  # fixed random seed so the results are reproducible on every run
video_length = 24  # total number of frames: 24 / 4 fps = 6 seconds
chunk_size = 8  # each generated chunk holds up to 8 frames (the last chunk may be smaller)

# Text description of the scene to generate
prompt = (
    "An astronaut exploring the surface of Mars during sunset. "
    "The landscape glows orange and red, the astronaut walks slowly as dust blows in the wind. "
    "Camera pans slowly from left to right, showing Mars rovers in the distance. "
    "Epic, cinematic tone with wide-angle shots."
)

Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

**Chunked Generation Loop**

In [22]:
# Generate the video chunk-by-chunk
result = []  # per-chunk frame arrays, appended in playback order

# Start index of every chunk. The stride is chunk_size - 1 because each pipeline
# call below also carries frame 0 as an extra leading frame, so a call handles
# at most chunk_size frames in total.
chunk_ids = np.arange(0, video_length, chunk_size - 1)
# End index of every chunk: the start of the next chunk, or video_length for the last one.
chunk_ends = list(chunk_ids[1:]) + [video_length]

generator = torch.Generator(device="cuda")
for chunk_no, (ch_start, ch_end) in enumerate(zip(chunk_ids, chunk_ends), start=1):
    print(f"Processing chunk {chunk_no} / {len(chunk_ids)}")

    # Attach the first frame for Cross Frame Attention: frame 0 always leads the
    # list of frame indices, which helps keep the chunks consistent with each other.
    frame_ids = [0, *range(ch_start, ch_end)]

    # Fix the seed for the temporal consistency: re-seeding before every chunk
    # avoids random visual breaks between chunks.
    generator.manual_seed(seed)
    output = pipe(
        prompt=prompt,
        video_length=len(frame_ids),
        generator=generator,
        frame_ids=frame_ids,
    )

    # Skip the first returned frame (the attached frame 0) and keep the rest.
    result.append(output.images[1:])

Processing chunk 1 / 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Processing chunk 2 / 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Processing chunk 3 / 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

Processing chunk 4 / 4


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/48 [00:00<?, ?it/s]

**Final Assembly and Save**

In [23]:
import imageio

# Concatenate chunks and save.
# `result` (the list of per-chunk frame arrays built by the generation loop) is
# deliberately left untouched: the derived data gets its own names, so this cell
# can be re-run on its own without first regenerating the chunks. Overwriting
# `result` here would make a second run concatenate already-converted frames.
frames_float = np.concatenate(result)  # join all chunks into one frame sequence
frames_uint8 = [(frame * 255).astype("uint8") for frame in frames_float]  # scale by 255, cast to 8-bit
imageio.mimsave("mars_exploration.mp4", frames_uint8, fps=4)  # 24 frames at 4 fps = 6 seconds


**Display of the created video**

In [24]:
# Video is IPython's display object for video files.
from IPython.display import Video
# Last expression of the cell, so the notebook renders mars_exploration.mp4 as
# the cell output; embed=True asks IPython to embed the video data in the output.
Video("mars_exploration.mp4", embed=True)

# **Text2Video-Zero**

**What is it?**
A zero-shot method that turns text prompts into short videos by reusing a pretrained Stable Diffusion model with cross-frame attention (no video training needed).

**Why use it?**

No training required

Runs on standard GPUs

Good temporal consistency

Great for quick concept videos or creative clips

**Cons:**

Low FPS (usually 4)

Short video duration (6–12 sec)

Limited motion realism

Needs a good GPU (can use a lot of VRAM)

**Best for:**

Storyboarding

TikTok-style vertical clips

Prototyping ideas from text