In [1]:
import torch
from PIL import Image
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_gif, export_to_video, load_image
from diffusers.utils.import_utils import is_xformers_available
from tqdm import tqdm
import numpy as np
import os
import sys
sys.path.append('..')
from utils.video_logging import save_video_frames_as_frames_parallel, save_video_frames_as_frames, save_video_frames_as_mp4


# Permit TF32 tensor-core matmuls on GPUs that support them.
torch.backends.cuda.matmul.allow_tf32 = True

# device='cuda:2'
device='cuda'
weight_dtype = torch.float16

# Load the fp16 checkpoint variant directly in half precision. Without
# `torch_dtype` the weights are first materialised in float32 and only then
# cast down by `.to(...)`, which roughly doubles peak memory during loading.
pipeline = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid",
    torch_dtype=weight_dtype,
    variant="fp16",
).to(device)

# # https://huggingface.co/docs/diffusers/v0.28.2/en/tutorials/fast_diffusion
# if is_xformers_available():
#     print("enable xformers memory efficient attention")
#     pipeline.unet.enable_xformers_memory_efficient_attention()
# else:
#     print("install xformers to enable memory efficient attention")

# pipeline.enable_model_cpu_offload()
# 20-25% speedup
# torch._inductor.config.conv_1x1_as_mm = True
# torch._inductor.config.coordinate_descent_tuning = True
# torch._inductor.config.epilogue_fusion = False
# torch._inductor.config.coordinate_descent_check_all_directions = True
# pipeline.unet.to(memory_format=torch.channels_last)
# pipeline.vae.to(memory_format=torch.channels_last)
# pipeline.unet = torch.compile(pipeline.unet, mode="max-autotune", fullgraph=True)
# pipeline.vae.decode = torch.compile(pipeline.vae.decode, mode="max-autotune", fullgraph=True)


  from .autonotebook import tqdm as notebook_tqdm
  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")
  deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
Loading pipeline components...: 100%|██████████| 5/5 [00:01<00:00,  3.41it/s]


In [5]:
# Report the parameter count (in millions) of each SVD sub-module.
svd_components = (
    ('unet', pipeline.unet),
    ('vae encoder', pipeline.vae.encoder),
    ('vae decoder', pipeline.vae.decoder),
    ('image encoder', pipeline.image_encoder),
)
for component_name, component in svd_components:
    n_params = sum(p.numel() for p in component.parameters())
    print(f'SVD {component_name} params: {n_params / 1e6} M')
# test 1-step inference speed

SVD unet params: 1524.623082 M
SVD vae encoder params: 34.163592 M
SVD vae decoder params: 63.579183 M
SVD image encoder params: 632.0768 M


In [None]:
# 1-step inference speed check: run the pipeline 10 times with a single
# denoising step on the reference rocket image.
img = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
generator = torch.manual_seed(42)
one_step_kwargs = dict(
    decode_chunk_size=7,
    generator=generator,
    motion_bucket_id=127,
    fps=7,
    num_inference_steps=1,
)
for _run in range(10):
    result = pipeline(img, **one_step_kwargs)
    frames = result.frames[0]
# export_to_gif(frames, "generated_1step.gif")

In [7]:
# img = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
# img_names = [
#     "bird-8014191_1280.jpg",
#     "dog-7396912_1280.jpg",
#     "girl-4898696_1280.jpg",
#     "leaf-7260246_1280.jpg",
#     "power-station-6579092_1280.jpg",
#     "rocket.png",
#     "training-8122941_1280.jpg",
#     "woman-4549327_1280.jpg",
#     "woman-5667299_1280.jpg"
# ]
# Sort the listing: a single seeded generator is shared across all images, so
# the processing order determines the random stream each image receives.
img_names = sorted(os.listdir('../assets/images'))
output_dir = 'svd'
# The exporter does not create missing directories.
os.makedirs(output_dir, exist_ok=True)
generator = torch.manual_seed(42)
for name in img_names:
    img = load_image(f"../assets/images/{name}")
    # PIL's resize takes one (width, height) tuple; passing two positional
    # ints makes the second one the resample filter and raises.
    img = img.resize((1024, 576))
    # Strip the extension whatever its length (".jpg", ".jpeg", ".png", ...).
    stem = os.path.splitext(name)[0]

    # motion_bucket_id: The higher the number the more motion will be in the video.
    # fps: The higher the fps the less choppy the video will be.
    for i in [4, 8, 16, 25]:
        with torch.no_grad():
            for max_guidance_scale in [2.5]: #[1.1, 1.5, 2.0, 3.0]:
                # default max_guidance_scale=3.0, do_cfg is True, therefore bs=2
                frames = pipeline(img, decode_chunk_size=14, generator=generator, max_guidance_scale=max_guidance_scale, motion_bucket_id=127, fps=7, num_inference_steps=i).frames[0]
                # The target is a .gif file, so use the GIF exporter rather
                # than the video exporter.
                export_to_gif(frames, f"{output_dir}/generated_{stem}_step_{i}_max_{max_guidance_scale}.gif")
# frames = pipeline(img, decode_chunk_size=8, generator=generator).frames[0]
# export_to_gif(frames, "generated_1step.gif")
# export_to_video(frames, "rocket.mp4", fps=7)

100%|██████████| 1/1 [00:00<00:00,  1.20it/s]
100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
100%|██████████| 16/16 [00:12<00:00,  1.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
100%|██████████| 16/16 [00:12<00:00,  1.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s]
100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
100%|██████████| 16/16 [00:12<00:00,  1.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.23it/s]
100%|██████████| 2/2 [00:01<00:00,  1.24it/s]
100%|██████████| 4/4 [00:03<00:00,  1.24it/s]
100%|██████████| 8/8 [00:06<00:00,  1.24it/s]
100%|██████████| 16/16 [00:12<00:00,  1.24it/s]
100%|██████████| 1/1 [00:00<00:00,  1.24it/s]
100%|██████████| 2/2 [00:0

In [None]:
# Import under an alias so this cell does not rebind the PIL `Image` class
# imported at the top of the notebook.
from IPython.display import Image as DisplayImage

# Replace 'generated.gif' with the path to your GIF file
gif_path = 'generated.gif'
DisplayImage(url=gif_path)

In [10]:
import os
from PIL import Image

# Root directory of the extracted UCF-101 frames. Set the UCF_FRAMES_DIR
# environment variable to point elsewhere; the original path is the default.
ucf_frames = os.environ.get(
    'UCF_FRAMES_DIR',
    '/home/yiming/project/MyProjects/VisionGen/SVD/ucf-101_frames',
)

def process_ucf_frames(ucf_frames, num_inference_steps=4, max_process=10):
    """Generate one SVD clip per ``.jpg`` frame found under ``ucf_frames``.

    Each frame is resized to 1024x576 and passed to the module-level
    ``pipeline`` with a freshly seeded generator (seed 42). The generated
    frames are downscaled to 320x240 and written to
    ``./ucf_generated_svd_{num_inference_steps}steps/<frame stem>.mp4``.

    Args:
        ucf_frames: Root directory searched recursively for ``.jpg`` files.
        num_inference_steps: Denoising steps per clip; also embedded in the
            output directory name.
        max_process: Maximum number of frames to process. Values <= 0
            process nothing.
    """
    ucf_generated_svd = f'./ucf_generated_svd_{num_inference_steps}steps'

    # Create the output directory if it doesn't exist
    os.makedirs(ucf_generated_svd, exist_ok=True)

    # Collect every .jpg under the root. Sorting makes the selected subset
    # independent of the filesystem's directory-listing order.
    image_files = sorted(
        os.path.join(root, file)
        for root, _dirs, files in os.walk(ucf_frames)
        for file in files
        if file.lower().endswith('.jpg')
    )

    # Slice instead of counting inside the loop so that exactly `max_process`
    # frames are handled (the previous `>` check let one extra through).
    for image_file in image_files[:max(max_process, 0)]:
        # Close the file handle as soon as the pixels are copied; convert()
        # returns a new in-memory image with a consistent RGB mode.
        with Image.open(image_file) as source:
            img = source.convert('RGB').resize((1024, 576))
        # Re-seed per frame so every clip starts from the same random state.
        generator = torch.manual_seed(42)

        with torch.no_grad():
            frames = pipeline(img, decode_chunk_size=7, generator=generator, motion_bucket_id=127, fps=7, num_inference_steps=num_inference_steps).frames[0]

        resized_frames = [frame.resize((320, 240)) for frame in frames]
        stem = os.path.splitext(os.path.basename(image_file))[0]
        output_video_path = os.path.join(ucf_generated_svd, f"{stem}.mp4")
        export_to_video(resized_frames, output_video_path, fps=7)

# Example usage
# process_ucf_frames(ucf_frames)
# process_ucf_frames(ucf_frames, num_inference_steps=8)
# process_ucf_frames(ucf_frames, num_inference_steps=16)
# process_ucf_frames(ucf_frames, num_inference_steps=25, max_process=200000)


100%|██████████| 25/25 [00:20<00:00,  1.24it/s]
 60%|██████    | 15/25 [00:12<00:08,  1.24it/s]