In [1]:
import os
# Expose only GPU 0 to this process; set here, ahead of the torch import below.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import sys
# Add the parent directory to the module search path (presumably where
# img2vid_pipeline lives -- confirm against the repository layout).
sys.path.append('..')

import torch
from PIL import Image
from transformers import T5EncoderModel, T5Tokenizer
from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from diffusers.utils import export_to_video

from img2vid_pipeline import CogVideoXImg2VidPipeline

%load_ext autoreload
%autoreload 2

In [2]:
# Hub repository (or local directory) holding every component in its own subfolder.
pretrained_model_name_or_path = "NimVideo/cogvideox-2b-img2vid"


def _load_component(component_cls, subfolder):
    """Load one pipeline component from `subfolder` of the checkpoint.

    Args:
        component_cls: Class exposing a `from_pretrained` constructor.
        subfolder: Name of the subfolder that stores this component.

    Returns:
        Whatever `component_cls.from_pretrained` returns for that subfolder.
    """
    return component_cls.from_pretrained(
        pretrained_model_name_or_path, subfolder=subfolder
    )


# Components are loaded one at a time: text side first, then the video
# transformer, the VAE, and finally the noise scheduler.
tokenizer = _load_component(T5Tokenizer, "tokenizer")
text_encoder = _load_component(T5EncoderModel, "text_encoder")
transformer = _load_component(CogVideoXTransformer3DModel, "transformer")
vae = _load_component(AutoencoderKLCogVideoX, "vae")
scheduler = _load_component(CogVideoXDDIMScheduler, "scheduler")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# Assemble the image-to-video pipeline from the individually loaded components.
pipe = CogVideoXImg2VidPipeline(
    transformer=transformer,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    vae=vae,
    scheduler=scheduler,
)

# Cast the weights to half precision only. The pipeline is deliberately NOT
# moved to 'cuda' here: the offload hooks enabled below own device placement,
# and a manual move to the GPU works against CPU offloading.
pipe = pipe.to(dtype=torch.float16)

# Enable exactly one offload strategy. Model-level and sequential offload are
# alternatives, not complements; when both are requested the later call wins,
# so sequential offload (lowest GPU memory, slowest) is the one kept here.
# If GPU memory allows, use `pipe.enable_model_cpu_offload()` INSTEAD for
# faster inference.
pipe.enable_sequential_cpu_offload()

# Optional extra VAE memory savers (left disabled):
# pipe.vae.enable_slicing()
# pipe.vae.enable_tiling()

In [4]:
# Demo inputs: each entry pairs a conditioning image with the text prompt
# that describes the desired video.
images_info = [
    dict(image_path=path, text=caption)
    for path, caption in (
        (
            "../resources/truck.jpg",
            "A truck is driving through a dirt road, showcasing its capability for off-roading. The scene captures the vehicle in an outdoor setting, surrounded by rugged terrain and open skies. This image highlights various aspects of the truck, including its tires and automotive features.",
        ),
        (
            "../resources/owl.jpg",
            "The image features an owl soaring gracefully in the sky. It captures the essence of wildlife, showcasing the majestic beauty of this bird of prey, specifically highlighting owls like the barn owl and screech owl in an outdoor setting. The scene emphasizes the connection between nature and these fascinating creatures.",
        ),
    )
]

In [7]:
# Generate one video per (image, prompt) pair and write it to the current
# working directory as "<image stem>.mp4".
for image_info in images_info:
    prompt = image_info['text']

    # The context manager releases the file handle; convert() returns an
    # independent, fully loaded RGB copy that stays valid afterwards.
    with Image.open(image_info['image_path']) as source_image:
        image = source_image.convert("RGB")

    video = pipe(
        image=image,
        prompt=prompt,
        num_videos_per_prompt=1,
        num_inference_steps=50,
        num_frames=49,
        use_dynamic_cfg=False,
        guidance_scale=6.0,
        # Fresh generator with a fixed seed per item, so every video starts
        # from the same random state regardless of its position in the list.
        generator=torch.Generator().manual_seed(42),
    ).frames[0]

    # Swap whatever extension the image has for ".mp4". splitext only touches
    # the real suffix, so upper-case or other image extensions are handled and
    # a ".jpg"/".png" substring elsewhere in the name is left alone.
    stem = os.path.splitext(os.path.basename(image_info['image_path']))[0]
    file_name = stem + '.mp4'
    export_to_video(video, file_name, fps=8)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]