# Direct-a-Video Step-by-Step Instruction

## Import packages

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, DDIMScheduler
import imageio  # pip install imageio==2.9.0 imageio-ffmpeg==0.4.2

from src.unet_3d import MyUNet3DConditionModel as UNet3DConditionModel
from src.t2v_pipeline import TextToVideoSDPipeline, tensor2vid
from utils import *

## Setup

In [None]:
# --- Runtime ---------------------------------------------------------------
# NOTE(review): `torch` and `os` are not imported explicitly in this notebook;
# presumably they are re-exported by `from utils import *` -- confirm.
seed = 1926
dtype = torch.float16  # half precision is recommended for faster inference
device = torch.device("cuda:0")

# --- Checkpoints -----------------------------------------------------------
cam_ckpt_path = "ckpt/unet_cam_model.ckpt"  # trained camera model, download link is in the readme file
pretrained_model_name_or_path = "cerspense/zeroscope_v2_576w"  # base model

# --- Output location -------------------------------------------------------
# Results are written to `output_dir`; a copy of the project's source files is
# stored in its `code` sub-directory.
output_dir = "outputs_test"
os.makedirs(output_dir, exist_ok=True)
save_project_src_files(
    ["src/*.py", "./*.py"],
    os.path.join(output_dir, "code"),
)

# --- Sampling --------------------------------------------------------------
# f: number of frames, h: frame height, w: frame width
f, h, w = 24, 320, 512
num_inference_steps = 50

## Load Models

In [None]:
## load unet
# The pretrained UNet is loaded only as a source of weights. The UNet that is
# actually used is rebuilt from the same config with a different attention type
# and then filled from the merged state dicts below.
unet_orig = UNet3DConditionModel.from_pretrained(pretrained_model_name_or_path, subfolder="unet", torch_dtype=dtype)
unet_config = UNet3DConditionModel.load_config(pretrained_model_name_or_path, subfolder='unet')
unet_config['attention_type'] = 'cross_temp'  # set attention type to cross_temp so that camera model can be initialized below
unet = UNet3DConditionModel.from_config(unet_config)

unet_orig_ckpt = unet_orig.state_dict()
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code. Only load camera checkpoints obtained from a trusted source.
unet_cam_ckpt = torch.load(cam_ckpt_path, map_location='cpu')

# Entries of the camera checkpoint take precedence over same-named base entries;
# strict=True makes any missing or unexpected key an error.
unet.load_state_dict({**unet_orig_ckpt, **unet_cam_ckpt}, strict=True)
unet.to(dtype=dtype)
# The state dict references the original UNet's tensors, so it has to be
# dropped together with the model for that memory to be released.
del unet_orig, unet_orig_ckpt, unet_cam_ckpt

## Set attn processors, including temporal cross attention and spatial cross attention
unet.set_direct_a_video_attn_processors()

## load other models
vae = AutoencoderKL.from_pretrained(pretrained_model_name_or_path, subfolder="vae", torch_dtype=dtype)
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
# The tokenizer and the scheduler hold no model weights, so no torch_dtype is passed to them.
tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, subfolder='tokenizer')
text_encoder = CLIPTextModel.from_pretrained(pretrained_model_name_or_path, subfolder='text_encoder', torch_dtype=dtype)
scheduler = DDIMScheduler.from_pretrained(pretrained_model_name_or_path, subfolder='scheduler')
# scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)  # you may use other samplers other than DDIMScheduler


## make pipeline
pipeline = TextToVideoSDPipeline(
    vae=vae,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    unet=unet,
    scheduler=scheduler,
).to(device)  # .to(accelerator.device)

pipeline.set_progress_bar_config(disable=False)

E = pipeline.encode_pixels  # vae encoder alias, for debug use
D = pipeline.decode_latents  # vae decoder alias, for debug use

# Seeded random generator on the target device; None when no seed is configured.
generator = None if seed is None else torch.Generator(device=device).manual_seed(seed)
neg_prompt = "low quality, ugly, blurry image, bad quality, low resolution, disfigured, bad anatomy, bad composition"

## Inference: Camera Motion Only

Set prompt and camera motion parameters here

In [None]:
# Text prompt describing the scene to generate.
prompt = (
    "a waterfall in a beautiful forest with fall foliage, "
    "best quality, extremely detailed, national geographic."
)

# Camera motion parameters:
#   cam_tx: x-pan ratio (-1~1), >0 for right, <0 for left
#   cam_ty: y-pan ratio (-1~1), >0 for down,  <0 for up
#   cam_s : zoom ratio (0.5~2), >1 for zoom in, <1 for zoom out
cam_tx, cam_ty, cam_s = 0.3, -0.2, 0.85

In [None]:
## Prepare cam_motion parameters
# (x-pan, y-pan, zoom) packed into a triple-nested list, i.e. a tensor of shape (1, 1, 3)
cam_motion = torch.tensor([[[float(cam_tx), float(cam_ty), float(cam_s)]]]).to(device=device, dtype=dtype)

## Run inference
out = pipeline(
    prompt=prompt,
    negative_prompt=neg_prompt,
    cam_motion=cam_motion,
    cam_off_t=0.85,  # time step to turn off camera motion control
    cam_cfg=True,  # enable classifier-free guidance for camera motion control
    num_frames=f,
    num_inference_steps=num_inference_steps,  # use the value configured in the setup cell
    width=w,
    height=h,
    generator=generator,  # seeded generator built from `seed` (None when no seed is set)
).frames

### Save video
# The file name is the first 50 characters of the prompt plus a "_cam" suffix.
save_name = os.path.join(output_dir, f"{prompt[:50]}_cam.mp4")
imageio.mimsave(save_name, out, fps=8)
print(f"Video saved to '{save_name}'")

## Inference: Object Motion Only

First let's set prompt and object bbox:

**Instructions on prompt:** 
* Use * to mark the object(s) word and the background word (optional), just append * right after the word.
    For example, "a tiger* and a bear* walking in snow*"
* If an object consists of more than one word, wrap the words in ( ). E.g., "a (white tiger) walking in (green grassland)"
* The marks * and ( ) can be used together, e.g., "a tiger* and a (bear) walking in (green grassland)"
* The marked background word (if any) should always be the last marked word, as seen with the above examples.

**Instructions on bbox:** 

bbox describes the bounding boxes of the objects, which depict each object's spatial-temporal motion trajectory in the video.

We provide example bboxes in the code below, which you can run directly. If you wish to create your own bbox, you may use our UI tool to draw boxes, save them to a .npy file, and then load it as done in the code below. If you want to learn more about the bbox format, see the details below:

* bbox is a list of tensors; the list length is the number of objects (excluding the background). If the bbox list contains more than one box, the order of the boxes must be consistent with the order of the marked object words in the prompt.
* Each tensor in the bbox list should have size 24*4, where 24 is the number of frames and 4 means [x1,y1,x2,y2], the normalized coordinates (value range 0~1) of the box's top-left and bottom-right corners in each frame.

In [None]:
### Choose which example to run.
# Previously both examples were assigned one after the other, so the
# single-object prompt/bbox were always overwritten by the multi-object ones.
# The default below keeps that effective behaviour; set it to False to run the
# single-object example instead.
use_multi_object = True

if use_multi_object:
    ## Multi objects motion
    # One box per marked object, in the same order as the marked words in the prompt.
    prompt = "an horse* and a house* in grassland*"
    bbox1 = load_bbox("data/box_left_to_right.npy", f=f)
    bbox2 = load_bbox("data/box_static.npy", f=f)
    bbox = [bbox1, bbox2]
else:
    ## Single object motion
    prompt = "a horse* walking in grassland*"
    bbox = [load_bbox("data/box_left_to_right.npy", f=f)]  # load bbox from file

## Let's visualize the object bbox
visualize_bbox(bbox, height=h, width=w, frames=f)

In [None]:
### Prepare the kwargs for object attention modulation
# you can adjust attn_lambda to control the modulation strength
# adjust attn_tau to control the modulation timestep (turn off when t < attn_tau)
obj_motion_attn_kwargs = pipeline.prepare_obj_motion_attn_kwargs(prompt=prompt, bbox=bbox,
                                                                attn_lambda=25, attn_tau=0.95,
                                                                h=h, w=w, f=f)

### Run inference
out = pipeline(
    prompt=prompt,
    negative_prompt=neg_prompt,
    # cam_motion=cam_motion,
    # cam_off_t=0.85,
    # cam_cfg=True,
    cross_attention_kwargs=obj_motion_attn_kwargs,  #### add object motion attention control
    num_frames=f,
    num_inference_steps=num_inference_steps,  # use the value configured in the setup cell
    width=w,
    height=h,
    generator=generator,  # seeded generator built from `seed` (None when no seed is set)
).frames

### Save video
# The '*' object markers are stripped from the prompt before it is used as the file name.
save_name = os.path.join(output_dir, f"{prompt[:50].replace('*','')}_obj.mp4")
imageio.mimsave(save_name, out, fps=8)
print(f"Video saved to '{save_name}'")

## Inference: Camera + Object Motion

In [None]:
## Camera motion
#   cam_tx: x-pan ratio (-1~1), >0 for right, <0 for left
#   cam_ty: y-pan ratio (-1~1), >0 for down,  <0 for up
#   cam_s : zoom ratio (0.5~2), >1 for zoom in, <1 for zoom out
cam_tx, cam_ty, cam_s = 0.3, 0, 1.2


## Single object motion
prompt = "a zebra* walking along the river*"
bbox = [
    load_bbox("data/box_left_to_right.npy", f=f),  # box trajectory loaded from file
]

## Multi objects motion (alternative example, disabled)
# prompt = "a tiger* and a bear* walking in grass*"
# bbox1 = load_bbox("./box/11.npy", f=f)
# bbox2 = load_bbox("./box/22.npy", f=f)
# bbox = [bbox1, bbox2]

In [None]:
## Prepare cam_motion parameters
# (x-pan, y-pan, zoom) packed into a triple-nested list, i.e. a tensor of shape (1, 1, 3)
cam_motion = torch.tensor([[[float(cam_tx), float(cam_ty), float(cam_s)]]]).to(device=device, dtype=dtype)

## Prepare the kwargs for object motion
obj_motion_attn_kwargs = pipeline.prepare_obj_motion_attn_kwargs(prompt=prompt, bbox=bbox,
                                                                attn_lambda=25, attn_tau=0.95,
                                                                h=h, w=w, f=f)
## Run inference
out = pipeline(
    prompt=prompt,
    negative_prompt=neg_prompt,

    cam_motion=cam_motion,
    cam_off_t=0.85,  # time step to turn off camera motion control
    cam_cfg=True,  # enable classifier-free guidance for camera motion control

    cross_attention_kwargs=obj_motion_attn_kwargs,  # add object motion attention control

    num_frames=f,
    num_inference_steps=num_inference_steps,  # use the value configured in the setup cell
    width=w,
    height=h,
    generator=generator,  # seeded generator built from `seed` (None when no seed is set)
).frames

### Save video
# The '*' object markers are stripped from the prompt before it is used as the file name.
save_name = os.path.join(output_dir, f"{prompt[:50].replace('*','')}_cam_obj.mp4")
imageio.mimsave(save_name, out, fps=8)