In [1]:
import torch
from mochi_pipeline import MochiPipeline
from diffusers.utils import export_to_video

# Load the Mochi preview checkpoint with bf16 weights, then move the whole
# pipeline onto the GPU as a separate step.
pipe = MochiPipeline.from_pretrained(
    "genmo/mochi-1-preview",
    variant="bf16",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to("cuda")

# Enable memory savings: let the VAE work tile by tile.
pipe.enable_vae_tiling()

  from .autonotebook import tqdm as notebook_tqdm

A mixture of bf16 and non-bf16 filenames will be loaded.
Loaded bf16 filenames:
[transformer/diffusion_pytorch_model.bf16-00001-of-00003.safetensors, transformer/diffusion_pytorch_model.safetensors.index.bf16.json, vae/diffusion_pytorch_model.bf16.safetensors, transformer/diffusion_pytorch_model.bf16-00002-of-00003.safetensors, transformer/diffusion_pytorch_model.bf16-00003-of-00003.safetensors]
Loaded non-bf16 filenames:
[text_encoder/model.safetensors.index.json, text_encoder/model-00001-of-00002.safetensors, text_encoder/model-00001-of-00004.safetensors, text_encoder/model-00002-of-00002.safetensors, text_encoder/model-00003-of-00004.safetensors, text_encoder/model-00004-of-00004.safetensors, text_encoder/model-00002-of-00004.safetensors
If this behavior is not expected, please check your folder structure.
Loading checkpoint shards: 100%|██████████| 3/3 [00:00<00:00, 33.68it/s]it/s]
Loading checkpoint shards: 100%|██████████| 4/4 [0

KeyboardInterrupt: 

In [None]:
from processor import MochiAttnProcessor2_0
from transformers import T5TokenizerFast

# Checkpoint whose bundled tokenizer is loaded below.
MODEL_ID = "genmo/mochi-1-preview"

# Text prompt used both for generation and for locating the token of interest.
prompt = "An apple camflagued in a field of flowers, the color of the apple is red, blended in to the flower makes it hard to see"

# Tokenizer stored in the checkpoint's "tokenizer" subfolder.
tokenizer = T5TokenizerFast.from_pretrained(MODEL_ID, subfolder="tokenizer")

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


In [None]:
# Position of the "▁apple" piece within the tokenized prompt; raises
# ValueError if the tokenizer does not produce that exact piece.
prompt_tokens = tokenizer.tokenize(prompt)
index = prompt_tokens.index("▁apple")
index

1

In [None]:
# Give every transformer block its own processor instance, each pointed at
# the prompt token whose attention should be recorded.
for transformer_block in pipe.transformer.transformer_blocks:
    transformer_block.attn1.processor = MochiAttnProcessor2_0(
        token_index_of_interest=torch.tensor([index])
    )

# Run the text-to-video pipeline and keep the frames of the first result.
generation = pipe(
    prompt,
    negative_prompt="bad quality, ugly faces, moving camera, easy to swe, stand out, able to see",
    num_inference_steps=10,
    guidance_scale=9,
    num_frames=30,
)
frames = generation.frames[0]

# Last expression of the cell: the call's return value is displayed.
export_to_video(frames, "mochi.mp4", fps=30)

100%|██████████| 10/10 [00:37<00:00,  3.75s/it]
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'mochi.mp4'

In [None]:
# Number of cells in the grid obtained by dividing each frame side into
# 16-pixel steps. The parentheses matter: without them `a//16 * b//16`
# evaluates left to right as `((a//16) * b) // 16`, which only equals the
# intended product when the second side is a multiple of 16.
(frames[0].size[0] // 16) * (frames[0].size[1] // 16)

1590

In [None]:
import pylab
import numpy as np

# Spatial grid used to fold each flattened attention row back into 2-D maps:
# frame height and width, each divided into 16-pixel steps. Computed once
# instead of on every loop iteration.
grid_rows = frames[0].size[1] // 16
grid_cols = frames[0].size[0] // 16

# Attention maps recorded by the pipeline, indexed as maps[step][layer].
maps = pipe.attention_maps

extracted_maps = []
for step_maps in maps:
    for layer_map in step_maps:
        # Original author's note gives the layout as [B, H, L, D] — TODO confirm.
        # Take the first entry along axis 0, average over the next axis, then
        # take the first row of what remains.
        # Named `token_map` rather than `map` so the builtin `map` is not
        # overwritten for the rest of the kernel session.
        token_map = layer_map[0].mean(0)[0]
        extracted_maps.append(
            token_map.cpu().float().numpy().reshape(-1, grid_rows, grid_cols)
        )

# One entry per (step, layer) pair, stacked along a new leading axis.
extracted_maps = np.array(extracted_maps)

In [None]:
# Sanity check: leading axis has one entry per (step, layer) map; the last two
# axes are the (height // 16, width // 16) grid the maps were reshaped to.
extracted_maps.shape

(480, 5, 30, 53)

In [None]:
# Per-position statistics across all (step, layer) maps (axis 0), computed once.
maps_mean = extracted_maps.mean(axis=0)
maps_std = extracted_maps.std(axis=0)

# Flag values lying more than one standard deviation above or below the mean.
mask = (extracted_maps > maps_mean + maps_std) | (extracted_maps < maps_mean - maps_std)

# Blank the flagged values in a copy so `extracted_maps` keeps the raw maps
# and this cell can be re-run safely.
filtered_maps = np.where(mask, np.nan, extracted_maps)

# Average the magnitudes of the remaining values over axis 0, ignoring NaNs.
mean_map = np.nanmean(np.abs(filtered_maps), axis=0)

In [None]:
# Sanity check: the leading (step, layer) axis has been averaged away by the
# nanmean over axis 0, leaving one 2-D map per remaining leading index.
mean_map.shape

(5, 30, 53)

In [None]:
import cv2


def opening(x, kernel_size=3):
    """Apply a morphological opening to every map in ``x``.

    Args:
        x: iterable of 2-D arrays accepted by ``cv2.morphologyEx``.
        kernel_size: side length of the elliptical structuring element.

    Returns:
        A list with one opened map per input map, in the same order.
    """
    element = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size))
    opened = []
    for plane in x:
        opened.append(cv2.morphologyEx(plane, cv2.MORPH_OPEN, element))
    return opened

def blur(x, kernel_size=3):
    """Smooth every map in ``x`` with a uniform averaging kernel.

    Args:
        x: iterable of 2-D arrays accepted by ``cv2.filter2D``.
        kernel_size: side length of the square kernel; all weights are equal
            and sum to one.

    Returns:
        A list with one filtered map per input map, in the same order.
    """
    area = kernel_size * kernel_size
    box = np.ones((kernel_size, kernel_size), np.float32) / area
    smoothed = []
    for plane in x:
        # ddepth=-1 is passed through unchanged from the original call.
        smoothed.append(cv2.filter2D(plane, -1, box))
    return smoothed

def resize(x, size):
    """Resize every map in ``x`` to ``size`` with ``cv2.resize`` defaults.

    Args:
        x: iterable of arrays accepted by ``cv2.resize``.
        size: target size, forwarded unchanged as the second positional
            argument of ``cv2.resize``.

    Returns:
        A list with one resized map per input map, in the same order.
    """
    resized = []
    for plane in x:
        resized.append(cv2.resize(plane, size))
    return resized
    
    
def normalize(x):
    """Min-max normalise every map in ``x`` to the range 0..1.

    Args:
        x: iterable of arrays accepted by ``cv2.normalize``.

    Returns:
        A list with one rescaled map per input map, in the same order. Each
        map is scaled on its own, not jointly with the others.
    """
    scaled = []
    for plane in x:
        scaled.append(cv2.normalize(plane, None, 0, 1, cv2.NORM_MINMAX))
    return scaled

def compose_frames(frames, maps, temporal_factor=6):
    """Blend each video frame 50/50 with its corresponding attention map.

    Args:
        frames: sequence of images convertible with ``np.array``; pixel
            values are divided by 255 (assumes 8-bit RGB frames — TODO confirm).
        maps: sequence of single-channel maps with the same height and width
            as the frames, in a dtype ``cv2.cvtColor`` accepts.
        temporal_factor: number of consecutive frames that share one map.
            Defaults to 6, the value that was previously hard-coded.

    Returns:
        A new list of float arrays, one blended frame per input frame.

    Raises:
        ValueError: if ``temporal_factor`` is smaller than 1.
    """
    if temporal_factor < 1:
        raise ValueError("temporal_factor must be a positive integer")

    last_map = len(maps) - 1
    composed = []
    for i, frame in enumerate(frames):
        image = np.array(frame).astype(float) / 255
        # Clamp so trailing frames reuse the final map instead of raising
        # IndexError when there are more frames than temporal_factor * len(maps).
        map_index = min(i // temporal_factor, last_map)
        heat = cv2.cvtColor(maps[map_index], cv2.COLOR_GRAY2RGB).astype(float)
        composed.append(cv2.addWeighted(image, 0.5, heat, 0.5, 0))
    return composed
    
    
# Post-process the averaged attention maps and overlay them on the video.
# Each stage gets its own name so `mean_map` is left untouched and re-running
# this cell always starts from the same input.
attention_opened = opening(mean_map, kernel_size=3)
attention_blurred = blur(attention_opened, kernel_size=3)

# Upsample to the frame size, passed in the same order as `frames[0].size`.
frame_size = (frames[0].size[0], frames[0].size[1])
attention_resized = resize(attention_blurred, size=frame_size)

# Rescale each map independently before blending it with the frames.
attention_normalized = normalize(attention_resized)
video = compose_frames(frames, attention_normalized)

# Last expression of the cell: the call's return value is displayed.
export_to_video(video, "mochi_attention.mp4", fps=30)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


'mochi_attention.mp4'