In [None]:
# One-time environment setup. The clone / cd / install steps are kept commented out;
# only the weight download runs. The download script is addressed by a relative path,
# so this cell has to be executed with the repository root as the working directory.
# !git clone https://github.com/thigazholi/Moore-AnimateAnyone.git
# %cd Moore-AnimateAnyone
# !pip install -r requirements.txt
!python tools/download_weights.py

# 0.Libraries

In [None]:
import sys
sys.path.append(r'/content/Moore-AnimateAnyone')

import argparse
import os
from datetime import datetime
from pathlib import Path
from typing import List

import av
import numpy as np
import torch
import torchvision
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
from einops import repeat
from omegaconf import OmegaConf
from PIL import Image
from torchvision import transforms
from transformers import CLIPVisionModelWithProjection

from configs.prompts.test_cases import TestCasesDict
from src.models.pose_guider import PoseGuider
from src.models.unet_2d_condition import UNet2DConditionModel
from src.models.unet_3d import UNet3DConditionModel
from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline
from src.utils.util import get_fps, read_frames, save_videos_grid

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

# 1.Configuration

In [None]:
#_________________________model_paths_________________________
# Every location below is relative to the current working directory.

# Common parent folder of all checkpoint files / directories.
_weights_root = "./pretrained_weights"

# Hugging Face style model directories.
pretrained_base_model_path = f"{_weights_root}/stable-diffusion-v1-5/"
pretrained_vae_path = f"{_weights_root}/sd-vae-ft-mse"
image_encoder_path = f"{_weights_root}/image_encoder"

# Single-file state dicts.
denoising_unet_path = f"{_weights_root}/denoising_unet.pth"
reference_unet_path = f"{_weights_root}/reference_unet.pth"
pose_guider_path = f"{_weights_root}/pose_guider.pth"
motion_module_path = f"{_weights_root}/motion_module.pth"

# YAML file read with OmegaConf when the models are built.
inference_config = "./configs/inference/inference_v2.yaml"

# Precision every network is cast to.
weight_dtype = torch.float16

# 2.Load models and create a pose2video pipeline

In [None]:
#________________________load_models____________________________
# Instantiates each sub-network on the GPU in `weight_dtype`, restores the
# checkpoints from disk and assembles everything into `pipe`.


def _on_gpu(module):
    """Return `module` moved to CUDA and cast to `weight_dtype`."""
    return module.to(device="cuda", dtype=weight_dtype)


def _restore(module, checkpoint_path, strict=True):
    """Load the state dict stored at `checkpoint_path` (read on CPU) into `module`."""
    state = torch.load(checkpoint_path, map_location="cpu")
    return module.load_state_dict(state, strict=strict)


#________________________VAE____________________________
print("LOADING VAE")
vae = _on_gpu(AutoencoderKL.from_pretrained(pretrained_vae_path))

#________________________UNET2D____________________________
print("LOADING UNET2D")
reference_unet = _on_gpu(
    UNet2DConditionModel.from_pretrained(
        pretrained_base_model_path,
        subfolder="unet",
    )
)

# The inference YAML supplies the 3D-UNet extras and the scheduler settings.
inference_config_path = inference_config
infer_config = OmegaConf.load(inference_config_path)

#________________________UNET3D____________________________
print("LOADING UNET3D")
denoising_unet = _on_gpu(
    UNet3DConditionModel.from_pretrained_2d(
        pretrained_base_model_path,
        motion_module_path,
        subfolder="unet",
        unet_additional_kwargs=infer_config.unet_additional_kwargs,
    )
)

#________________________POSE_GUIDER____________________________
print("LOADING POSE GUIDER")
pose_guider = _on_gpu(PoseGuider(320, block_out_channels=(16, 32, 96, 256)))

#________________________CLIP (reference net part)____________________________
print('CLIP VISION')
image_enc = _on_gpu(CLIPVisionModelWithProjection.from_pretrained(image_encoder_path))
sched_kwargs = OmegaConf.to_container(infer_config.noise_scheduler_kwargs)

#________________________SCHEDULER____________________________
print('SETTING SCHEDULER')
scheduler = DDIMScheduler(**sched_kwargs)

#________________________PRETRAINED WEIGHTS____________________________
print('LOADING PRETRAINED WEIGHTS')
# Only the denoising UNet is loaded non-strictly; the other two must match exactly.
_restore(denoising_unet, denoising_unet_path, strict=False)
_restore(reference_unet, reference_unet_path)
_restore(pose_guider, pose_guider_path)

#________________________PIPELINE____________________________
print("CALLING PIPELINE")
pipe = Pose2VideoPipeline(
    vae=vae,
    image_encoder=image_enc,
    reference_unet=reference_unet,
    denoising_unet=denoising_unet,
    pose_guider=pose_guider,
    scheduler=scheduler,
)
pipe = pipe.to("cuda", dtype=weight_dtype)

LOADING VAE
LOADING UNET2D


Some weights of the model checkpoint were not used when initializing UNet2DConditionModel: 
 ['conv_norm_out.weight, conv_norm_out.bias, conv_out.weight, conv_out.bias']


LOADING UNET3D


  return self.fget.__get__(instance, owner)()


LOADING POSE GUIDER
CLIP VISION
SETTING SCHEDULER
LOADING PRETRAINED WEIGHTS
CALLING PIPELINE


# 3. Tweak parameters here: paths, width, height

In [None]:
# Tweakable inference settings

seed = 42    # fed to torch.manual_seed before the pipeline is called
cfg = 3.5    # guidance value handed to the pipeline
steps = 30   # step count handed to the pipeline (kept low to expedite runs)

# Output resolution in pixels (width, height).
W, H = 512, 784

L = 24       # clip length in frames
fps = None   # output frame rate; None leaves the pose video's frame rate in use

# Inputs: the reference image and the key-point pose video that drives it.
ref_image_path = "./configs/inference/ref_images/anyone-10.png"
pose_video_path = "./configs/inference/pose_videos/anyone-video-1_kps.mp4"


# 4.Generate and save the video here

In [None]:
#____________________________POSE TRANSFORM_________________________________________________
# Reads the pose video, runs the pose2video pipeline on the reference image and saves
# the reference, pose and generated clips together as one video grid.
#
# Inputs (globals set in the parameter cell): seed, cfg, steps, W, H, L, fps,
# ref_image_path, pose_video_path. None of them is reassigned here, so re-running
# this cell always starts from the values chosen in the parameter cell.
#   L   - maximum number of frames to generate; None means "use the whole pose video".
#   fps - frame rate of the saved file; None means "reuse the pose video's frame rate".

print('POSE VIDEO TO FRAME GENERATION')
ref_name = Path(ref_image_path).stem
pose_name = Path(pose_video_path).stem.replace("_kps", "")

print(f"image: {ref_image_path}, pose_name: {pose_video_path}")

ref_image_pil = Image.open(ref_image_path).convert("RGB")

pose_images = read_frames(pose_video_path)
src_fps = get_fps(pose_video_path)
width, height = W, H

print(f"pose video has {len(pose_images)} frames, with {src_fps} fps")

# Honour the requested clip length but never ask for more frames than the video has.
# A separate name is used so the tunable `L` is not clobbered for later runs.
total_frames = len(pose_images)
num_frames = total_frames if L is None else min(int(L), total_frames)
if num_frames < 1:
    raise ValueError(
        f"nothing to generate: L={L!r}, pose video {pose_video_path!r} has {total_frames} frames"
    )

# Same resize + tensor conversion for pose frames and the reference image.
pose_transform = transforms.Compose([transforms.Resize((height, width)), transforms.ToTensor()])

pose_list = list(pose_images[:num_frames])
pose_tensor_list = [pose_transform(pose_image_pil) for pose_image_pil in pose_list]

ref_image_tensor = pose_transform(ref_image_pil)  # (c, h, w)
ref_image_tensor = ref_image_tensor.unsqueeze(1).unsqueeze(0)  # (1, c, 1, h, w)
# Tile the single reference frame along the frame axis so it matches the clip length.
ref_image_tensor = repeat(ref_image_tensor, "b c f h w -> b c (repeat f) h w", repeat=num_frames)

pose_tensor = torch.stack(pose_tensor_list, dim=0)  # (f, c, h, w)
pose_tensor = pose_tensor.transpose(0, 1)  # (c, f, h, w)
pose_tensor = pose_tensor.unsqueeze(0)  # (1, c, f, h, w)

print('SETTING GENERATOR')
generator = torch.manual_seed(seed)


print("Video PIPELINE")
video = pipe(
    ref_image_pil,
    pose_list,
    width,
    height,
    num_frames,
    steps,
    cfg,
    generator=generator,
).videos

# Stack reference, pose and generated clips along the batch axis for the output grid.
video = torch.cat([ref_image_tensor, pose_tensor, video], dim=0)

#_________________________________SAVING THE FILE_____________________________________________

# Take the timestamp once so the date and time parts cannot disagree around midnight.
now = datetime.now()
date_str = now.strftime("%Y%m%d")
time_str = now.strftime("%H%M")
save_dir_name = f"{time_str}--seed_{seed}-{W}x{H}"

save_dir = Path(f"output/{date_str}/{save_dir_name}")
save_dir.mkdir(exist_ok=True, parents=True)

# An explicit `fps` from the parameter cell wins; otherwise keep the source frame rate.
out_fps = src_fps if fps is None else fps

save_videos_grid(
    video,
    f"{save_dir}/{ref_name}_{pose_name}_{H}x{W}_{int(cfg)}_{time_str}.mp4",
    n_rows=3,
    fps=out_fps,
)

POSE VIDEO TO FRAME GENERATION
image: ./configs/inference/ref_images/anyone-10.png, pose_name: ./configs/inference/pose_videos/anyone-video-1_kps.mp4
pose video has 200 frames, with 30 fps
SETTING GENERATOR
Video PIPELINE


  num_channels_latents = self.denoising_unet.in_channels


  0%|          | 0/30 [00:00<?, ?it/s]