In [None]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '../../'))

from transformers import T5EncoderModel, AutoTokenizer
from transformers import CLIPVisionModelWithProjection
from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
from Mutiple_prompt_mutiple_scene.utils import load_feature_extractor_from_ckpt, load_transformer_multi_ti2v
from Mutiple_prompt_mutiple_scene.Model.CogVideoX_ti2v import CogVideoXTransformer3D_TI2V

from omegaconf import OmegaConf

# Root directory of the locally stored checkpoint; each component lives in
# its own subfolder underneath it.
model_dir = '../../excluded_dir/local_model/model_dir'
# The feature extractor keeps its config and weights side by side, so build
# the directory path once and reuse it.
feature_extractor_dir = os.path.join(model_dir, 'feature_extractor')

# Load all components. Every from_pretrained call passes
# local_files_only=True, so only files already on disk are used.
tokenizer = AutoTokenizer.from_pretrained(model_dir, subfolder="tokenizer", local_files_only=True)
text_encoder = T5EncoderModel.from_pretrained(
    model_dir, subfolder="text_encoder", local_files_only=True
)

# Image encoder: CLIP vision model with projection head, stored in the
# 'Clip4Clip' subfolder.
image_encoder = CLIPVisionModelWithProjection.from_pretrained(model_dir, subfolder='Clip4Clip', local_files_only=True)

# Feature extractor: read the `model.Feature_Extraction_Module` section of
# its YAML config, then build the module from the saved checkpoint.
configs = OmegaConf.load(os.path.join(feature_extractor_dir, 'config.yaml')).model.Feature_Extraction_Module
feature_extractor = load_feature_extractor_from_ckpt(
    configs,
    os.path.join(feature_extractor_dir, 'feature_extraction_model_last.pth'),
)

# TODO: pass all of the transformer's parameters into its config so the
# pipeline can use them.
transformer = CogVideoXTransformer3D_TI2V.from_pretrained(model_dir, subfolder='transformer', local_files_only=True)
scheduler = CogVideoXDPMScheduler.from_pretrained(model_dir, subfolder="scheduler", local_files_only=True)
vae = AutoencoderKLCogVideoX.from_pretrained(model_dir, subfolder="vae", local_files_only=True)

# Status message (Chinese: "finished loading all components"); the literal is
# runtime output and is intentionally left unchanged.
print('加载所有组件完成')

In [None]:
from CogVideoX_Muti_Prompt_pipline import *


# Assemble the multi-prompt pipeline from the components loaded in the
# previous cell.
pipe = CogVideoX_MultiPrompt_Pipeline(
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    image_encoder=image_encoder,
    feature_extractor=feature_extractor,
    transformer=transformer,
    vae=vae,
    scheduler=scheduler,
)

# Rebuild the scheduler from the pipeline's current scheduler config, but
# with "trailing" timestep spacing.
pipe.scheduler = CogVideoXDPMScheduler.from_config(
    pipe.scheduler.config,
    timestep_spacing="trailing",
)

# Memory-saving switches. NOTE(review): presumably the standard diffusers
# behaviour (CPU offload of submodules, sliced/tiled VAE processing) --
# confirm against the custom pipeline class.
pipe.enable_sequential_cpu_offload()
pipe.vae.enable_slicing()
pipe.vae.enable_tiling()

# Prompt for the scene to generate.
prompt = 'The scene is indoors with a view from above. The background is a beige carpet. A black bag is in the center of the scene.'

# Prompts of the earlier scenes, one per entry in `past_videos`.
past_prompts = [
    'The scene is indoors with a view from above. The background is a beige carpet. A black bag is in the center of the scene. Actor1 is holding the bag, and the bag is filled with objects. Actor1 opens the bag and says, "Hey everybody, Nick here, and today I got a review for you of this little guy. Um, this is the silent pocket uh 20-liter pack. Um first off though, I want to thank very much Silent...',
    'The scene is indoors with a view from above. The background is a beige carpet. A black bag is in the center of the scene. Actor1 says "of course, here it is against your...',
]

# Clips of the earlier scenes (absolute paths on the original machine).
past_videos = [
    '/root/autodl-tmp/Ours/Multiple scene/test_dataset/videos/0-ggn3z52oU_76/split/The Silent Pocket 20 Liter Faraday Pack A Quick Shabazz Review-Scene-002.mp4',
    '/root/autodl-tmp/Ours/Multiple scene/test_dataset/videos/0-ggn3z52oU_76/split/The Silent Pocket 20 Liter Faraday Pack A Quick Shabazz Review-Scene-004.mp4',
]

# Run generation.
# NOTE(review): the .mp4 paths are passed through the `past_images` keyword;
# verify that the pipeline accepts video file paths for this argument.
output = pipe(
    prompt=prompt,
    past_prompts=past_prompts,
    past_images=past_videos,
    height=480,
    width=720,
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
    image_guidance_scale=6.0,
    use_dynamic_cfg=True,
)

# Printed once the pipeline call has returned.
print(1)