In [None]:
import torch
from diffsynth import ModelManager, WanVideoPipeline, save_video, VideoData

In [2]:
trained_model_path = "models/lip_finetuned/wandb/diffsynth-studio/n1eqrg1d/checkpoints/model-step=700.ckpt"

### Load LoRA

In [None]:
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
model_manager.load_models([
    "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
    "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
    "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
])
model_manager.load_lora(trained_model_path, lora_alpha=1.0)
pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")
pipe.enable_vram_management(num_persistent_param_in_dit=None)

video = pipe(
    prompt="i was telling my friend about how much i wanted to see him",
    negative_prompt="low quality, unclear facial expressions, blurry",
    num_inference_steps=50,
    seed=0, tiled=True
)
save_video(video, "video.mp4", fps=30, quality=5)

### Load FPFT

In [18]:
model_manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
model_manager.load_models([
    "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",  # Load base DiT first
    "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
    "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
])

# Get the DiT model from the manager
dit_model = model_manager.fetch_model("wan_video_dit")

# Now load and filter your fine-tuned checkpoint
checkpoint_path = "models/lip_finetuned/wandb/diffsynth-studio/n1eqrg1d/checkpoints/model-step=700.ckpt"
model_dict = torch.load(checkpoint_path)

# Filter out audio-related parameters
filtered_dict = {k: v for k, v in model_dict.items() if "audio" not in k}

# Directly load the filtered weights into the base model
# The prefix "denoising_model." is common in these checkpoints
model_weights = {k.replace("denoising_model.", ""): v for k, v in filtered_dict.items() 
                 if k.startswith("denoising_model.")}

# Load filtered weights into the DiT model
dit_model.load_state_dict(model_weights, strict=False)

# Create the pipeline
pipe = WanVideoPipeline.from_model_manager(model_manager, device="cuda")
pipe.enable_vram_management(num_persistent_param_in_dit=None)

Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors
    model_name: wan_video_dit model_class: WanModel
        This model is initialized with extra kwargs: {'has_image_input': False, 'patch_size': [1, 2, 2], 'in_dim': 16, 'dim': 1536, 'ffn_dim': 8960, 'freq_dim': 256, 'text_dim': 4096, 'out_dim': 16, 'num_heads': 12, 'num_layers': 30, 'eps': 1e-06}


    The following models are loaded: ['wan_video_dit'].
Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth
    model_name: wan_video_text_encoder model_class: WanTextEncoder
    The following models are loaded: ['wan_video_text_encoder'].
Loading models from: models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth
    model_name: wan_video_vae model_class: WanVideoVAE
    The following models are loaded: ['wan_video_vae'].
Using wan_video_dit from models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors.
Using wan_video_text_encoder from models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth.
Using wan_video_dit from models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors.
Using wan_video_vae from models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth.
No wan_video_image_encoder models available.
No wan_video_motion_controller models available.
No wan_video_vace models available.


In [None]:
video = pipe(
    prompt="A person talking to their friend at a well-lit cafe in the middle of New York. Ana yells at her friend, 'What is it that you've been doing all of this time?', while she looks at Ana's friend with a stern expression.",
    negative_prompt="...",
    num_inference_steps=50,
    seed=0, tiled=True
)
save_video(video, "video.mp4", fps=20, quality=5)

Prompt:  A person talking to their friend at a well-lit cafe in the middle of New York. Ana yells at her friend, 'What is it that you've been doing all of this time?', while she looks at Ana's friend with a stern expression.
Prompt:  ...


100%|██████████| 50/50 [02:47<00:00,  3.35s/it]
VAE decoding: 100%|██████████| 9/9 [00:08<00:00,  1.08it/s]
Saving video:   0%|          | 0/81 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Saving video: 100%|██████████| 81/81 [00:00<00:00, 314.00it/s]
