In [None]:
import os, sys
# Make the parent directory importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if ".." not in sys.path:
    sys.path.append("..")

# Root locations used to build the argument list below. Each default is the
# path this notebook was written against; set the environment variable to run
# on another machine without editing the cell.
COMFYUI_MODELS_DIR = os.environ.get("COMFYUI_MODELS_DIR", "/home/yo564250/workspace/ComfyUI/models")
OPENS2V_DATASETS_DIR = os.environ.get("OPENS2V_DATASETS_DIR", "/groups/chenchen/patrick/OpenS2V-Nexus/datasets")
TRAIN_OUTPUT_DIR = os.environ.get("FPACK_TRAIN_OUTPUT_DIR", "outputs/training/idmask_control_lora")

# Emulate the command line of fpack_train_network.py so the argument parser
# built further down in this cell can be driven from inside the notebook.
sys.argv = [
    "fpack_train_network.py",
    # model checkpoints
    "--dit", f"{COMFYUI_MODELS_DIR}/diffusion_models/FramePackI2V_HY_bf16.safetensors",
    "--vae", f"{COMFYUI_MODELS_DIR}/vae/hunyuan-video-t2v-720p-vae.pt",
    "--text_encoder1", f"{COMFYUI_MODELS_DIR}/text_encoders/llava_llama3_fp16.safetensors",
    "--text_encoder2", f"{COMFYUI_MODELS_DIR}/text_encoders/clip_l.safetensors",
    "--image_encoder", f"{COMFYUI_MODELS_DIR}/clip_vision/sigclip_vision_patch14_384.safetensors",
    # dataset
    "--dataset_config", f"{OPENS2V_DATASETS_DIR}/OpenS2V_part1_test3_2.toml",
    # attention / precision / mode
    "--sdpa", "--mixed_precision", "bf16", "--one_frame",
    # optimisation
    "--optimizer_type", "adamw8bit",
    "--learning_rate", "2e-4",
    "--gradient_checkpointing",
    "--timestep_sampling", "shift",
    "--weighting_scheme", "none",
    "--discrete_flow_shift", "3.0",
    # data loading
    "--max_data_loader_n_workers", "8",
    "--persistent_data_loader_workers",
    # LoRA network
    "--network_module", "networks.lora_framepack",
    "--network_dim", "32",
    # schedule
    "--max_train_epochs", "16",
    "--save_every_n_epochs", "1",
    "--seed", "42",
    # sampling during training
    "--sample_prompts", f"{OPENS2V_DATASETS_DIR}/test3_2_sample_prompts.txt",
    "--sample_every_n_epochs", "1",
    "--sample_at_first",
    # outputs and logging
    "--output_dir", TRAIN_OUTPUT_DIR,
    "--output_name", "idmask_control_lora_test1",
    "--logging_dir", f"{TRAIN_OUTPUT_DIR}/logs",
    "--log_with", "tensorboard",
    # project-specific switches
    "--remove_embedding", "--use_attention_controlimage_masking",
]

from pathlib import Path
from tqdm import tqdm
import importlib
from omegaconf import OmegaConf
from argparse import Namespace
import math, json
import numpy as np
from multiprocessing import Value
from PIL import Image, ImageDraw
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
from safetensors.torch import load_file
import lovely_tensors as lt
lt.monkey_patch()

from musubi_tuner.dataset import config_utils
from musubi_tuner.dataset.config_utils import BlueprintGenerator, ConfigSanitizer
from musubi_tuner.networks import lora_framepack
from musubi_tuner.hv_train_network import collator_class, setup_parser_common, read_config_from_file, load_prompts
from musubi_tuner.fpack_train_network import framepack_setup_parser, FramePackNetworkTrainer

# Build the argument parser (common options extended by the FramePack-specific
# ones) and parse the emulated command line from sys.argv.
parser = framepack_setup_parser(setup_parser_common())
args = read_config_from_file(parser.parse_args(), parser)

# Values this notebook pins regardless of what was parsed.
for _attr, _fixed_value in (
    ("vae_dtype", "float16"),
    ("dit_dtype", "bfloat16"),
    ("sample_solver", "unipc"),  # solver used for sample generation
):
    setattr(args, _attr, _fixed_value)

device = torch.device("cuda:0")

# The trainer object supplies the model loading and sampling helpers used in
# the cells below.
trainer = FramePackNetworkTrainer()
trainer.handle_model_specific_args(args)

In [None]:
# Turn the dataset config file (args.dataset_config) into a dataset group via
# the blueprint helpers.
blueprint_generator = BlueprintGenerator(ConfigSanitizer())
user_config = config_utils.load_user_config(args.dataset_config)
blueprint = blueprint_generator.generate(user_config, args, architecture=trainer.architecture)
train_dataset_group = config_utils.generate_dataset_group_by_blueprint(blueprint.dataset_group, training=True)

# NOTE(review): the two shared integers are presumably the current epoch/step
# counters and the last argument the dataset handle — confirm against
# collator_class.
collator = collator_class(Value("i", 0), Value("i", 0), None)

# --seed is parsed but was never applied, so shuffle=True produced a different
# sampling order (and therefore a different `batch`) on every kernel run.
# Seeding the loader's generator makes the sampling order reproducible.
_seed = getattr(args, "seed", None)
shuffle_generator = torch.Generator().manual_seed(_seed) if _seed is not None else None

train_dataloader = DataLoader(
    train_dataset_group,
    batch_size=1,
    shuffle=True,
    collate_fn=collator,
    num_workers=4,
    persistent_workers=args.persistent_data_loader_workers,
    generator=shuffle_generator,
)

# Fetch a single batch for inspection; loop over train_dataloader instead to
# walk the whole dataset.
batch = next(iter(train_dataloader))

```bash
# Command-line form of the training run, launched through `accelerate`.
# Compared with the sys.argv list in the first notebook cell, this command
# additionally passes --split_attn and --sample_with_latentbbox_rope, uses
# --sample_every_n_steps 250 instead of --sample_every_n_epochs 1, and writes
# to the idmask_control_lora_wrope_v1 output/logging directories with output
# name idmask_control_lora_wrope_v1_4.
accelerate launch --num_cpu_threads_per_process 1 --mixed_precision bf16 fpack_train_network.py \
    --dit /home/yo564250/workspace/ComfyUI/models/diffusion_models/FramePackI2V_HY_bf16.safetensors \
    --vae /home/yo564250/workspace/ComfyUI/models/vae/hunyuan-video-t2v-720p-vae.pt \
    --text_encoder1 /home/yo564250/workspace/ComfyUI/models/text_encoders/llava_llama3_fp16.safetensors \
    --text_encoder2 /home/yo564250/workspace/ComfyUI/models/text_encoders/clip_l.safetensors \
    --image_encoder /home/yo564250/workspace/ComfyUI/models/clip_vision/sigclip_vision_patch14_384.safetensors \
    --dataset_config /groups/chenchen/patrick/OpenS2V-Nexus/datasets/OpenS2V_part1_test3_2.toml \
    --sdpa --mixed_precision bf16 --one_frame \
    --optimizer_type adamw8bit --learning_rate 2e-4 --gradient_checkpointing \
    --timestep_sampling shift --weighting_scheme none --discrete_flow_shift 3.0 \
    --max_data_loader_n_workers 8 --persistent_data_loader_workers --split_attn \
    --network_module networks.lora_framepack --network_dim 32 \
    --max_train_epochs 16 --save_every_n_epochs 1 --seed 42 \
    --sample_prompts /groups/chenchen/patrick/OpenS2V-Nexus/datasets/test3_2_sample_prompts.txt \
    --sample_every_n_steps 250 --sample_at_first \
    --output_dir outputs/training/idmask_control_lora_wrope_v1 --output_name idmask_control_lora_wrope_v1_4 \
    --logging_dir outputs/training/idmask_control_lora_wrope_v1/logs --log_with tensorboard \
    --remove_embedding --use_attention_controlimage_masking --sample_with_latentbbox_rope
```

In [3]:
# dtypes: the three transformer-related names all refer to bfloat16, the VAE
# uses float16.
weight_dtype = dit_dtype = dit_weight_dtype = torch.bfloat16
vae_dtype = torch.float16

# Pre-process the sample prompt file. A bare Namespace carrying only `device`
# is passed where the trainer expects an accelerator-like object.
sample_parameters = trainer.process_sample_prompts(args, Namespace(device=device), args.sample_prompts)

# VAE used when sampling images: frozen, switched to eval mode, then moved to
# `device`.
vae = trainer.load_vae(args, vae_dtype=vae_dtype, vae_path=args.vae)
vae.eval()
vae.requires_grad_(False)
vae.to(device)

# Transformer (DiT) checkpoint, likewise frozen and in eval mode.
transformer = trainer.load_transformer(
    Namespace(device=device), args, args.dit, "torch", args.split_attn, device, dit_weight_dtype
)
transformer.requires_grad_(False)
transformer.eval()

# Rather than creating a fresh LoRA network with
# lora_framepack.create_arch_network(...) and attaching it through
# network.apply_to(...), previously trained LoRA weights are read from disk
# and merged into the transformer.
lora_checkpoint_path = "/home/yo564250/workspace/whisperer/related/framepackbase/musubi-tuner/outputs/training/idmask_control_lora/idmask_control_lora_test3-000008.safetensors"
weights_sd = load_file(lora_checkpoint_path)
module = lora_framepack.create_arch_network_from_weights(1.0, weights_sd, unet=transformer, for_inference=True)
module.merge_to(None, transformer, weights_sd, weight_dtype, "cpu")

# Only the transformer gets gradient checkpointing enabled; there is no
# separate network object here to enable it on.
transformer.enable_gradient_checkpointing()

INFO:musubi_tuner.fpack_train_network:cache Text Encoder outputs for sample prompt: /groups/chenchen/patrick/OpenS2V-Nexus/datasets/test3_2_sample_prompts.txt
INFO:musubi_tuner.frame_pack.framepack_utils:Loading text encoder 1 tokenizer
INFO:musubi_tuner.frame_pack.framepack_utils:Loading text encoder 1 from /home/yo564250/workspace/ComfyUI/models/text_encoders/llava_llama3_fp16.safetensors
INFO:musubi_tuner.frame_pack.framepack_utils:Loading text encoder 2 tokenizer
INFO:musubi_tuner.frame_pack.framepack_utils:Loading text encoder 2 from /home/yo564250/workspace/ComfyUI/models/text_encoders/clip_l.safetensors
INFO:musubi_tuner.fpack_train_network:cache Text Encoder outputs for prompt: A man seated on a green cushioned chair in a dimly lit room with a backdrop that includes climbing holds. He is dressed casually in a red shirt and jeans. In front of him on the cushion are several items, including a deck of playing cards, a small black bag, and a pair of gloves. The man appears to be en

In [4]:
# Stand-in for the accelerator argument: a Namespace exposing only `device`.
accelerator_stub = Namespace(device=device)
# Use the second entry of the processed sample prompts.
selected_sample = sample_parameters[1]

# Run one sampling pass with autograd disabled.
# NOTE(review): ".", 0, 0 are presumably the save directory, epoch and step —
# confirm against sample_image_inference's signature.
with torch.inference_mode():
    trainer.sample_image_inference(
        accelerator_stub, args, transformer, dit_dtype, vae, ".", selected_sample, 0, 0
    )

INFO:musubi_tuner.hv_train_network:prompt: Three individuals seated closely together in what appears to be a casual indoor setting. The person in the center is wearing a gray hoodie with pink accents and has light-colored hair. To the left, another individual is dressed in a red shirt with a graphic design, and to the right, a person with long dark hair is wearing a light-colored top. The background includes a wall with a colorful mural or artwork, and the room has a modern, cozy ambiance with soft lighting. The individuals are engaged in conversation, with the central figure speaking and the others listening and reacting with smiles and nods. The camera remains stationary, capturing the scene from a medium shot perspective.
INFO:musubi_tuner.hv_train_network:height: 720
INFO:musubi_tuner.hv_train_network:width: 1280
INFO:musubi_tuner.hv_train_network:frame count: 1
INFO:musubi_tuner.hv_train_network:sample steps: 25
INFO:musubi_tuner.hv_train_network:guidance scale: 10.0
INFO:musubi_t

INFO:musubi_tuner.fpack_train_network:Encoding control image: /groups/chenchen/patrick/OpenS2V-Nexus/datasets/test3_2/RH9DTExtz1s_segment_55_step1-0-73_step2-0-73_step4_step5_step6/source_facecrop_0.png
INFO:musubi_tuner.fpack_train_network:Encoding entity mask: /groups/chenchen/patrick/OpenS2V-Nexus/datasets/test3_2/RH9DTExtz1s_segment_55_step1-0-73_step2-0-73_step4_step5_step6/target_bodmask_0.png
INFO:musubi_tuner.fpack_train_network:Set index for clean latent 1x: ['0']
INFO:musubi_tuner.fpack_train_network:Set index for target: 9
INFO:musubi_tuner.fpack_train_network:No clean_latents_2x
INFO:musubi_tuner.fpack_train_network:No clean_latents_4x
INFO:musubi_tuner.fpack_train_network:One frame inference. clean_latent: torch.Size([1, 16, 1, 34, 25]) latent_indices: tensor[1, 1] i64 [[9]], clean_latent_indices: tensor[1, 1] i64 [[0]], num_frames: 1


  0%|          | 0/25 [00:00<?, ?it/s]

INFO:musubi_tuner.fpack_train_network:Waiting for 5 seconds to finish block swap
INFO:musubi_tuner.fpack_generate_video:Decoding video...
INFO:musubi_tuner.fpack_generate_video:Bulk decoding or one frame inference
INFO:musubi_tuner.fpack_generate_video:Decoded. Pixel shape torch.Size([1, 3, 1, 720, 1280])
