In [None]:
import torch
# Split a training checkpoint into the two-section file saved at the bottom of
# this cell: {"image_proj": ..., "ip_adapter": ...}.
ckpt = "/home/ubuntu/IP-Adapter/output_dir/checkpoint-80-0/ip_adapter.bin"
# NOTE(review): torch.load unpickles the file; only run this on checkpoints you trust.
sd = torch.load(ckpt, map_location="cpu")

image_proj_prefix = "image_proj_model."
adapter_prefix = "adapter_modules."
image_proj_sd = {}
ip_sd = {}
for k in sd:
    # Remove only the *leading* prefix. str.replace() would also delete any
    # later occurrence of the same text inside the key.
    if k.startswith(image_proj_prefix):
        image_proj_sd[k[len(image_proj_prefix):]] = sd[k]
    elif k.startswith(adapter_prefix):
        ip_sd[k[len(adapter_prefix):]] = sd[k]
    # Every other key (for example the "unet..." entries) is intentionally dropped.

# Written to the current working directory.
torch.save({"image_proj": image_proj_sd, "ip_adapter": ip_sd}, "ip_adapter.bin")

In [None]:
# Show the adapter-module entries collected by the conversion cell (keys and their stored values).
ip_sd

In [None]:
from types import MethodType

import torch
from diffusers import StableDiffusionControlNetPipeline, DDIMScheduler, AutoencoderKL, ControlNetModel
from PIL import Image

from ip_adapter import IPAdapter


In [None]:
# Identifier given to StableDiffusionControlNetPipeline.from_pretrained(...) in the pipeline cells.
base_model_path = "runwayml/stable-diffusion-v1-5"
# Identifier given to AutoencoderKL.from_pretrained(...).
vae_model_path = "stabilityai/sd-vae-ft-mse"
# The next two values are handed to IPAdapter(...) together with `device`.
# Both are relative paths, so they resolve against the notebook's working directory.
image_encoder_path = "models/image_encoder/"
ip_ckpt = "models/ip-adapter_sd15.bin"
# ip_ckpt = "/home/ubuntu/IP-Adapter/output_dir_finetune/checkpoint-2-0/ip_adapter.bin"
device = "cuda"

In [None]:
def image_grid(imgs, rows, cols):
    """Tile images into a single ``rows`` x ``cols`` sheet, filling row by row.

    Args:
        imgs: Sequence of images; its length must equal ``rows * cols``. The
            cell size is taken from ``imgs[0].size`` and used for every tile.
        rows: Number of rows in the sheet.
        cols: Number of columns in the sheet.

    Returns:
        A new 'RGB' image of size ``(cols * w, rows * h)`` with each input
        pasted at its cell's top-left corner.

    Raises:
        AssertionError: If ``len(imgs) != rows * cols``.
    """
    assert len(imgs) == rows*cols

    w, h = imgs[0].size
    grid = Image.new('RGB', size=(cols*w, rows*h))

    for i, img in enumerate(imgs):
        # divmod turns the flat index into (row, column) for row-major filling.
        row, col = divmod(i, cols)
        grid.paste(img, box=(col*w, row*h))
    return grid

# Scheduler settings are collected in one mapping and expanded into the
# DDIMScheduler constructor as keyword arguments.
scheduler_settings = {
    "num_train_timesteps": 1000,
    "beta_start": 0.00085,
    "beta_end": 0.012,
    "beta_schedule": "scaled_linear",
    "clip_sample": False,
    "set_alpha_to_one": False,
    "steps_offset": 1,
}
noise_scheduler = DDIMScheduler(**scheduler_settings)

# Load the VAE first, then cast it to float16 so it matches the fp16 pipelines
# that receive it via `vae=vae`.
vae = AutoencoderKL.from_pretrained(vae_model_path)
vae = vae.to(dtype=torch.float16)

## ControlNet Depth

In [None]:
# load controlnet
# ControlNet weights for the depth section, loaded as float16.
controlnet_model_path = "lllyasviel/control_v11f1p_sd15_depth"
controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
# load SD pipeline
# Reuses the `noise_scheduler` and `vae` objects created in the helper cell;
# feature_extractor and safety_checker are explicitly passed as None.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

In [None]:
# read image prompt
# Both files are opened from paths relative to the working directory. `image` is
# later passed to generate() as `pil_image` and `depth_map` as `image`.
image = Image.open("assets/images/statue.png")
depth_map = Image.open("assets/structure_controls/depth.png")
# Preview only: the 256x256 copies are tiled side by side and not stored, so
# `image` and `depth_map` keep their original size.
image_grid([image.resize((256, 256)), depth_map.resize((256, 256))], 1, 2)

In [None]:
# load ip-adapter
# Built from the pipeline loaded above plus the encoder path, checkpoint path and
# device from the config cell. The following cells call ip_model.generate(...).
ip_model = IPAdapter(pipe, image_encoder_path, ip_ckpt, device)

In [None]:
# generate image variations
# Requests 4 samples (50 inference steps, seed 42) using `image` as pil_image and
# `depth_map` as the `image` argument, then tiles the results in a single row.
images = ip_model.generate(pil_image=image, image=depth_map, num_samples=4, num_inference_steps=50, seed=42)
grid = image_grid(images, 1, 4)
# Bare last expression so the notebook renders the grid as the cell output.
grid

## ControlNet OpenPose

In [None]:
# load SD pipe
# Drop the previous pipeline and adapter, then release cached CUDA memory before
# loading the next ControlNet. NOTE: `del` raises NameError when either name is
# missing, e.g. if this cell is re-run before `ip_model` has been rebuilt.
del pipe, ip_model
torch.cuda.empty_cache()
# load controlnet
# ControlNet weights for the OpenPose section, loaded as float16.
controlnet_model_path = "lllyasviel/control_v11p_sd15_openpose"
controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16)
# load SD pipeline
# Same arguments as the depth pipeline; only `controlnet` differs.
pipe = StableDiffusionControlNetPipeline.from_pretrained(
    base_model_path,
    controlnet=controlnet,
    torch_dtype=torch.float16,
    scheduler=noise_scheduler,
    vae=vae,
    feature_extractor=None,
    safety_checker=None
)

In [None]:
# read image prompt
# image = Image.open("/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/adventure_girl/dead/frame_1.png")
image = Image.open("/home/ubuntu/IP-Adapter/data/Test/in_sample/Ash Williams/motions/stand/reference.png")
# The resized copy is only rendered as the cell output; `image` itself is unchanged.
image.resize((512, 512))

In [None]:
# Pose picture that is passed to generate() as the `image` argument in the generation cell.
openpose_image = Image.open("/home/ubuntu/IP-Adapter/data/Test/in_sample/Ash Williams/motions/stand/poses/humanpose_3.png")
# openpose_image = Image.open("assets/structure_controls/openpose.png")
# The resized copy is only rendered as the cell output; `openpose_image` itself is unchanged.
openpose_image.resize((512, 512))

In [None]:
# load ip-adapter
# Rebuilt on top of the OpenPose pipeline; every later cell in the notebook uses
# this `ip_model` instance.
ip_model = IPAdapter(pipe, image_encoder_path, ip_ckpt, device)

In [None]:
# generate
# Requests 4 samples at 512x512 (100 inference steps, seed 42) using `image` as
# pil_image and `openpose_image` as the `image` argument, then tiles them in one row.
images = ip_model.generate(pil_image=image, image=openpose_image, width=512, height=512, num_samples=4, num_inference_steps=100, seed=42)
grid = image_grid(images, 1, 4)
# Bare last expression so the notebook renders the grid as the cell output.
grid

## Generate without finetuning

In [None]:
import torch
from PIL import Image

# Image grid helper function
def image_grid(imgs, rows, cols):
    """Lay out ``imgs`` on one 'RGB' sheet with ``rows`` rows and ``cols`` columns.

    The cell size comes from the first image. Images are placed row by row, and
    ``len(imgs)`` must equal ``rows * cols`` (checked with an assert).
    """
    assert len(imgs) == rows * cols

    tile_w, tile_h = imgs[0].size
    sheet = Image.new('RGB', size=(cols * tile_w, rows * tile_h))
    for index, tile in enumerate(imgs):
        row, col = divmod(index, cols)
        sheet.paste(tile, box=(col * tile_w, row * tile_h))
    return sheet

# Inputs: one reference frame and the pose pictures to render
ref_image_path = "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/frame_1.png"
pose_image_paths = [
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_3.png",
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_4.png",
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_5.png",
]

# Each ground-truth path is the pose path with 'humanpose' swapped for 'frame'
gt_image_paths = [path.replace("humanpose", "frame") for path in pose_image_paths]

# Everything is brought to the same 512x512 size before generation and display
frame_size = (512, 512)
ref_image = Image.open(ref_image_path).resize(frame_size)

# One generation call per pose; the three lists stay index-aligned
pose_images, generated_images, gt_images = [], [], []
for pose_path, gt_path in zip(pose_image_paths, gt_image_paths):
    pose_image = Image.open(pose_path).resize(frame_size)
    gt_image = Image.open(gt_path).resize(frame_size)
    samples = ip_model.generate(
        pil_image=ref_image,
        image=pose_image,
        width=frame_size[0],
        height=frame_size[1],
        num_samples=1,
        num_inference_steps=100,
        seed=42,
    )
    generated_image = samples[0]  # num_samples=1, so take the only sample
    pose_images.append(pose_image)
    generated_images.append(generated_image)
    gt_images.append(gt_image)

# Grid layout: one row per pose, columns are ref | pose | generated | GT
cols = 4
rows = len(pose_images)
all_images = [
    picture
    for row_images in zip(pose_images, generated_images, gt_images)
    for picture in (ref_image, *row_images)
]

# Sanity check before tiling
assert len(all_images) == rows * cols, (
    f"Number of images ({len(all_images)}) does not match grid size ({rows}x{cols})."
)

# Build the sheet, write it to disk and open it in the default viewer
grid = image_grid(all_images, rows, cols)
grid.save("output_grid_with_gt.png")
grid.show()


In [None]:
from PIL import Image

# Helper function to concatenate images horizontally
def horizontal_concat(images, resize_to=None):
    """Paste images side by side, left to right, onto one new 'RGB' canvas.

    Args:
        images: Non-empty iterable of images.
        resize_to: Optional ``(width, height)``; when given, every image is
            resized to it before the strip is assembled.

    Returns:
        A new image whose width is the sum of the (resized) widths and whose
        height is the largest (resized) height. Each image is pasted at y=0.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("horizontal_concat() needs at least one image")

    if resize_to:
        images = [img.resize(resize_to) for img in images]

    # Measure AFTER the optional resize so the canvas matches what is pasted.
    total_width = sum(img.width for img in images)
    max_height = max(img.height for img in images)

    concat_image = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        concat_image.paste(img, (x_offset, 0))
        x_offset += img.width
    return concat_image

# Inputs: one reference frame and the pose pictures to render
ref_image_path = "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/frame_1.png"
pose_image_paths = [
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_3.png",
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_4.png",
    "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/ninjagirlnew/throw/humanpose_5.png",
]

# Reference image at 512x512
ref_image = Image.open(ref_image_path).resize((512, 512))

# Each ground-truth path is the pose path with 'humanpose' swapped for 'frame'
gt_image_paths = [path.replace("humanpose", "frame") for path in pose_image_paths]

# Pose and GT frames, all at 512x512
pose_images = []
for path in pose_image_paths:
    pose_images.append(Image.open(path).resize((512, 512)))
gt_images = []
for path in gt_image_paths:
    gt_images.append(Image.open(path).resize((512, 512)))

# All poses side by side in one control image
concat_pose_image = horizontal_concat(pose_images)

# Single generation call over the whole strip (one 512-wide slot per pose)
strip_samples = ip_model.generate(
    pil_image=ref_image,
    image=concat_pose_image,
    width=len(pose_images) * 512,
    height=512,
    num_samples=1,
    num_inference_steps=100,
    seed=42,
)
generated_image = strip_samples[0]

# Ground-truth frames side by side for comparison
concat_gt_image = horizontal_concat(gt_images)

# Rows of the final sheet, top to bottom:
#   1. the reference image repeated once per pose
#   2. the concatenated poses
#   3. the generated strip (already one wide image)
#   4. the concatenated ground truth
ref_row = horizontal_concat([ref_image] * len(pose_images))
pose_row = concat_pose_image
generated_row = generated_image
gt_row = concat_gt_image

# Stack the four rows; every row is placed at a multiple of the reference row height
final_grid = Image.new('RGB', (ref_row.width, ref_row.height * 4))
for row_index, row_image in enumerate((ref_row, pose_row, generated_row, gt_row)):
    final_grid.paste(row_image, (0, ref_row.height * row_index))

# Write the sheet to disk and open it in the default viewer
final_grid.save("output_grid_with_gt_4x1.png")
final_grid.show()


In [None]:
# import torch
# from PIL import Image
# from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
# from ip_adapter import IPAdapter  # Replace with your actual IPAdapter import

# # Helper function to create an image grid
# def image_grid(imgs, rows, cols):
#     assert len(imgs) == rows * cols

#     w, h = imgs[0].size
#     grid = Image.new('RGB', size=(cols * w, rows * h))
#     for i, img in enumerate(imgs):
#         grid.paste(img, box=(i % cols * w, i // cols * h))
#     return grid

# # Paths
# ref_image_path = "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/adventure_girl/jump/frame_1.png"
# pose_image_paths = [
#     "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/adventure_girl/jump/humanpose_2.png",
#     "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/adventure_girl/jump/humanpose_3.png",
#     "/home/ubuntu/Sprite-Sheet-Diffusion/data_handlabel/adventure_girl/jump/humanpose_4.png",
# ]

# # Model configuration
# base_model_path = "runwayml/stable-diffusion-v1-5"
# vae_model_path = "stabilityai/sd-vae-ft-mse"
# controlnet_model_path = "lllyasviel/control_v11p_sd15_openpose"
# image_encoder_path = "models/image_encoder/"
# ip_ckpt = "models/ip-adapter_sd15.bin"
# device = "cuda"

# # Load models
# torch.cuda.empty_cache()
# controlnet = ControlNetModel.from_pretrained(controlnet_model_path, torch_dtype=torch.float16).to(device)
# vae = AutoencoderKL.from_pretrained(vae_model_path).to(dtype=torch.float16)
# pipe = StableDiffusionControlNetPipeline.from_pretrained(
#     base_model_path,
#     controlnet=controlnet,
#     vae=vae,
#     torch_dtype=torch.float16
# ).to(device)

# # Adjust ControlNet influence
# pipe.controlnet_conditioning_scale = 0.8  # Default is 1.0; reduce for looser pose adherence

# # Initialize IPAdapter
# ip_model = IPAdapter(pipe, image_encoder_path, ip_ckpt, device)

# # Load and preprocess reference image
# ref_image = Image.open(ref_image_path).resize((512, 512))

# # Generate images for each pose
# generated_images = []
# pose_images = []
# for pose_path in pose_image_paths:
#     pose_image = Image.open(pose_path).resize((512, 512))
#     generated_image = ip_model.generate(
#         pil_image=ref_image,
#         image=pose_image,
#         width=512,
#         height=512,
#         num_samples=1,  # Single output per pose image
#         num_inference_steps=150,  # Increase steps for better quality
#         seed=42,
#         # adapter_weight=0.8,  # Increase IPAdapter influence
#         guidance_scale=10.0  # Adjust guidance scale for stricter adherence to reference
#     )[0]  # Access the first (and only) generated sample
#     pose_images.append(pose_image)
#     generated_images.append(generated_image)

# # Create visualization grid
# rows = len(pose_images)  # Each row contains ref, pose, and generated images
# cols = 3  # One column each for ref, pose, and generated image
# all_images = []

# # Add images row-wise: ref, pose, generated
# for pose_image, generated_image in zip(pose_images, generated_images):
#     all_images.extend([ref_image, pose_image, generated_image])

# # Ensure the grid dimensions match the images count
# assert len(all_images) == rows * cols, (
#     f"Number of images ({len(all_images)}) does not match grid size ({rows}x{cols})."
# )

# # Create and save the grid
# grid = image_grid(all_images, rows, cols)
# grid.save("output_grid.png")
# grid.show()


In [None]:
# loop through all, first frame as reference, generate one frame at a time

import os
from PIL import Image

# Helper function to concatenate images horizontally
def horizontal_concat(images, resize_to=None):
    """Paste images side by side, left to right, onto one new 'RGB' canvas.

    Args:
        images: Non-empty iterable of images.
        resize_to: Optional ``(width, height)``; when given, every image is
            resized to it before the strip is assembled.

    Returns:
        A new image whose width is the sum of the (resized) widths and whose
        height is the largest (resized) height. Each image is pasted at y=0.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("horizontal_concat() needs at least one image")

    if resize_to:
        images = [img.resize(resize_to) for img in images]

    # Measure AFTER the optional resize so the canvas matches what is pasted.
    total_width = sum(img.width for img in images)
    max_height = max(img.height for img in images)

    concat_image = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        concat_image.paste(img, (x_offset, 0))
        x_offset += img.width
    return concat_image

# Paths
base_path = "/home/ubuntu/ssd_high_quality/test/characters"
output_folder_name = "ipadaptor"
resize_dim = (512, 512)

# Expected layout: <base_path>/<character>/motions/<motion>/{ground_truth,poses}/
# Loop through each character folder
for character_name in os.listdir(base_path):
    character_path = os.path.join(base_path, character_name)
    if not os.path.isdir(character_path):
        continue

    # Loop through each motion folder. A character without a "motions" folder is
    # reported and skipped so one odd directory does not abort the whole batch.
    motions_path = os.path.join(character_path, "motions")
    if not os.path.isdir(motions_path):
        print(f"Motions folder not found: {motions_path}")
        continue
    for motion_name in os.listdir(motions_path):
        motion_path = os.path.join(motions_path, motion_name)
        if not os.path.isdir(motion_path):
            continue

        # Paths for ground_truth, poses, and output folders
        ground_truth_path = os.path.join(motion_path, "ground_truth")
        poses_path = os.path.join(motion_path, "poses")
        output_path = os.path.join(motion_path, output_folder_name)

        # Load reference image (first frame in ground_truth)
        ref_image_path = os.path.join(ground_truth_path, "frame_1.png")
        if not os.path.exists(ref_image_path):
            print(f"Reference image not found: {ref_image_path}")
            continue
        if not os.path.isdir(poses_path):
            print(f"Poses folder not found: {poses_path}")
            continue
        ref_image = Image.open(ref_image_path).resize(resize_dim)

        # Create the output folder only once the inputs are known to exist, so
        # skipped motions do not leave empty folders behind.
        os.makedirs(output_path, exist_ok=True)

        # Loop through pose images in the poses folder
        for pose_image_name in sorted(os.listdir(poses_path)):
            if not pose_image_name.endswith(".png"):
                continue

            pose_image_path = os.path.join(poses_path, pose_image_name)
            pose_image = Image.open(pose_image_path).resize(resize_dim)

            # Generate one frame for this pose with the shared ip_model
            generated_image = ip_model.generate(
                pil_image=ref_image,
                image=pose_image,
                width=resize_dim[0],
                height=resize_dim[1],
                num_samples=1,
                num_inference_steps=100,
                seed=42
            )[0]

            # Save the output image; the name mirrors the pose file name
            # ("humanpose" -> "generated"), so listing order does not matter here.
            output_image_name = pose_image_name.replace("humanpose", "generated")
            output_image_path = os.path.join(output_path, output_image_name)
            generated_image.save(output_image_path)

        print(f"Processed motion: {motion_name} for character: {character_name}")


In [None]:
# loop through all, first frame as reference, generate all frames at once
import os
from PIL import Image

# Helper function to concatenate images horizontally
def horizontal_concat(images, resize_to=None):
    """Paste images side by side, left to right, onto one new 'RGB' canvas.

    Args:
        images: Non-empty iterable of images.
        resize_to: Optional ``(width, height)``; when given, every image is
            resized to it before the strip is assembled.

    Returns:
        A new image whose width is the sum of the (resized) widths and whose
        height is the largest (resized) height. Each image is pasted at y=0.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("horizontal_concat() needs at least one image")

    if resize_to:
        images = [img.resize(resize_to) for img in images]

    # Measure AFTER the optional resize so the canvas matches what is pasted.
    total_width = sum(img.width for img in images)
    max_height = max(img.height for img in images)

    concat_image = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        concat_image.paste(img, (x_offset, 0))
        x_offset += img.width
    return concat_image

# Helper function to split a concatenated image back into frames
def split_horizontal_image(image, num_frames, frame_width=512, frame_height=512):
    """Cut ``num_frames`` equal-width crops out of ``image``, left to right.

    Crop ``i`` covers x in ``[i * frame_width, i * frame_width + frame_width)``
    and y in ``[0, frame_height)``. Returns the crops as a list.
    """
    left_edges = (index * frame_width for index in range(num_frames))
    return [
        image.crop((left, 0, left + frame_width, frame_height))
        for left in left_edges
    ]

import re


def _natural_key(name):
    """Sort key that compares digit runs as numbers (humanpose_2 before humanpose_10)."""
    return [int(part) if part.isdigit() else part.lower() for part in re.split(r'(\d+)', name)]


# Paths
base_path = "/home/ubuntu/ssd_high_quality/test/characters"
output_folder_name = "ipadaptor_combinedgenerate"
resize_dim = (512, 512)

# Expected layout: <base_path>/<character>/motions/<motion>/{ground_truth,poses}/
# Loop through each character folder
for character_name in os.listdir(base_path):
    character_path = os.path.join(base_path, character_name)
    if not os.path.isdir(character_path):
        continue

    # Loop through each motion folder. A character without a "motions" folder is
    # reported and skipped so one odd directory does not abort the whole batch.
    motions_path = os.path.join(character_path, "motions")
    if not os.path.isdir(motions_path):
        print(f"Motions folder not found: {motions_path}")
        continue
    for motion_name in os.listdir(motions_path):
        motion_path = os.path.join(motions_path, motion_name)
        if not os.path.isdir(motion_path):
            continue

        # Paths for ground_truth, poses, and output folders
        ground_truth_path = os.path.join(motion_path, "ground_truth")
        poses_path = os.path.join(motion_path, "poses")
        output_path = os.path.join(motion_path, output_folder_name)

        # Load reference image (first frame in ground_truth)
        ref_image_path = os.path.join(ground_truth_path, "frame_1.png")
        if not os.path.exists(ref_image_path):
            print(f"Reference image not found: {ref_image_path}")
            continue
        ref_image = Image.open(ref_image_path).resize(resize_dim)

        # Load all pose images in NATURAL order. Output frames are named by their
        # position in the strip, and plain sorted() would place
        # "humanpose_10.png" before "humanpose_2.png", scrambling that mapping.
        pose_image_names = []
        if os.path.isdir(poses_path):
            pose_image_names = sorted(
                (f for f in os.listdir(poses_path) if f.endswith(".png")),
                key=_natural_key,
            )
        pose_images = []
        for pose_image_name in pose_image_names:
            pose_image_path = os.path.join(poses_path, pose_image_name)
            pose_images.append(Image.open(pose_image_path).resize(resize_dim))

        if not pose_images:
            print(f"No pose images found in: {poses_path}")
            continue

        # Create the output folder only once there is something to generate.
        os.makedirs(output_path, exist_ok=True)

        concat_pose_image = horizontal_concat(pose_images)

        # Generate the whole strip in a single call
        generated_image = ip_model.generate(
            pil_image=ref_image,
            image=concat_pose_image,
            width=concat_pose_image.width,  # Total width of the concatenated image
            height=concat_pose_image.height,  # Tallest pose image (all were resized to resize_dim)
            num_samples=1,
            num_inference_steps=100,
            seed=42
        )[0]

        # Split the generated image back into individual frames
        num_frames = len(pose_images)
        generated_frames = split_horizontal_image(
            generated_image,
            num_frames=num_frames,
            frame_width=resize_dim[0],
            frame_height=resize_dim[1]
        )

        # Save each frame to the output folder (1-based index = position in the strip)
        for i, frame in enumerate(generated_frames, start=1):
            output_image_path = os.path.join(output_path, f"generated_{i}.png")
            frame.save(output_image_path)

        print(f"Processed motion: {motion_name} for character: {character_name}")




In [None]:
import os
from PIL import Image

# Helper function to concatenate images horizontally
def horizontal_concat(images, resize_to=None):
    """Paste images side by side, left to right, onto one new 'RGB' canvas.

    Args:
        images: Non-empty iterable of images.
        resize_to: Optional ``(width, height)``; when given, every image is
            resized to it before the strip is assembled.

    Returns:
        A new image whose width is the sum of the (resized) widths and whose
        height is the largest (resized) height. Each image is pasted at y=0.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("horizontal_concat() needs at least one image")

    if resize_to:
        images = [img.resize(resize_to) for img in images]

    # Measure AFTER the optional resize so the canvas matches what is pasted.
    total_width = sum(img.width for img in images)
    max_height = max(img.height for img in images)

    concat_image = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        concat_image.paste(img, (x_offset, 0))
        x_offset += img.width
    return concat_image

# Helper function to split a concatenated image back into frames
def split_horizontal_image(image, num_frames, frame_width=512, frame_height=512):
    """Return ``num_frames`` consecutive crops of ``image``, taken left to right.

    Every crop starts at y=0, is ``frame_width`` wide and ends at
    ``frame_height``; crop ``i`` starts at x = ``i * frame_width``.
    """
    pieces = []
    position = 0
    while position < num_frames:
        x_start = position * frame_width
        box = (x_start, 0, x_start + frame_width, frame_height)
        pieces.append(image.crop(box))
        position += 1
    return pieces

import re


def _natural_key(name):
    """Sort key that compares digit runs as numbers (humanpose_2 before humanpose_10)."""
    return [int(part) if part.isdigit() else part.lower() for part in re.split(r'(\d+)', name)]


# Base paths
base_path = "/home/ubuntu/IP-Adapter/data/Test/in_sample/adventure_girl/motions/dead"
resize_dim = (512, 512)
output_folder_name = "predict"

# Paths for ground_truth, poses, and output
ground_truth_path = os.path.join(base_path, "ground_truth")
poses_path = os.path.join(base_path, "poses")
output_path = os.path.join(base_path, output_folder_name)

os.makedirs(output_path, exist_ok=True)

# Prefer <base_path>/reference.png; otherwise fall back to ground truth
reference_image_path = os.path.join(base_path, "reference.png")
if os.path.exists(reference_image_path):
    ref_image = Image.open(reference_image_path).resize(resize_dim)
else:
    # Use the first frame in ground_truth as the reference image. Natural order
    # makes "first" mean the lowest frame number rather than the lexicographically
    # smallest name (where "frame_10.png" would sort before "frame_2.png").
    ground_truth_images = sorted(
        [f for f in os.listdir(ground_truth_path) if f.endswith(".png")],
        key=_natural_key,
    )
    if not ground_truth_images:
        raise ValueError(f"No images found in ground_truth folder: {ground_truth_path}")
    reference_image_path = os.path.join(ground_truth_path, ground_truth_images[0])
    ref_image = Image.open(reference_image_path).resize(resize_dim)

# Load all pose images in natural order. Output frames are named by their
# position in the strip, so the order must follow the numeric pose index.
pose_images = []
for pose_image_name in sorted(os.listdir(poses_path), key=_natural_key):
    if not pose_image_name.endswith(".png"):
        continue
    pose_image_path = os.path.join(poses_path, pose_image_name)
    pose_images.append(Image.open(pose_image_path).resize(resize_dim))

if not pose_images:
    raise ValueError(f"No pose images found in folder: {poses_path}")

# Concatenate pose images horizontally
concat_pose_image = horizontal_concat(pose_images)

# Use the model to generate the combined output in a single call
generated_image = ip_model.generate(
    pil_image=ref_image,
    image=concat_pose_image,
    width=concat_pose_image.width,  # Total width of the concatenated image
    height=concat_pose_image.height,  # Tallest pose image (all were resized to resize_dim)
    num_samples=1,
    num_inference_steps=100,
    seed=42
)[0]

# Split the generated image back into individual frames
num_frames = len(pose_images)
generated_frames = split_horizontal_image(
    generated_image,
    num_frames=num_frames,
    frame_width=resize_dim[0],
    frame_height=resize_dim[1]
)

# Save each frame to the output folder (1-based index = position in the strip)
for i, frame in enumerate(generated_frames, start=1):
    output_image_path = os.path.join(output_path, f"predict_{i}.png")
    frame.save(output_image_path)

print(f"Processed motion: dead")
print(f"Generated images saved in: {output_path}")


In [None]:
import os
from PIL import Image
import re

# Helper function for natural sort
def natural_key(string):
    """Build a sort key in which digit runs compare as integers.

    The string is split around runs of digits; digit chunks become ``int`` and
    all other chunks are lower-cased, so "frame_2" sorts before "frame_10".
    """
    key = []
    for chunk in re.split(r'(\d+)', string):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

# Helper function to concatenate images horizontally
def horizontal_concat(images, resize_to=None):
    """Paste images side by side, left to right, onto one new 'RGB' canvas.

    Args:
        images: Non-empty iterable of images.
        resize_to: Optional ``(width, height)``; when given, every image is
            resized to it before the strip is assembled.

    Returns:
        A new image whose width is the sum of the (resized) widths and whose
        height is the largest (resized) height. Each image is pasted at y=0.

    Raises:
        ValueError: If ``images`` is empty.
    """
    images = list(images)
    if not images:
        raise ValueError("horizontal_concat() needs at least one image")

    if resize_to:
        images = [img.resize(resize_to) for img in images]

    # Measure AFTER the optional resize so the canvas matches what is pasted.
    total_width = sum(img.width for img in images)
    max_height = max(img.height for img in images)

    concat_image = Image.new('RGB', (total_width, max_height))
    x_offset = 0
    for img in images:
        concat_image.paste(img, (x_offset, 0))
        x_offset += img.width
    return concat_image

# Helper function to split a concatenated image back into frames
def split_horizontal_image(image, num_frames, frame_width=512, frame_height=512):
    """Slice ``image`` into ``num_frames`` side-by-side crops and return them in order.

    Crop ``i`` uses the box ``(i * frame_width, 0, i * frame_width + frame_width,
    frame_height)``.
    """
    def _box(index):
        # Bounding box of the index-th frame, measured from the left edge.
        start = index * frame_width
        return (start, 0, start + frame_width, frame_height)

    return [image.crop(_box(index)) for index in range(num_frames)]

# Base paths
base_paths = [
    "/home/ubuntu/IP-Adapter/data/Test/in_sample",
    "/home/ubuntu/IP-Adapter/data/Test/out_sample"
]
resize_dim = (512, 512)
output_folder_name = "predict"

# Expected layout: <base_path>/<character>/motions/<motion>/{ground_truth,poses}/
# Loop through each base path (in_sample and out_sample)
for base_path in base_paths:
    for character_name in os.listdir(base_path):
        character_path = os.path.join(base_path, character_name)
        if not os.path.isdir(character_path):
            continue

        # Loop through each motion folder. A character without a "motions" folder
        # is reported and skipped so one odd directory does not abort the batch.
        motions_path = os.path.join(character_path, "motions")
        if not os.path.isdir(motions_path):
            print(f"Motions folder not found: {motions_path}")
            continue
        for motion_name in os.listdir(motions_path):
            motion_path = os.path.join(motions_path, motion_name)
            if not os.path.isdir(motion_path):
                continue

            # Paths for ground_truth, poses, and output
            ground_truth_path = os.path.join(motion_path, "ground_truth")
            poses_path = os.path.join(motion_path, "poses")
            output_path = os.path.join(motion_path, output_folder_name)

            # List the pose files once, in natural order. This list drives both
            # the "already processed" check and the generation itself, so the
            # two always agree on how many frames are expected.
            pose_names = []
            if os.path.isdir(poses_path):
                pose_names = sorted(
                    [f for f in os.listdir(poses_path) if f.endswith(".png")],
                    key=natural_key  # Use natural sorting here
                )
            if not pose_names:
                print(f"No pose images found in: {poses_path}")
                continue

            # Decide whether this motion can be skipped. The folder is created
            # only AFTER the existence test; creating it first made the
            # "Creating predict folder" branch unreachable.
            if not os.path.exists(output_path):
                os.makedirs(output_path, exist_ok=True)
                print(f"Creating predict folder for motion: {motion_name} for character: {character_name}")
            else:
                # Skip when every expected prediction frame is already on disk.
                # Only .png poses count, because only those are generated.
                expected_frames = len(pose_names)
                existing_predict_frames = [
                    f for f in os.listdir(output_path) if f.startswith("predict_") and f.endswith(".png")
                ]
                if len(existing_predict_frames) == expected_frames:
                    print(f"Skipping motion: {motion_name} for character: {character_name} (already processed)")
                    continue

            # Prefer <motion>/reference.png; otherwise fall back to ground truth
            reference_image_path = os.path.join(motion_path, "reference.png")
            if not os.path.exists(reference_image_path):
                # Use the first frame in ground_truth as the reference image
                ground_truth_images = []
                if os.path.isdir(ground_truth_path):
                    ground_truth_images = sorted(
                        [f for f in os.listdir(ground_truth_path) if f.endswith(".png")],
                        key=natural_key  # Use natural sorting here
                    )
                if not ground_truth_images:
                    print(f"No ground_truth images found in: {ground_truth_path}")
                    continue
                reference_image_path = os.path.join(ground_truth_path, ground_truth_images[0])
            ref_image = Image.open(reference_image_path).resize(resize_dim)

            # Load all pose images
            pose_images = [Image.open(os.path.join(poses_path, img)).resize(resize_dim) for img in pose_names]

            # Concatenate pose images horizontally
            concat_pose_image = horizontal_concat(pose_images)

            # Use the model to generate the combined output in a single call
            generated_image = ip_model.generate(
                pil_image=ref_image,
                image=concat_pose_image,
                width=concat_pose_image.width,  # Total width of the concatenated image
                height=concat_pose_image.height,  # Tallest pose image (all were resized to resize_dim)
                num_samples=1,
                num_inference_steps=100,
                seed=42
            )[0]

            # Split the generated image back into individual frames
            num_frames = len(pose_images)
            generated_frames = split_horizontal_image(
                generated_image,
                num_frames=num_frames,
                frame_width=resize_dim[0],
                frame_height=resize_dim[1]
            )

            # Save each frame to the output folder (1-based index = position in the strip)
            for i, frame in enumerate(generated_frames, start=1):
                output_image_path = os.path.join(output_path, f"predict_{i}.png")
                frame.save(output_image_path)

            print(f"Processed motion: {motion_name} for character: {character_name}")
