# Experiment Sample Generation
This file is responsible for generating samples used in the experiments for the report. We set `noun_list` and `pose_list` to make sample dataset.
Then we sample 50 samples from each baselines. The results will be saved in `./samples`.
Prompt-to-Prompt sampling is handled separately in `sampling_p2p.ipynb` because it requires a different environment setup.


# Setting

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from tqdm import tqdm
from einops import rearrange, repeat
from omegaconf import OmegaConf

from diffusers import DDIMScheduler, ControlNetModel, StableDiffusionPipeline, StableDiffusionControlNetPipeline

from MasaCtrl.masactrl.diffuser_utils import MasaCtrlPipeline, MasaCtrlControlNetPipeline
from MasaCtrl.masactrl.masactrl_utils import AttentionBase
from MasaCtrl.masactrl.masactrl_utils import regiter_attention_editor_diffusers
from MasaCtrl.masactrl.masactrl import MutualSelfAttentionControl

from torchvision.utils import save_image
from torchvision.io import read_image
from pytorch_lightning import seed_everything

from torchvision.transforms import ToPILImage

import glob

torch.cuda.set_device(0)  # set the GPU device

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import sys
import random
import datetime as dt

import numpy as np
import torch

seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True


In [3]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

start_code = torch.randn([1, 4, 64, 64], device=device)
start_code_masa = start_code.expand(2, -1, -1, -1) # expand to batch size 2

In [4]:
noun_list = ["boy", "girl", "man", "woman", "child", 
 "farmer", "soldier", "firefighter", "pirate", "basketball player"]
pose_list = ["dancing", "flexing", 'jumping', 'laying', 'tposing']

In [5]:
baselines = ["controlnet", "fixed_seed", "masactrl", "masactrl_controlnet", 'test']
for baseline in baselines:
    if not os.path.exists(f"sampling/{baseline}"):
        os.mkdir(f"sampling/{baseline}")
        os.mkdir(f"sampling/{baseline}/source")
        os.mkdir(f"sampling/{baseline}/edit")

# MasaCtrl+ControlNet

In [6]:
pose_path = "sampling/pose_selected"
source_path = "sampling/masactrl_controlnet/source"
edit_path = "sampling/masactrl_controlnet/edit"

for noun in noun_list:
    # model initialization
    model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
    controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
    model = MasaCtrlControlNetPipeline.from_pretrained(model_path, controlnet=controlnet, scheduler=scheduler, cross_attention_kwargs={"scale": 0.5}).to(device)

    # prompt, condition image
    prompts = [f"highly detailed, a {noun}, standing, facing camera, full body portrait, full-length portrait", f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"]

    # print(f"Source prompt: {prompts[0]}")
    # print(f"Edit prompt: {prompts[1]}")

    condition_image = f"{pose_path}/standing.png"
    # load the condition image
    condition_image = read_image(condition_image).float() / 255.0
    # rgba to rgb conversion
    if condition_image.shape[0] == 4:
        condition_image = condition_image[:3, :, :]
    # resize to 512x512
    condition_image = F.interpolate(condition_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
    condition_image = condition_image.to(device)
    zero_condition = torch.zeros_like(condition_image)
    condition = torch.cat([zero_condition, condition_image], dim=0)  # concatenate the condition image and zero condition

    # generate source image
    editor = AttentionBase()
    regiter_attention_editor_diffusers(model, editor)
    image_ori = model(prompts, controlnet_conditioning=condition, latents=start_code_masa, guidance_scale=7.5)
    
    # save the original image
    for pose in pose_list:
        ToPILImage()(image_ori[0].cpu()).save(f"{source_path}/{noun}_{pose}.png")
    
    for pose in pose_list:
        prompts = [f"highly detailed, a {noun}, standing, facing camera, full body portrait, full-length portrait", f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"]
        # print(f"Source prompt: {prompts[0]}")
        # print(f"Edit prompt: {prompts[1]}")

        condition_image = f"{pose_path}/{pose}.png"
        # load the condition image
        condition_image = read_image(condition_image).float() / 255.0
        # rgba to rgb conversion
        if condition_image.shape[0] == 4:
            condition_image = condition_image[:3, :, :]
        # resize to 512x512
        condition_image = F.interpolate(condition_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
        condition_image = condition_image.to(device)
        zero_condition = torch.zeros_like(condition_image)
        condition = torch.cat([zero_condition, condition_image], dim=0)  # concatenate the condition image and zero condition

        # generate edited image
        editor = MutualSelfAttentionControl(4, 10)
        regiter_attention_editor_diffusers(model, editor)

        # inference the synthesized image
        image_masactrl = model(prompts, controlnet_conditioning=condition, latents=start_code_masa, guidance_scale=7.5)[-1:]
        # Save the edited image
        save_image(image_masactrl, f"{edit_path}/{noun}_{pose}.png")  # with attention hijack



Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlControlNetPipeline and will be ignored.
Loading pipeline components...: 100%|██████████| 7/7 [00:00<00:00, 12.86it/s]
  latent_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
DDIM Sampler:   0%|          | 0/50 [00:19<?, ?it/s]


KeyboardInterrupt: 

# MasaCtrl

In [6]:
pose_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/pose_selected"
source_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/masactrl/source"
edit_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/masactrl/edit"
# control_image_files = sorted(glob.glob(f"{pose_path}/*.png"))

for noun in noun_list:
    # model initialization
    model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
    scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
    model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler, cross_attention_kwargs={"scale": 0.5}).to(device)

    # prompt, condition image
    prompts = [f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait", f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"]

    # print(f"Source prompt: {prompts[0]}")
    # print(f"Edit prompt: {prompts[1]}")

    # generate source image
    editor = AttentionBase()
    regiter_attention_editor_diffusers(model, editor)
    image_ori = model(prompts, latents=start_code_masa, guidance_scale=7.5)
    
    # save the original image
    for pose in pose_list:
        ToPILImage()(image_ori[0].cpu()).save(f"{source_path}/{noun}_{pose}.png")
    
    for pose in pose_list:
        prompts = [f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait", f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"]
        # print(f"Source prompt: {prompts[0]}")
        # print(f"Edit prompt: {prompts[1]}")

        # generate edited image
        editor = MutualSelfAttentionControl(4, 10)
        regiter_attention_editor_diffusers(model, editor)

        # inference the synthesized image
        image_masactrl = model(prompts, latents=start_code_masa, guidance_scale=7.5)[-1:]
        # Save the edited image
        save_image(image_masactrl, f"{edit_path}/{noun}_{pose}.png")  # with attention hijack


Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
  "_class_name": "DDIMScheduler",
  "_diffusers_version": "0.15.0",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "clip_sample": false,
  "clip_sample_range": 1.0,
  "dynamic_thresholding_ratio": 0.995,
  "num_train_timesteps": 1000,
  "prediction_type": "epsilon",
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "steps_offset": 0,
  "thresholding": false,
  "trained_betas": null
}
 is outdated. `steps_offset` should be set to 1 instead of 0. Please make sure to update the config accordingly as leaving `steps_offset` might led 

Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait


  deprecate(


input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:20<00:00,  2.47it/s]


Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:23<00:00,  2.14it/s]


Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:25<00:00,  1.98it/s]


Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:26<00:00,  1.90it/s]


Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.85it/s]


Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, boy, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.21it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, girl, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.19it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, man, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.19it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.84it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.79it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, woman, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.20it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, child, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.20it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, farmer, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.19it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.83it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, soldier, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.20it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.84it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.79it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, firefighter, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.79it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.20it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, pirate, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by MasaCtrlPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:22<00:00,  2.20it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, dancing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.84it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, flexing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, jumping, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, laying, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait
Edit prompt: highly detailed, basketball player, tposing, facing camera, full body portrait, full-length portrait
MasaCtrl at denoising steps:  [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]
MasaCtrl at U-Net layers:  [10, 11, 12, 13, 14, 15]
input text embeddings : torch.Size([2, 77, 768])
latents shape:  torch.Size([2, 4, 64, 64])


DDIM Sampler: 100%|██████████| 50/50 [00:27<00:00,  1.81it/s]


# Fixed Seed

In [7]:
pose_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/pose_selected"
source_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/fixed_seed/source"
edit_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/fixed_seed/edit"

# model initialization
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
model = StableDiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, safety_checker=None, cross_attention_kwargs={"scale": 0.5}).to(device)

for noun in noun_list:
    # prompt, condition image
    prompts = f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"

    # print(f"Source prompt: {prompts}")

    image_ori = model(prompts, latents=start_code, guidance_scale=7.5)[0]
    
    # save the original image
    for pose in pose_list:
        image_ori[0].save(f"{source_path}/{noun}_{pose}.png")
    
    for pose in pose_list:
        prompts = f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"
        # print(f"Edit prompt: {prompts}")

        # edited prompt
        image_edit = model(prompts, latents=start_code, guidance_scale=7.5)[0]
        # Save the edited image
        image_edit[0].save(f"{edit_path}/{noun}_{pose}.png")  # Fix applied here


Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by StableDiffusionPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling it only for use-cases that involve analyzing network behavior or auditing its results. For more informat

Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:09<00:00,  5.20it/s]


Edit prompt: highly detailed, boy, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:09<00:00,  5.27it/s]


Edit prompt: highly detailed, boy, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:10<00:00,  4.97it/s]


Edit prompt: highly detailed, boy, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:10<00:00,  4.79it/s]


Edit prompt: highly detailed, boy, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:10<00:00,  4.63it/s]


Edit prompt: highly detailed, boy, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:10<00:00,  4.55it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.48it/s]


Edit prompt: highly detailed, girl, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.50it/s]


Edit prompt: highly detailed, girl, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.44it/s]


Edit prompt: highly detailed, girl, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.37it/s]


Edit prompt: highly detailed, girl, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.34it/s]


Edit prompt: highly detailed, girl, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.29it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Edit prompt: highly detailed, man, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.40it/s]


Edit prompt: highly detailed, man, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.32it/s]


Edit prompt: highly detailed, man, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Edit prompt: highly detailed, man, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.37it/s]


Edit prompt: highly detailed, man, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.32it/s]


Edit prompt: highly detailed, woman, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.32it/s]


Edit prompt: highly detailed, woman, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Edit prompt: highly detailed, woman, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.29it/s]


Edit prompt: highly detailed, woman, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.36it/s]


Edit prompt: highly detailed, woman, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.34it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.34it/s]


Edit prompt: highly detailed, child, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.30it/s]


Edit prompt: highly detailed, child, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.37it/s]


Edit prompt: highly detailed, child, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


Edit prompt: highly detailed, child, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.33it/s]


Edit prompt: highly detailed, child, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Edit prompt: highly detailed, farmer, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.24it/s]


Edit prompt: highly detailed, farmer, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.32it/s]


Edit prompt: highly detailed, farmer, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.30it/s]


Edit prompt: highly detailed, farmer, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


Edit prompt: highly detailed, farmer, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.27it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.27it/s]


Edit prompt: highly detailed, soldier, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Edit prompt: highly detailed, soldier, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Edit prompt: highly detailed, soldier, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Edit prompt: highly detailed, soldier, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


Edit prompt: highly detailed, soldier, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.21it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


Edit prompt: highly detailed, firefighter, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.30it/s]


Edit prompt: highly detailed, firefighter, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


Edit prompt: highly detailed, firefighter, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.33it/s]


Edit prompt: highly detailed, firefighter, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


Edit prompt: highly detailed, firefighter, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.30it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.30it/s]


Edit prompt: highly detailed, pirate, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.31it/s]


Edit prompt: highly detailed, pirate, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.32it/s]


Edit prompt: highly detailed, pirate, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.27it/s]


Edit prompt: highly detailed, pirate, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.29it/s]


Edit prompt: highly detailed, pirate, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.29it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


Edit prompt: highly detailed, basketball player, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.34it/s]


Edit prompt: highly detailed, basketball player, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


Edit prompt: highly detailed, basketball player, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


Edit prompt: highly detailed, basketball player, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.25it/s]


Edit prompt: highly detailed, basketball player, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:11<00:00,  4.26it/s]


# ControlNet

In [8]:
pose_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/pose_selected"
source_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/controlnet/source"
edit_path = "/mnt/hdd/hbchoe/workspace/MasaCtrl/sampling/controlnet/edit"

# model initialization
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
model = StableDiffusionControlNetPipeline.from_pretrained(model_path, controlnet=controlnet, scheduler=scheduler, safety_checker=None, cross_attention_kwargs={"scale": 0.5}).to(device)

for noun in noun_list:
    # prompt, condition image
    prompts = f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"

    # print(f"Source prompt: {prompts}")

    # condition image
    condition_image = f"{pose_path}/standing.png"
    # load the condition image
    condition_image = read_image(condition_image).float() / 255.0
    # rgba to rgb conversion
    if condition_image.shape[0] == 4:
        condition_image = condition_image[:3, :, :]
    # resize to 512x512
    condition_image = F.interpolate(condition_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
    condition = condition_image.to(device)


    image_ori = model(prompts, image=condition, latents=start_code, guidance_scale=7.5)[0]
    
    # save the original image
    for pose in pose_list:
        image_ori[0].save(f"{source_path}/{noun}_{pose}.png")
    
    for pose in pose_list:
        prompts = f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"
        # print(f"Edit prompt: {prompts}")

        condition_image = f"{pose_path}/{pose}.png"
        # load the condition image
        condition_image = read_image(condition_image).float() / 255.0
        # rgba to rgb conversion
        if condition_image.shape[0] == 4:
            condition_image = condition_image[:3, :, :]
        # resize to 512x512
        condition_image = F.interpolate(condition_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
        condition = condition_image.to(device)

        # edited prompt
        image_edit = model(prompts, image=condition, latents=start_code, guidance_scale=7.5)[0]
        # Save the edited image
        image_edit[0].save(f"{edit_path}/{noun}_{pose}.png")


Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Keyword arguments {'cross_attention_kwargs': {'scale': 0.5}} are not expected by StableDiffusionControlNetPipeline and will be ignored.
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_controlnet.StableDiffusionControlNetPipeline'> by passing `safety_checker=None`. Ensure that you abide to the conditions of the Stable Diffusion licen

Source prompt: highly detailed, boy, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:14<00:00,  3.45it/s]


Edit prompt: highly detailed, boy, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:15<00:00,  3.17it/s]


Edit prompt: highly detailed, boy, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.09it/s]


Edit prompt: highly detailed, boy, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.09it/s]


Edit prompt: highly detailed, boy, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, boy, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.05it/s]


Source prompt: highly detailed, girl, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.04it/s]


Edit prompt: highly detailed, girl, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, girl, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.05it/s]


Edit prompt: highly detailed, girl, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, girl, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, girl, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Source prompt: highly detailed, man, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.00it/s]


Edit prompt: highly detailed, man, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.06it/s]


Edit prompt: highly detailed, man, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, man, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.00it/s]


Edit prompt: highly detailed, man, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.00it/s]


Edit prompt: highly detailed, man, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.00it/s]


Source prompt: highly detailed, woman, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, woman, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.05it/s]


Edit prompt: highly detailed, woman, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, woman, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, woman, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, woman, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Source prompt: highly detailed, child, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, child, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, child, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, child, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, child, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, child, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.98it/s]


Source prompt: highly detailed, farmer, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, farmer, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, farmer, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, farmer, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.98it/s]


Edit prompt: highly detailed, farmer, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.98it/s]


Edit prompt: highly detailed, farmer, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Source prompt: highly detailed, soldier, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, soldier, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, soldier, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, soldier, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, soldier, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.02it/s]


Edit prompt: highly detailed, soldier, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Source prompt: highly detailed, firefighter, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, firefighter, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.01it/s]


Edit prompt: highly detailed, firefighter, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, firefighter, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.04it/s]


Edit prompt: highly detailed, firefighter, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, firefighter, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.97it/s]


Source prompt: highly detailed, pirate, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.00it/s]


Edit prompt: highly detailed, pirate, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, pirate, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.99it/s]


Edit prompt: highly detailed, pirate, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, pirate, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, pirate, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Source prompt: highly detailed, basketball player, standing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.03it/s]


Edit prompt: highly detailed, basketball player, dancing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.05it/s]


Edit prompt: highly detailed, basketball player, flexing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.04it/s]


Edit prompt: highly detailed, basketball player, jumping, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  2.98it/s]


Edit prompt: highly detailed, basketball player, laying, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.06it/s]


Edit prompt: highly detailed, basketball player, tposing, facing camera, full body portrait, full-length portrait


100%|██████████| 50/50 [00:16<00:00,  3.04it/s]
