# Experiment Sample Generation
This file generates the samples used in the experiments for the report. We define `noun_list` and `pose_list` to build the sample dataset.
Then we draw 50 samples (10 nouns × 5 poses) from each baseline. The results are saved under `./sampling/<baseline>/`.
Prompt-to-Prompt sampling is handled separately in `sampling_p2p.ipynb` because it requires a different environment setup.


# Setting

In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np

from tqdm import tqdm
from einops import rearrange, repeat
from omegaconf import OmegaConf

from diffusers import DDIMScheduler, ControlNetModel, StableDiffusionPipeline, StableDiffusionControlNetPipeline

from MasaCtrl.masactrl.diffuser_utils import MasaCtrlPipeline, MasaCtrlControlNetPipeline
from MasaCtrl.masactrl.masactrl_utils import AttentionBase
from MasaCtrl.masactrl.masactrl_utils import regiter_attention_editor_diffusers
from MasaCtrl.masactrl.masactrl import MutualSelfAttentionControl

from torchvision.utils import save_image
from torchvision.io import read_image
from pytorch_lightning import seed_everything

from torchvision.transforms import ToPILImage

import glob

# Pin the process to GPU 0 when CUDA is present. Skip the call on CPU-only
# machines so the CPU fallback used when `device` is selected still works
# instead of failing here.
if torch.cuda.is_available():
    torch.cuda.set_device(0)  # set the GPU device

In [None]:
import sys
import random
import datetime as dt

import numpy as np
import torch

# Seed every random number generator used in this notebook (PyTorch CPU,
# PyTorch CUDA, NumPy and Python's `random`) with the same value.
seed = 42
for _seed_rng in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
    _seed_rng(seed)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True


In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single initial latent reused by the sampling cells below.
start_code = torch.randn((1, 4, 64, 64), device=device)
# Batch-of-two view of the same latent (no copy) for the cells that pass a
# two-prompt list to the pipeline.
start_code_masa = start_code.expand(2, *start_code.shape[1:])

In [None]:
# Subjects substituted into the prompts.
noun_list = [
    "boy",
    "girl",
    "man",
    "woman",
    "child",
    "farmer",
    "soldier",
    "firefighter",
    "pirate",
    "basketball player",
]

# Target poses. Each name is inserted into the edit prompt and, in the
# ControlNet-based cells, also selects the `<pose>.png` condition image.
pose_list = [
    "dancing",
    "flexing",
    "jumping",
    "laying",
    "tposing",
]

In [None]:
# Create the output tree `sampling/<baseline>/{source,edit}` for every baseline.
# `os.makedirs(..., exist_ok=True)` also creates the parent `sampling/` folder
# when it is missing and fills in a `source`/`edit` folder that is absent from
# an already existing baseline folder, so the cell can be re-run safely.
baselines = ["controlnet", "fixed_seed", "masactrl", "masactrl_controlnet", 'test']
for baseline in baselines:
    for subdir in ("source", "edit"):
        os.makedirs(os.path.join("sampling", baseline, subdir), exist_ok=True)

# MasaCtrl+ControlNet

In [None]:
pose_path = "sampling/pose_selected"
source_path = "sampling/masactrl_controlnet/source"
edit_path = "sampling/masactrl_controlnet/edit"


def _load_masactrl_condition(image_path, target_device):
    """Build the two-image conditioning batch for one pose image.

    The image at `image_path` is read, scaled to [0, 1], reduced to its first
    three channels when it has four (RGBA), resized to 512x512 and moved to
    `target_device`.

    Returns:
        A tensor with two images stacked on dim 0: an all-zero image followed
        by the pose image, i.e. the same order as the [source, edit] prompt
        list passed to the pipeline.
    """
    pose_image = read_image(image_path).float() / 255.0
    # rgba to rgb conversion
    if pose_image.shape[0] == 4:
        pose_image = pose_image[:3, :, :]
    # resize to 512x512
    pose_image = F.interpolate(pose_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
    pose_image = pose_image.to(target_device)
    return torch.cat([torch.zeros_like(pose_image), pose_image], dim=0)


# model initialization
# Built once and reused for every noun: nothing below depends on the noun, and
# the attention editor is re-registered before every pipeline call anyway.
# NOTE(review): `cross_attention_kwargs` is given to `from_pretrained` rather
# than to the pipeline call; confirm that it actually takes effect.
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
model = MasaCtrlControlNetPipeline.from_pretrained(model_path, controlnet=controlnet, scheduler=scheduler, cross_attention_kwargs={"scale": 0.5}).to(device)

# The condition images do not depend on the noun either, so load each one once.
standing_condition = _load_masactrl_condition(f"{pose_path}/standing.png", device)
pose_conditions = {
    pose: _load_masactrl_condition(f"{pose_path}/{pose}.png", device)
    for pose in pose_list
}

for noun in noun_list:
    # NOTE(review): the source prompt says "a {noun}" while the second prompt
    # omits the article (and the other baselines use no article); confirm that
    # this difference is intended.
    source_prompt = f"highly detailed, a {noun}, standing, facing camera, full body portrait, full-length portrait"

    # generate source image with the plain (non-editing) attention editor
    prompts = [source_prompt, f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"]
    editor = AttentionBase()
    regiter_attention_editor_diffusers(model, editor)
    image_ori = model(prompts, controlnet_conditioning=standing_condition, latents=start_code_masa, guidance_scale=7.5)

    # save the original image once per pose so every edit file has a source
    # file with the same name
    source_image = ToPILImage()(image_ori[0].cpu())
    for pose in pose_list:
        source_image.save(f"{source_path}/{noun}_{pose}.png")

    for pose in pose_list:
        prompts = [source_prompt, f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"]

        # generate edited image with mutual self-attention control
        # (arguments are presumably start step and start layer; confirm
        # against the MasaCtrl implementation)
        editor = MutualSelfAttentionControl(4, 10)
        regiter_attention_editor_diffusers(model, editor)

        # inference the synthesized image; keep only the last image of the batch
        image_masactrl = model(prompts, controlnet_conditioning=pose_conditions[pose], latents=start_code_masa, guidance_scale=7.5)[-1:]
        # Save the edited image
        save_image(image_masactrl, f"{edit_path}/{noun}_{pose}.png")  # with attention hijack



# MasaCtrl

In [None]:
# Paths are relative to the working directory, matching the `sampling/<baseline>`
# tree created earlier in this notebook, instead of one machine's absolute path.
pose_path = "sampling/pose_selected"  # not read in this cell (no pose images are used)
source_path = "sampling/masactrl/source"
edit_path = "sampling/masactrl/edit"

# model initialization
# Built once and reused for every noun: nothing below depends on the noun, and
# the attention editor is re-registered before every pipeline call anyway.
# NOTE(review): `cross_attention_kwargs` is given to `from_pretrained` rather
# than to the pipeline call; confirm that it actually takes effect.
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler, cross_attention_kwargs={"scale": 0.5}).to(device)

for noun in noun_list:
    source_prompt = f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"

    # generate source image: both prompts are the source prompt and the plain
    # (non-editing) attention editor is registered
    prompts = [source_prompt, source_prompt]
    editor = AttentionBase()
    regiter_attention_editor_diffusers(model, editor)
    image_ori = model(prompts, latents=start_code_masa, guidance_scale=7.5)

    # save the original image once per pose so every edit file has a source
    # file with the same name
    source_image = ToPILImage()(image_ori[0].cpu())
    for pose in pose_list:
        source_image.save(f"{source_path}/{noun}_{pose}.png")

    for pose in pose_list:
        prompts = [source_prompt, f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"]

        # generate edited image with mutual self-attention control
        # (arguments are presumably start step and start layer; confirm
        # against the MasaCtrl implementation)
        editor = MutualSelfAttentionControl(4, 10)
        regiter_attention_editor_diffusers(model, editor)

        # inference the synthesized image; keep only the last image of the batch
        image_masactrl = model(prompts, latents=start_code_masa, guidance_scale=7.5)[-1:]
        # Save the edited image
        save_image(image_masactrl, f"{edit_path}/{noun}_{pose}.png")  # with attention hijack


# Fixed Seed

In [None]:
# Paths are relative to the working directory, matching the `sampling/<baseline>`
# tree created earlier in this notebook, instead of one machine's absolute path.
pose_path = "sampling/pose_selected"  # not read in this cell (no pose images are used)
source_path = "sampling/fixed_seed/source"
edit_path = "sampling/fixed_seed/edit"

# model initialization
# NOTE(review): `cross_attention_kwargs` is given to `from_pretrained` rather
# than to the pipeline call; confirm that it actually takes effect.
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
model = StableDiffusionPipeline.from_pretrained(model_path, scheduler=scheduler, safety_checker=None, cross_attention_kwargs={"scale": 0.5}).to(device)

for noun in noun_list:
    # source prompt: the noun in the standing pose
    prompts = f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"

    # Index 0 of the pipeline output is used as the list of generated images.
    image_ori = model(prompts, latents=start_code, guidance_scale=7.5)[0]

    # save the original image once per pose so every edit file has a source
    # file with the same name
    for pose in pose_list:
        image_ori[0].save(f"{source_path}/{noun}_{pose}.png")

    for pose in pose_list:
        # edit prompt: same noun and the same start latent, only the pose word changes
        prompts = f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"

        image_edit = model(prompts, latents=start_code, guidance_scale=7.5)[0]
        # Save the edited image
        image_edit[0].save(f"{edit_path}/{noun}_{pose}.png")


# ControlNet

In [None]:
# Paths are relative to the working directory, matching the `sampling/<baseline>`
# tree created earlier in this notebook, instead of one machine's absolute path.
pose_path = "sampling/pose_selected"
source_path = "sampling/controlnet/source"
edit_path = "sampling/controlnet/edit"


def _load_controlnet_condition(image_path, target_device):
    """Load one pose image as a ControlNet conditioning tensor.

    The image at `image_path` is read, scaled to [0, 1], reduced to its first
    three channels when it has four (RGBA), resized to 512x512 with a leading
    batch dimension of 1, and moved to `target_device`.
    """
    pose_image = read_image(image_path).float() / 255.0
    # rgba to rgb conversion
    if pose_image.shape[0] == 4:
        pose_image = pose_image[:3, :, :]
    # resize to 512x512
    pose_image = F.interpolate(pose_image.unsqueeze(0), size=(512, 512), mode='bilinear', align_corners=False)
    return pose_image.to(target_device)


# model initialization
# NOTE(review): `cross_attention_kwargs` is given to `from_pretrained` rather
# than to the pipeline call; confirm that it actually takes effect.
model_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose")
model = StableDiffusionControlNetPipeline.from_pretrained(model_path, controlnet=controlnet, scheduler=scheduler, safety_checker=None, cross_attention_kwargs={"scale": 0.5}).to(device)

# The condition images do not depend on the noun, so load each one once.
standing_condition = _load_controlnet_condition(f"{pose_path}/standing.png", device)
pose_conditions = {
    pose: _load_controlnet_condition(f"{pose_path}/{pose}.png", device)
    for pose in pose_list
}

for noun in noun_list:
    # source prompt: the noun in the standing pose, conditioned on the standing image
    prompts = f"highly detailed, {noun}, standing, facing camera, full body portrait, full-length portrait"

    # Index 0 of the pipeline output is used as the list of generated images.
    image_ori = model(prompts, image=standing_condition, latents=start_code, guidance_scale=7.5)[0]

    # save the original image once per pose so every edit file has a source
    # file with the same name
    for pose in pose_list:
        image_ori[0].save(f"{source_path}/{noun}_{pose}.png")

    for pose in pose_list:
        # edit prompt: same noun and start latent, with the pose word and pose image swapped in
        prompts = f"highly detailed, {noun}, {pose}, facing camera, full body portrait, full-length portrait"

        image_edit = model(prompts, image=pose_conditions[pose], latents=start_code, guidance_scale=7.5)[0]
        # Save the edited image
        image_edit[0].save(f"{edit_path}/{noun}_{pose}.png")
