In [None]:
import torch
import numpy as np
import cv2
import diffusers

from PIL import Image

from diffusers import (
    StableDiffusionPipeline, StableDiffusionImg2ImgPipeline,
    StableDiffusionInpaintPipeline, StableDiffusionControlNetInpaintPipeline,
    ControlNetModel
)
from diffusers.utils import load_image, make_image_grid


# Probe for the optional xformers package. The flag is consulted later to
# decide whether memory-efficient attention can be enabled on each pipeline.
try:
    import xformers
except ImportError:
    xformers_loaded = False
else:
    xformers_loaded = True

print(f'{xformers_loaded=}')

In [None]:
# First CUDA device; every pipeline in this notebook is moved onto it.
device = torch.device('cuda', 0)

### Text2Img generation pipeline

In [None]:
# Load the text-to-image checkpoint in half precision, then move it to the
# selected device.
text2img_checkpoint = "SG161222/Realistic_Vision_V5.1_noVAE"
pipe_gen = StableDiffusionPipeline.from_pretrained(
    text2img_checkpoint, torch_dtype=torch.float16, use_safetensors=True
)
pipe_gen = pipe_gen.to(device)

# Memory-efficient attention is only switched on when xformers was importable.
if xformers_loaded:
    pipe_gen.enable_xformers_memory_efficient_attention()

In [None]:
# Generator seeded with 42; manual_seed returns the generator itself, so the
# construction and seeding can be chained.
generator = torch.Generator().manual_seed(42)

# Sampling settings for the text-to-image run (four 512x512 images).
text2img_kwargs = dict(
    prompt='woman in black dress, high quality, detailed, 4k',
    negative_prompt='monochrome, lowres, bad anatomy, worst quality, low quality',
    height=512,
    width=512,
    num_inference_steps=20,
    guidance_scale=7.5,
    num_images_per_prompt=4,
)

output_images = pipe_gen(generator=generator, **text2img_kwargs)

In [None]:
# Show every generated sample in one row. The column count is taken from the
# number of returned images (make_image_grid requires rows * cols to equal
# len(images)), so the cell keeps working if num_images_per_prompt changes.
make_image_grid(
    output_images.images,
    rows=1,
    cols=len(output_images.images),
    resize=384,
)

### SDEdit - img2img translation

In [None]:
# Build the img2img pipeline from the already-loaded text-to-image components
# instead of loading a second copy of the weights.
shared_component_names = (
    "scheduler",
    "text_encoder",
    "tokenizer",
    "unet",
    "vae",
    "safety_checker",
    "feature_extractor",
)
shared_components = {
    name: getattr(pipe_gen, name) for name in shared_component_names
}
pipe_img2img = StableDiffusionImg2ImgPipeline(**shared_components).to(device)

# Memory-efficient attention is only switched on when xformers was importable.
if xformers_loaded:
    pipe_img2img.enable_xformers_memory_efficient_attention()

In [None]:
# SDEdit strength sweep: translate the second generated image towards a new
# prompt at increasing strength values and collect one result per strength.
strengths = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
output_images_img2img = []

for s in strengths:
    # Re-seed before every call. A generator that is seeded only once keeps
    # advancing between calls, so each strength would otherwise be sampled
    # with different noise and the sweep would not isolate the effect of
    # `strength`.
    generator.manual_seed(1)

    out = pipe_img2img(
        image=output_images.images[1],
        strength=s,
        prompt='woman in green dress, high quality, detailed, 4k',
        negative_prompt='monochrome, lowres, bad anatomy, worst quality, low quality',
        num_inference_steps=20, guidance_scale=7.5,
        num_images_per_prompt=1,
        generator=generator
    )
    # One image is requested per call, so keep the single result.
    output_images_img2img.append(out.images[0])

In [None]:
# Lay out the strength sweep. make_image_grid requires rows * cols to equal
# len(images), so the layout is derived from the number of results: two rows
# when the count is even (2 x 4 for the eight default strengths), otherwise a
# single row.
n_sweep_images = len(output_images_img2img)
n_sweep_rows = 2 if n_sweep_images % 2 == 0 else 1
make_image_grid(
    output_images_img2img,
    rows=n_sweep_rows,
    cols=n_sweep_images // n_sweep_rows,
    resize=384,
)

### Inpainting

In [None]:
# Load the dedicated inpainting checkpoint in half precision, then move it to
# the selected device.
inpainting_checkpoint = "Uminosachi/realisticVisionV51_v51VAE-inpainting"
pipe_inpainting = StableDiffusionInpaintPipeline.from_pretrained(
    inpainting_checkpoint, torch_dtype=torch.float16, use_safetensors=True
)
pipe_inpainting = pipe_inpainting.to(device)

# Memory-efficient attention is only switched on when xformers was importable.
if xformers_loaded:
    pipe_inpainting.enable_xformers_memory_efficient_attention()

In [None]:
image = Image.open('./data/red_dress_image.jpg')

# extract dress mask: pixels whose value in the mask file equals 6 become 255,
# everything else 0
# NOTE(review): presumably 6 is the "dress" class id of the segmentation map — confirm
label_map = np.array(Image.open('./data/red_dress_mask.png'))
dress_mask = np.where(label_map == 6, 255, 0).astype(np.uint8)

# optional dilation with a 5x5 kernel to grow the masked region slightly
dress_mask = cv2.dilate(dress_mask, kernel=np.ones((5, 5)))

mask = Image.fromarray(dress_mask)

make_image_grid([image, mask], rows=1, cols=2, resize=384)

In [None]:
# Fixed seed for the plain inpainting run.
generator.manual_seed(2)

# Repaint the masked region from scratch (strength 1.0) with the new prompt.
inpaint_kwargs = dict(
    image=image,
    mask_image=mask,
    strength=1.,
    prompt='woman in green dress, high quality, detailed, 4k',
    negative_prompt='monochrome, lowres, bad anatomy, worst quality, low quality',
    num_inference_steps=20,
    guidance_scale=7.5,
    num_images_per_prompt=4,
)

out = pipe_inpainting(generator=generator, **inpaint_kwargs)

In [None]:
# Show all inpainting results in one row. The column count follows the number
# of returned images (make_image_grid requires rows * cols == len(images)), so
# the cell stays valid if num_images_per_prompt changes.
make_image_grid(out.images, rows=1, cols=len(out.images), resize=384)

### Inpainting with ControlNet for edges

In [None]:
# ControlNet weights for Canny-edge conditioning, loaded in half precision.
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True
)

# Reuse the components of the inpainting pipeline and add the ControlNet on
# top, so no weights are loaded twice.
inpaint_component_names = (
    "scheduler",
    "text_encoder",
    "tokenizer",
    "unet",
    "vae",
    "safety_checker",
    "feature_extractor",
)
inpaint_components = {
    name: getattr(pipe_inpainting, name) for name in inpaint_component_names
}
pipe_inpainting_controlnet = StableDiffusionControlNetInpaintPipeline(
    controlnet=controlnet, **inpaint_components
).to(device)

# Memory-efficient attention is only switched on when xformers was importable.
if xformers_loaded:
    pipe_inpainting_controlnet.enable_xformers_memory_efficient_attention()

In [None]:
# Load the pattern image as an array and rotate it by 90 degrees.
pattern_array = np.rot90(np.array(Image.open('./data/pattern.jpeg')))

# Canny edge map (thresholds 100 / 200), replicated into three identical
# channels so it can be turned into a 3-channel image.
edge_map = cv2.Canny(pattern_array, 100, 200)
edge_map_3ch = np.stack([edge_map, edge_map, edge_map], axis=2)

# Bring both the pattern and its edge map to the size of the photo being edited.
target_size = image.size
control_image = Image.fromarray(pattern_array).resize(target_size)
control_image_edge = Image.fromarray(edge_map_3ch).resize(target_size)

make_image_grid([control_image, control_image_edge], rows=1, cols=2, resize=384)

In [None]:
# Fixed seed for the ControlNet-guided inpainting run.
generator.manual_seed(3)

# Same image and mask as the plain inpainting run, with the edge map of the
# pattern supplied as the ControlNet conditioning image.
controlnet_inpaint_kwargs = dict(
    image=image,
    mask_image=mask,
    control_image=control_image_edge,
    strength=1.,
    prompt='woman in dark dress, high quality, detailed, 4k',
    negative_prompt='monochrome, lowres, bad anatomy, worst quality, low quality',
    num_inference_steps=20,
    guidance_scale=7.5,
    num_images_per_prompt=4,
)

out = pipe_inpainting_controlnet(generator=generator, **controlnet_inpaint_kwargs)

In [None]:
# Show all ControlNet inpainting results in one row. The column count follows
# the number of returned images (make_image_grid requires
# rows * cols == len(images)), so the cell stays valid if
# num_images_per_prompt changes.
make_image_grid(out.images, rows=1, cols=len(out.images), resize=384)

### IP-adapter for Image prompting

In [None]:
# Attach the SD 1.5 IP-Adapter weights to the existing inpainting pipeline.
# NOTE(review): pipe_inpainting.unet is also the UNet handed to
# pipe_inpainting_controlnet, so this presumably changes that pipeline as well —
# confirm before re-running earlier cells after this one.
ip_adapter_files = dict(
    subfolder="models",
    weight_name="ip-adapter_sd15.safetensors",
)
pipe_inpainting.load_ip_adapter("h94/IP-Adapter", **ip_adapter_files)

# Weight given to the image prompt.
ip_adapter_scale = 0.7
pipe_inpainting.set_ip_adapter_scale(ip_adapter_scale)

In [None]:
# Fixed seed for the image-prompted inpainting run.
generator.manual_seed(4)

# Reference picture used as the image prompt.
ip_adapter_image = Image.open('./data/casual.jpeg')

# The text prompt no longer describes the garment; the reference image is
# passed through ip_adapter_image instead.
ip_inpaint_kwargs = dict(
    image=image,
    mask_image=mask,
    strength=1.,
    ip_adapter_image=ip_adapter_image,
    prompt='woman, high quality, detailed, 4k',
    negative_prompt='monochrome, lowres, bad anatomy, worst quality, low quality',
    num_inference_steps=20,
    guidance_scale=7.5,
    num_images_per_prompt=4,
)

out = pipe_inpainting(generator=generator, **ip_inpaint_kwargs)

In [None]:
# Show the reference image followed by every result in one row. The column
# count is taken from the assembled list (make_image_grid requires
# rows * cols == len(images)), so the cell stays valid if
# num_images_per_prompt changes.
ip_grid_images = [ip_adapter_image] + out.images
make_image_grid(ip_grid_images, rows=1, cols=len(ip_grid_images), resize=384)