In [None]:
import os
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor, CLIPModel

from torchvision import transforms

from PIL import Image

# Inference only: switch autograd off globally so sampling builds no graph.
torch.set_grad_enabled(False)

# ---- Run configuration ----
name = "default"

with open(f"data/{name}.yaml", 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

version = config["version"]  # handed to every from_pretrained(...) call below
prev_prompt = config["prev_prompt"]
retain_prompt = config["retain_prompt"]

seed = config["seed"]  # indexed by prompt position (seed[idx]) in the loop below
prompt_count = config["prompt_count"]
sample_count = config["sample_count"]  # images generated per prompt

# prev_prompt entries come first, then retain_prompt; keep at most prompt_count of them.
prompt = (prev_prompt + retain_prompt)[:prompt_count]

# ---- Sampling constants (previously inline literals) ----
NUM_INFERENCE_STEPS = 100  # scheduler.set_timesteps(...)
GUIDANCE_SCALE = 7.5       # weight on (cond - uncond) in classifier-free guidance
LATENT_SCALE = 0.18215     # latents are divided by this right before vae.decode;
                           # presumably the checkpoint's VAE scaling factor - TODO confirm
MAX_TOKEN_LENGTH = 77      # tokenizer padding / truncation length

# ---- Models ----
device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="unet").to(device)
# map_location keeps the CPU fallback usable when the checkpoint was saved from a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
unet.load_state_dict(torch.load(f"model/{name}.pth", map_location=device))
# A tokenizer has no weights, so no torch_dtype is passed here.
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="text_encoder").to(device)
vae = AutoencoderKL.from_pretrained(version, torch_dtype=torch.float16, subfolder="vae").to(device)

# CLIP scorer: loaded here but not used by this cell's loop; it stays on the CPU.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# ---- Generate and save sample_count images per prompt ----
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(f"image/{name}", exist_ok=True)

for idx, p in enumerate(tqdm(prompt)):
    # A new scheduler is built for every prompt
    # (presumably so no per-run scheduler state carries over - TODO confirm).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(NUM_INFERENCE_STEPS)

    # One batch: sample_count copies of the prompt followed by sample_count empty prompts.
    token = tokenizer([p] * sample_count + [""] * sample_count, padding="max_length", max_length=MAX_TOKEN_LENGTH, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = text_encoder(token)[0]

    # Per-prompt seed so the starting noise is reproducible.
    generator = torch.Generator(device=device).manual_seed(seed[idx])
    latent = torch.randn((sample_count, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
    latent *= scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # Duplicate the latents so the conditional and unconditional halves share one UNet call.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = unet(latent_input, t, encoder_hidden_states=embd).sample
        # Chunk order follows the token batch: prompt half first, empty-prompt half second.
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + GUIDANCE_SCALE * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Decode, map [-1, 1] -> [0, 255] and reorder NCHW -> NHWC for PIL.
    latent /= LATENT_SCALE
    image = vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(pixels) for pixels in image]
    for i, img in enumerate(image):
        img.save(f"image/{name}/{idx}_{i}.png")

    torch.cuda.empty_cache()

  0%|          | 0/20 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [3]:
import os
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor, CLIPModel

from torchvision import transforms

from PIL import Image

# Inference only: switch autograd off globally so sampling and scoring build no graph.
torch.set_grad_enabled(False)

# ---- Run configuration ----
name = "default"

with open(f"data/{name}.yaml", 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

version = config["version"]  # handed to every diffusion from_pretrained(...) call below
prev_prompt = config["prev_prompt"]
retain_prompt = config["retain_prompt"]

seed = config["seed"]  # indexed by prompt position (seed[idx]) in the loop below
prompt_count = config["prompt_count"]
sample_count = config["sample_count"]  # images generated per prompt

# prev_prompt entries come first, then retain_prompt; keep at most prompt_count of them.
prompt = (prev_prompt + retain_prompt)[:prompt_count]

# ---- Sampling constants (previously inline literals) ----
NUM_INFERENCE_STEPS = 100  # scheduler.set_timesteps(...)
GUIDANCE_SCALE = 7.5       # weight on (cond - uncond) in classifier-free guidance
LATENT_SCALE = 0.18215     # latents are divided by this right before vae.decode;
                           # presumably the checkpoint's VAE scaling factor - TODO confirm
MAX_TOKEN_LENGTH = 77      # tokenizer padding / truncation length

# ---- Models ----
device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="unet").to(device)
# map_location keeps the CPU fallback usable when the checkpoint was saved from a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
unet.load_state_dict(torch.load(f"model/{name}.pth", map_location=device))
# A tokenizer has no weights, so no torch_dtype is passed here.
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="text_encoder").to(device)
vae = AutoencoderKL.from_pretrained(version, torch_dtype=torch.float16, subfolder="vae").to(device)

# CLIP scorer; never moved to `device`, so scoring runs on the CPU.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# ---- Generate sample_count images per prompt and score them against the prompt ----
# The directory is created for parity with the saving cell; this cell writes nothing into it.
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(f"image/{name}", exist_ok=True)

# Unrounded per-prompt scores (one array of sample_count values per prompt, in prompt order),
# kept so averages can be computed from the values instead of re-typing the printed output.
clip_score = []

for idx, p in enumerate(tqdm(prompt)):
    # A new scheduler is built for every prompt
    # (presumably so no per-run scheduler state carries over - TODO confirm).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(NUM_INFERENCE_STEPS)

    # One batch: sample_count copies of the prompt followed by sample_count empty prompts.
    token = tokenizer([p] * sample_count + [""] * sample_count, padding="max_length", max_length=MAX_TOKEN_LENGTH, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = text_encoder(token)[0]

    # Per-prompt seed so the starting noise is reproducible.
    generator = torch.Generator(device=device).manual_seed(seed[idx])
    latent = torch.randn((sample_count, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
    latent *= scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # Duplicate the latents so the conditional and unconditional halves share one UNet call.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = unet(latent_input, t, encoder_hidden_states=embd).sample
        # Chunk order follows the token batch: prompt half first, empty-prompt half second.
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + GUIDANCE_SCALE * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Decode, map [-1, 1] -> [0, 255] and reorder NCHW -> NHWC for PIL.
    latent /= LATENT_SCALE
    image = vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(pixels) for pixels in image]

    torch.cuda.empty_cache()

    # Named clip_input rather than `input` so the builtin is not shadowed.
    clip_input = processor(text=([p] * sample_count), images=image, return_tensors="pt", padding=True, do_rescale=True)
    output = model(**clip_input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds

    # Cosine similarity between each image embedding and its prompt's text embedding.
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).numpy()
    clip_score.append(score)

    print(p)
    print(list(score.round(3)))
    print()

  0%|          | 0/20 [00:00<?, ?it/s]

An image in the style of Mike Worrall
[0.335, 0.315, 0.324, 0.306, 0.326]

An image in the style of Jon McCoy
[0.238, 0.267, 0.287, 0.226, 0.277]

An image in the style of Carne Griffiths
[0.354, 0.368, 0.328, 0.366, 0.359]

An image in the style of Akos Major
[0.32, 0.282, 0.321, 0.323, 0.316]

An image in the style of Warren Ellis
[0.286, 0.305, 0.317, 0.292, 0.317]

An image in the style of Chuck Close
[0.326, 0.317, 0.305, 0.314, 0.305]

An image in the style of Thomas Struth
[0.274, 0.276, 0.281, 0.285, 0.277]

An image in the style of Slim Aarons
[0.34, 0.306, 0.322, 0.318, 0.342]

An image in the style of Aminollah Rezaei
[0.279, 0.28, 0.272, 0.289, 0.27]

An image in the style of Hans Baldung
[0.325, 0.307, 0.313, 0.304, 0.302]

An image in the style of Don Maitz
[0.294, 0.271, 0.303, 0.292, 0.292]

An image in the style of Alessandro Barbucci
[0.268, 0.271, 0.259, 0.274, 0.279]

An image in the style of Herve Groussin
[0.273, 0.271, 0.279, 0.27, 0.306]

An image in the style o

In [1]:
import os
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor, CLIPModel

from torchvision import transforms

from PIL import Image

# Inference only: switch autograd off globally so sampling and scoring build no graph.
torch.set_grad_enabled(False)

# ---- Run configuration ----
# Prompts, seeds and counts always come from the "default" config file;
# `name` (set further down) only selects the checkpoint and the image directory.
config_name = "default"

with open(f"data/{config_name}.yaml", 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

version = config["version"]  # handed to every diffusion from_pretrained(...) call below
prev_prompt = config["prev_prompt"]
retain_prompt = config["retain_prompt"]

seed = config["seed"]  # indexed by prompt position (seed[idx]) in the loop below
prompt_count = config["prompt_count"]
sample_count = config["sample_count"]  # images generated per prompt

# prev_prompt entries come first, then retain_prompt; keep at most prompt_count of them.
prompt = (prev_prompt + retain_prompt)[:prompt_count]

# Checkpoint under evaluation: model/erase1.pth
name = "erase1"

# ---- Sampling constants (previously inline literals) ----
NUM_INFERENCE_STEPS = 100  # scheduler.set_timesteps(...)
GUIDANCE_SCALE = 7.5       # weight on (cond - uncond) in classifier-free guidance
LATENT_SCALE = 0.18215     # latents are divided by this right before vae.decode;
                           # presumably the checkpoint's VAE scaling factor - TODO confirm
MAX_TOKEN_LENGTH = 77      # tokenizer padding / truncation length

# ---- Models ----
device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="unet").to(device)
# map_location keeps the CPU fallback usable when the checkpoint was saved from a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
unet.load_state_dict(torch.load(f"model/{name}.pth", map_location=device))
# A tokenizer has no weights, so no torch_dtype is passed here.
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, torch_dtype=torch.float16, subfolder="text_encoder").to(device)
vae = AutoencoderKL.from_pretrained(version, torch_dtype=torch.float16, subfolder="vae").to(device)

# CLIP scorer; never moved to `device`, so scoring runs on the CPU.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# ---- Generate sample_count images per prompt and score them against the prompt ----
# The directory is created for parity with the saving cell; this cell writes nothing into it.
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(f"image/{name}", exist_ok=True)

# Unrounded per-prompt scores (one array of sample_count values per prompt, in prompt order),
# kept so averages can be computed from the values instead of re-typing the printed output.
clip_score = []

for idx, p in enumerate(tqdm(prompt)):
    # A new scheduler is built for every prompt
    # (presumably so no per-run scheduler state carries over - TODO confirm).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(NUM_INFERENCE_STEPS)

    # One batch: sample_count copies of the prompt followed by sample_count empty prompts.
    token = tokenizer([p] * sample_count + [""] * sample_count, padding="max_length", max_length=MAX_TOKEN_LENGTH, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = text_encoder(token)[0]

    # Per-prompt seed so the starting noise is reproducible.
    generator = torch.Generator(device=device).manual_seed(seed[idx])
    latent = torch.randn((sample_count, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
    latent *= scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # Duplicate the latents so the conditional and unconditional halves share one UNet call.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = unet(latent_input, t, encoder_hidden_states=embd).sample
        # Chunk order follows the token batch: prompt half first, empty-prompt half second.
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + GUIDANCE_SCALE * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Decode, map [-1, 1] -> [0, 255] and reorder NCHW -> NHWC for PIL.
    latent /= LATENT_SCALE
    image = vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(pixels) for pixels in image]

    torch.cuda.empty_cache()

    # Named clip_input rather than `input` so the builtin is not shadowed.
    clip_input = processor(text=([p] * sample_count), images=image, return_tensors="pt", padding=True, do_rescale=True)
    output = model(**clip_input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds

    # Cosine similarity between each image embedding and its prompt's text embedding.
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).numpy()
    clip_score.append(score)

    print(p)
    print(list(score.round(3)))
    print()

  0%|          | 0/20 [00:00<?, ?it/s]

An image in the style of Mike Worrall
[0.297, 0.24, 0.215, 0.268, 0.252]

An image in the style of Jon McCoy
[0.228, 0.198, 0.18, 0.215, 0.248]

An image in the style of Carne Griffiths
[0.145, 0.204, 0.152, 0.203, 0.16]

An image in the style of Akos Major
[0.247, 0.16, 0.255, 0.17, 0.25]

An image in the style of Warren Ellis
[0.245, 0.236, 0.211, 0.22, 0.192]

An image in the style of Chuck Close
[0.247, 0.174, 0.262, 0.254, 0.269]

An image in the style of Thomas Struth
[0.235, 0.263, 0.193, 0.252, 0.26]

An image in the style of Slim Aarons
[0.239, 0.236, 0.22, 0.232, 0.211]

An image in the style of Aminollah Rezaei
[0.219, 0.257, 0.205, 0.241, 0.239]

An image in the style of Hans Baldung
[0.244, 0.255, 0.276, 0.295, 0.279]

An image in the style of Don Maitz
[0.294, 0.265, 0.301, 0.295, 0.287]

An image in the style of Alessandro Barbucci
[0.265, 0.253, 0.252, 0.272, 0.282]

An image in the style of Herve Groussin
[0.284, 0.265, 0.283, 0.267, 0.305]

An image in the style of St

In [2]:
import os
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor, CLIPModel

from torchvision import transforms

from PIL import Image

# Inference only: switch autograd off globally so sampling and scoring build no graph.
torch.set_grad_enabled(False)

# NOTE: this cell defines no models or config of its own. It reuses `unet`, `tokenizer`,
# `text_encoder`, `vae`, `model`, `processor`, `device`, `prompt`, `seed` and `sample_count`
# left in the kernel by an earlier cell, and fails on a fresh kernel unless that cell ran first.

# Checkpoint under evaluation: model/erase3.pth
name = "erase3"

# ---- Sampling constants (previously inline literals) ----
NUM_INFERENCE_STEPS = 100  # scheduler.set_timesteps(...)
GUIDANCE_SCALE = 7.5       # weight on (cond - uncond) in classifier-free guidance
LATENT_SCALE = 0.18215     # latents are divided by this right before vae.decode;
                           # presumably the checkpoint's VAE scaling factor - TODO confirm
MAX_TOKEN_LENGTH = 77      # tokenizer padding / truncation length

# Overwrite the weights of the already-loaded UNet in place.
# map_location keeps the CPU fallback usable when the checkpoint was saved from a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
unet.load_state_dict(torch.load(f"model/{name}.pth", map_location=device))

torch.cuda.empty_cache()

# ---- Generate sample_count images per prompt and score them against the prompt ----
# The directory is created for parity with the saving cell; this cell writes nothing into it.
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(f"image/{name}", exist_ok=True)

# Unrounded per-prompt scores (one array of sample_count values per prompt, in prompt order),
# kept so averages can be computed from the values instead of re-typing the printed output.
clip_score = []

for idx, p in enumerate(tqdm(prompt)):
    # A new scheduler is built for every prompt
    # (presumably so no per-run scheduler state carries over - TODO confirm).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(NUM_INFERENCE_STEPS)

    # One batch: sample_count copies of the prompt followed by sample_count empty prompts.
    token = tokenizer([p] * sample_count + [""] * sample_count, padding="max_length", max_length=MAX_TOKEN_LENGTH, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = text_encoder(token)[0]

    # Per-prompt seed so the starting noise is reproducible.
    generator = torch.Generator(device=device).manual_seed(seed[idx])
    latent = torch.randn((sample_count, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
    latent *= scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # Duplicate the latents so the conditional and unconditional halves share one UNet call.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = unet(latent_input, t, encoder_hidden_states=embd).sample
        # Chunk order follows the token batch: prompt half first, empty-prompt half second.
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + GUIDANCE_SCALE * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Decode, map [-1, 1] -> [0, 255] and reorder NCHW -> NHWC for PIL.
    latent /= LATENT_SCALE
    image = vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(pixels) for pixels in image]

    torch.cuda.empty_cache()

    # Named clip_input rather than `input` so the builtin is not shadowed.
    clip_input = processor(text=([p] * sample_count), images=image, return_tensors="pt", padding=True, do_rescale=True)
    output = model(**clip_input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds

    # Cosine similarity between each image embedding and its prompt's text embedding.
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).numpy()
    clip_score.append(score)

    print(p)
    print(list(score.round(3)))
    print()

  0%|          | 0/20 [00:00<?, ?it/s]

An image in the style of Mike Worrall
[0.232, 0.205, 0.199, 0.259, 0.224]

An image in the style of Jon McCoy
[0.218, 0.2, 0.215, 0.156, 0.205]

An image in the style of Carne Griffiths
[0.233, 0.187, 0.163, 0.24, 0.186]

An image in the style of Akos Major
[0.172, 0.145, 0.207, 0.162, 0.174]

An image in the style of Warren Ellis
[0.229, 0.211, 0.222, 0.21, 0.187]

An image in the style of Chuck Close
[0.202, 0.203, 0.221, 0.18, 0.182]

An image in the style of Thomas Struth
[0.245, 0.249, 0.215, 0.226, 0.246]

An image in the style of Slim Aarons
[0.226, 0.207, 0.221, 0.178, 0.181]

An image in the style of Aminollah Rezaei
[0.242, 0.23, 0.215, 0.236, 0.255]

An image in the style of Hans Baldung
[0.215, 0.203, 0.233, 0.238, 0.201]

An image in the style of Don Maitz
[0.166, 0.226, 0.227, 0.231, 0.195]

An image in the style of Alessandro Barbucci
[0.228, 0.234, 0.245, 0.243, 0.24]

An image in the style of Herve Groussin
[0.225, 0.262, 0.238, 0.243, 0.245]

An image in the style of 

In [5]:
import numpy as np

# Scores typed in from the printed output of the name = "erase3" run:
# one row per prompt (the first ten, Mike Worrall through Hans Baldung),
# one column per generated sample. The cell displays the mean over all 50 values.
np.asarray([
    [0.232, 0.205, 0.199, 0.259, 0.224],
    [0.218, 0.2, 0.215, 0.156, 0.205],
    [0.233, 0.187, 0.163, 0.24, 0.186],
    [0.172, 0.145, 0.207, 0.162, 0.174],
    [0.229, 0.211, 0.222, 0.21, 0.187],
    [0.202, 0.203, 0.221, 0.18, 0.182],
    [0.245, 0.249, 0.215, 0.226, 0.246],
    [0.226, 0.207, 0.221, 0.178, 0.181],
    [0.242, 0.23, 0.215, 0.236, 0.255],
    [0.215, 0.203, 0.233, 0.238, 0.201],
]).mean()

0.20982

In [None]:
# Scratch cell (no execution count). Relies on `np` having been imported by an earlier cell.
# If run in a notebook, only the trailing list literal is displayed; the mean computed
# first is evaluated and then discarded.

# Same ten rows as the preceding cell (first ten prompts of the "erase3" run).
np.asarray([
    [0.232, 0.205, 0.199, 0.259, 0.224],
    [0.218, 0.2, 0.215, 0.156, 0.205],
    [0.233, 0.187, 0.163, 0.24, 0.186],
    [0.172, 0.145, 0.207, 0.162, 0.174],
    [0.229, 0.211, 0.222, 0.21, 0.187],
    [0.202, 0.203, 0.221, 0.18, 0.182],
    [0.245, 0.249, 0.215, 0.226, 0.246],
    [0.226, 0.207, 0.221, 0.178, 0.181],
    [0.242, 0.23, 0.215, 0.236, 0.255],
    [0.215, 0.203, 0.233, 0.238, 0.201],
]).mean()

# The first three rows match the "erase3" output for Don Maitz, Alessandro Barbucci and
# Herve Groussin; the remaining rows are presumably the later prompts of that run, whose
# printed output is truncated in the saved notebook - TODO confirm.
[
    [0.166, 0.226, 0.227, 0.231, 0.195],
    [0.228, 0.234, 0.245, 0.243, 0.24],
    [0.225, 0.262, 0.238, 0.243, 0.245],
    [0.222, 0.209, 0.203, 0.222, 0.217],
    [0.219, 0.143, 0.169, 0.127, 0.117],
    [0.18, 0.239, 0.197, 0.201, 0.228],
    [0.257, 0.185, 0.211, 0.193, 0.223],
    [0.221, 0.184, 0.188, 0.23, 0.191],
    [0.194, 0.204, 0.191, 0.152, 0.207],
    [0.213, 0.251, 0.258, 0.255, 0.213],
]

In [6]:
# Relies on `np` having been imported by an earlier cell.
# Scores typed in by hand: the first three rows match the printed "erase3" output for
# Don Maitz, Alessandro Barbucci and Herve Groussin; the remaining rows are presumably the
# later prompts of that run (its saved output is truncated) - TODO confirm.
# The cell displays the mean over all 50 values.
np.asarray([
    [0.166, 0.226, 0.227, 0.231, 0.195],
    [0.228, 0.234, 0.245, 0.243, 0.24],
    [0.225, 0.262, 0.238, 0.243, 0.245],
    [0.222, 0.209, 0.203, 0.222, 0.217],
    [0.219, 0.143, 0.169, 0.127, 0.117],
    [0.18, 0.239, 0.197, 0.201, 0.228],
    [0.257, 0.185, 0.211, 0.193, 0.223],
    [0.221, 0.184, 0.188, 0.23, 0.191],
    [0.194, 0.204, 0.191, 0.152, 0.207],
    [0.213, 0.251, 0.258, 0.255, 0.213],
]).mean()

0.21124

In [3]:
import os
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import UNet2DConditionModel, AutoencoderKL, LMSDiscreteScheduler
from transformers import CLIPTextModel, CLIPTokenizer, CLIPProcessor, CLIPModel

from torchvision import transforms

from PIL import Image

# Inference only: switch autograd off globally so sampling and scoring build no graph.
torch.set_grad_enabled(False)

# NOTE: this cell defines no models or config of its own. It reuses `unet`, `tokenizer`,
# `text_encoder`, `vae`, `model`, `processor`, `device`, `prompt`, `seed` and `sample_count`
# left in the kernel by an earlier cell, and fails on a fresh kernel unless that cell ran first.

# Checkpoint under evaluation: model/erase4.pth
name = "erase4"

# ---- Sampling constants (previously inline literals) ----
NUM_INFERENCE_STEPS = 100  # scheduler.set_timesteps(...)
GUIDANCE_SCALE = 7.5       # weight on (cond - uncond) in classifier-free guidance
LATENT_SCALE = 0.18215     # latents are divided by this right before vae.decode;
                           # presumably the checkpoint's VAE scaling factor - TODO confirm
MAX_TOKEN_LENGTH = 77      # tokenizer padding / truncation length

# Overwrite the weights of the already-loaded UNet in place.
# map_location keeps the CPU fallback usable when the checkpoint was saved from a GPU.
# NOTE: torch.load unpickles the file - only load checkpoints you trust.
unet.load_state_dict(torch.load(f"model/{name}.pth", map_location=device))

torch.cuda.empty_cache()

# ---- Generate sample_count images per prompt and score them against the prompt ----
# The directory is created for parity with the saving cell; this cell writes nothing into it.
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
os.makedirs(f"image/{name}", exist_ok=True)

# Unrounded per-prompt scores (one array of sample_count values per prompt, in prompt order),
# kept so averages can be computed from the values instead of re-typing the printed output.
clip_score = []

for idx, p in enumerate(tqdm(prompt)):
    # A new scheduler is built for every prompt
    # (presumably so no per-run scheduler state carries over - TODO confirm).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(NUM_INFERENCE_STEPS)

    # One batch: sample_count copies of the prompt followed by sample_count empty prompts.
    token = tokenizer([p] * sample_count + [""] * sample_count, padding="max_length", max_length=MAX_TOKEN_LENGTH, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = text_encoder(token)[0]

    # Per-prompt seed so the starting noise is reproducible.
    generator = torch.Generator(device=device).manual_seed(seed[idx])
    latent = torch.randn((sample_count, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
    latent *= scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # Duplicate the latents so the conditional and unconditional halves share one UNet call.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = unet(latent_input, t, encoder_hidden_states=embd).sample
        # Chunk order follows the token batch: prompt half first, empty-prompt half second.
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + GUIDANCE_SCALE * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Decode, map [-1, 1] -> [0, 255] and reorder NCHW -> NHWC for PIL.
    latent /= LATENT_SCALE
    image = vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(pixels) for pixels in image]

    torch.cuda.empty_cache()

    # Named clip_input rather than `input` so the builtin is not shadowed.
    clip_input = processor(text=([p] * sample_count), images=image, return_tensors="pt", padding=True, do_rescale=True)
    output = model(**clip_input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds

    # Cosine similarity between each image embedding and its prompt's text embedding.
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).numpy()
    clip_score.append(score)

    print(p)
    print(list(score.round(3)))
    print()

  0%|          | 0/20 [00:00<?, ?it/s]

An image in the style of Mike Worrall
[0.305, 0.311, 0.338, 0.297, 0.334]

An image in the style of Jon McCoy
[0.252, 0.291, 0.238, 0.223, 0.271]

An image in the style of Carne Griffiths
[0.341, 0.367, 0.32, 0.367, 0.368]

An image in the style of Akos Major
[0.302, 0.281, 0.304, 0.255, 0.302]

An image in the style of Warren Ellis
[0.291, 0.272, 0.317, 0.307, 0.306]

An image in the style of Chuck Close
[0.283, 0.236, 0.27, 0.275, 0.272]

An image in the style of Thomas Struth
[0.287, 0.273, 0.28, 0.295, 0.281]

An image in the style of Slim Aarons
[0.339, 0.314, 0.281, 0.278, 0.321]

An image in the style of Aminollah Rezaei
[0.251, 0.276, 0.294, 0.278, 0.266]

An image in the style of Hans Baldung
[0.273, 0.291, 0.305, 0.283, 0.306]

An image in the style of Don Maitz
[0.296, 0.281, 0.292, 0.292, 0.291]

An image in the style of Alessandro Barbucci
[0.256, 0.247, 0.252, 0.241, 0.231]

An image in the style of Herve Groussin
[0.219, 0.24, 0.266, 0.237, 0.256]

An image in the style 

In [None]:

# NOTE(review): this reads like a jotted value range rather than a computation. As Python it
# evaluates 0.25 minus 0.28 (about -0.03) - confirm the intent, or move the note to markdown.
0.25-0.28