In [None]:
import yaml
import warnings

warnings.filterwarnings("ignore")

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image


#
torch.set_grad_enabled(False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
artist_idx = 0
with open(f"data/artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)
artist = artist[artist_idx]

#
sample_count = 10
prompt_idx = 0
prompt = [f"{artist}", f"An image in the style of {artist}", f"An image depicting {artist}"]

image = []
for _ in range(sample_count):
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    cfg_prompt = [prompt[prompt_idx]] + [""]
    token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = pipeline.text_encoder(token)[0]

    latent = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    latent /= 0.18215
    image.append(pipeline.vae.decode(latent).sample.detach().cpu()[0])

image = np.stack(image)
image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
image = (image * 255).round().astype("uint8")
image = [Image.fromarray(i) for i in image]

#
print(f"#\nPrompt: {prompt[prompt_idx]}\nScore:")
for p in prompt:
    input = processor(text=[p] * sample_count, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
    output = model(**input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds
    
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy()
    for s in score:
        print(f"{s:.3f}", end=" ")
    print(f"\tMean: {score.mean():.3f}\tStd: {score.std(ddof=1):.3f}\t{p}")

for i in image:
    display(i)

In [1]:
import yaml
import warnings

warnings.filterwarnings("ignore")

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image


#
torch.set_grad_enabled(False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16).to(device)
pipeline.unet.load_state_dict(torch.load(f"model/erase10.pth"))

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
with open(f"data/artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)
a = artist[0]
a = "Pablo Picasso"

#
sample_count = 10
prompt = [f"{a}"] * sample_count
# prompt = [f"An image in the style of {a}"] * sample_count
# prompt = [f"An image depicting {a}"] * sample_count

scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
scheduler.set_timesteps(100)

cfg_prompt = prompt + [""] * sample_count
token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
embd = pipeline.text_encoder(token)[0]

latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=torch.float16) * scheduler.init_noise_sigma

for t in scheduler.timesteps:
    latent_input = torch.cat([latent] * 2)
    latent_input = scheduler.scale_model_input(latent_input, timestep=t)

    noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
    cond_noise, uncond_noise = noise.chunk(2)
    noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

    latent = scheduler.step(noise, t, latent).prev_sample

latent /= 0.18215
image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
image = (image * 255).round().astype("uint8")
image = [Image.fromarray(i) for i in image]

print(f"#\nPrompt: {prompt[0]}\nScore:")
prompt = [f"{a}", f"An image in the style of {a}", f"An image depicting {a}"]
for p in prompt:
    input = processor(text=[p] * sample_count, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
    output = model(**input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy()
    
    for s in score:
        print(f"{s:.3f}", end=" ")
    print(f"\tMean: {score.mean():.3f}\tStd: {score.std(ddof=1):.3f}\t{p}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

#
Prompt: Pablo Picasso
Score:
0.276 0.259 0.336 0.242 0.242 0.244 0.301 0.274 0.285 0.272 	Mean: 0.273	Std: 0.030	Pablo Picasso
0.253 0.216 0.284 0.199 0.258 0.245 0.292 0.257 0.229 0.288 	Mean: 0.252	Std: 0.031	An image in the style of Pablo Picasso
0.291 0.250 0.345 0.238 0.248 0.249 0.303 0.295 0.279 0.277 	Mean: 0.277	Std: 0.033	An image depicting Pablo Picasso


In [None]:
import warnings

warnings.filterwarnings("ignore")

import yaml

from tqdm.auto import tqdm

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image

#
torch.set_grad_enabled(False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
with open(f"data/artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)

valid_artist = []
for a in artist:
    prompt = [f"An image in the style of {a}"]

    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    cfg_prompt = prompt + [""]
    token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = pipeline.text_encoder(token)[0]

    latent = torch.randn((1, 4, 64, 64), device=device, dtype=torch.float16) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    latent /= 0.18215
    image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(i) for i in image]

    torch.cuda.empty_cache()

    input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
    output = model(**input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds
    score = torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy().item()
    print(score)

    if score > 0.29:
        valid_artist.append(a)

#
# with open(f"data/valid_artist.yaml", 'w') as file:
#     yaml.dump(valid_artist, file)

In [None]:
import random
import warnings

warnings.filterwarnings("ignore")

import yaml

import torch
from diffusers import UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer

#
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, subfolder="unet").to(device)
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, subfolder="text_encoder").to(device)

#
with open(f"data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)
random.shuffle(artist)
with open(f"data/valid_artist.yaml", 'w') as file:
    yaml.dump(artist, file)
# artist = artist[:1043]

artist_count = 3
prompt = ["An image in the style of " + a for a in artist]
prev_prompt = prompt[:artist_count]
new_prompt = ["art"] * artist_count
retain_prompt = prompt[artist_count:]

#
lamb = 0.5
erase_scale = 1
preserve_scale = 0.1
with_key = True

ca_layer = []
for n, module in unet.named_modules():
    if n[-5:] != "attn2": continue
    ca_layer.append(module)

value_layer = [layer.to_v for layer in ca_layer]
target_layer = value_layer

if with_key:
    key_layer = [layer.to_k for layer in ca_layer]
    target_layer += key_layer

prev_token = tokenizer(prev_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
prev_embd = text_encoder(prev_token)[0].permute(0, 2, 1)

new_token = tokenizer(new_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
new_embd = text_encoder(new_token)[0].permute(0, 2, 1)

m2 = (prev_embd @ prev_embd.permute(0, 2, 1)).sum(0) * erase_scale
m2 += lamb * torch.eye(m2.shape[0], device=device)

m3 = (new_embd @ prev_embd.permute(0, 2, 1)).sum(0) * erase_scale
m3 += lamb * torch.eye(m3.shape[0], device=device)

if retain_prompt:
    retain_token = tokenizer(retain_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    retain_embd = text_encoder(retain_token)[0].permute(0, 2, 1)

    m2 += (retain_embd @ retain_embd.permute(0, 2, 1)).sum(0) * preserve_scale
    m3 += (retain_embd @ retain_embd.permute(0, 2, 1)).sum(0) * preserve_scale

for layer in target_layer:
    m1 = layer.weight @ m3
    layer.weight = torch.nn.Parameter((m1 @ torch.inverse(m2)).detach())

torch.save(unet.state_dict(), f"model/erase10.pth")

In [None]:
import warnings

warnings.filterwarnings("ignore")

import yaml

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPProcessor, CLIPModel

from PIL import Image

torch.set_grad_enabled(False)

#
version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=torch.float16).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
artist_count = 3
sample_count = 5

with open(f"data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)[:artist_count * 2]

#
default_score = []
for a in artist:
    prompt = [f"An image in the style of {a}"] * sample_count

    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    cfg_prompt = prompt + [""] * sample_count
    token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = pipeline.text_encoder(token)[0]

    latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=torch.float16) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    latent /= 0.18215
    image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(i) for i in image]

    input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
    output = model(**input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds
    score = list(torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy().round(4))
    default_score.append(score)

    torch.cuda.empty_cache()

#
pipeline.unet.load_state_dict(torch.load(f"model/erase10.pth"))
torch.cuda.empty_cache()

#
erased_score = []
for a in artist:
    prompt = [f"An image in the style of {a}"] * sample_count

    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    cfg_prompt = prompt + [""] * sample_count
    token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    embd = pipeline.text_encoder(token)[0]

    latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=torch.float16) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    latent /= 0.18215
    image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
    image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
    image = (image * 255).round().astype("uint8")
    image = [Image.fromarray(i) for i in image]

    input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
    output = model(**input)
    image_embd = output.image_embeds
    text_embd = output.text_embeds
    score = list(torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy().round(4))
    erased_score.append(score)

    torch.cuda.empty_cache()

#
print(default_score)
print(erased_score)
print()
default_std = []
erased_std = []
for i in range(artist_count):
    d_mean = np.mean(default_score[i])
    d_std = np.std(default_score[i], ddof=1)
    default_std.append(d_std)
    e_mean = np.mean(erased_score[i])
    e_std = np.std(erased_score[i], ddof=1)
    erased_std.append(e_std)
    print(f"{d_mean:.3f} ({d_std:.3f}) -> {e_mean:.3f} ({e_std:.3f})    diff: {d_mean - e_mean:.3f}")
print()
d_mean = np.mean(default_score[:artist_count])
e_mean = np.mean(erased_score[:artist_count])
print(f"{d_mean:.3f} ({np.mean(default_std):.3f}) -> {e_mean:.3f} ({np.mean(default_std):.3f})    diff: {d_mean - e_mean:.3f}")
print()
default_std = []
erased_std = []
for i in range(artist_count, artist_count * 2):
    d_mean = np.mean(default_score[i])
    d_std = np.std(default_score[i], ddof=1)
    default_std.append(d_std)
    e_mean = np.mean(erased_score[i])
    e_std = np.std(erased_score[i], ddof=1)
    erased_std.append(e_std)
    print(f"{d_mean:.3f} ({d_std:.3f}) -> {e_mean:.3f} ({e_std:.3f})    diff: {d_mean - e_mean:.3f}")
print()
d_mean = np.mean(default_score[artist_count:])
e_mean = np.mean(erased_score[artist_count:])
print(f"{d_mean:.3f} ({np.mean(default_std):.3f}) -> {e_mean:.3f} ({np.mean(default_std):.3f})    diff: {d_mean - e_mean:.3f}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

[[0.268, 0.3222, 0.2948, 0.2825, 0.2841], [0.3491, 0.3458, 0.3496, 0.3346, 0.3357], [0.3214, 0.3216, 0.3018, 0.307, 0.3143], [0.2991, 0.2867, 0.3015, 0.3022, 0.2997], [0.3223, 0.3367, 0.3112, 0.3318, 0.3412], [0.3484, 0.3328, 0.3525, 0.3367, 0.3284]]
[[0.1604, 0.1792, 0.2295, 0.2505, 0.1755], [0.1709, 0.2236, 0.1677, 0.1855, 0.2037], [0.1941, 0.2267, 0.2006, 0.1538, 0.1971], [0.2775, 0.2895, 0.2663, 0.2703, 0.2929], [0.3436, 0.3392, 0.3344, 0.3136, 0.3237], [0.3442, 0.3432, 0.3425, 0.3438, 0.3198]]

0.290 (0.020) -> 0.199 (0.039)    diff: 0.091
0.343 (0.007) -> 0.190 (0.023)    diff: 0.153
0.313 (0.009) -> 0.194 (0.026)    diff: 0.119

0.315 (0.012) -> 0.195 (0.012)    diff: 0.121

0.298 (0.006) -> 0.279 (0.012)    diff: 0.019
0.329 (0.012) -> 0.331 (0.012)    diff: -0.002
0.340 (0.010) -> 0.339 (0.011)    diff: 0.001

0.322 (0.010) -> 0.316 (0.010)    diff: 0.006


In [None]:
artist_count = 10