In [None]:
import yaml
import warnings

warnings.filterwarnings("ignore")

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image


# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=dtype).to(device)

# CLIP model/processor used at the end of the cell to score image-prompt similarity.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# Select a single artist by its position in the YAML list.
artist_idx = 6
with open("data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)
artist = artist[artist_idx]

# Sample `sample_count` images for the artist prompt with a manual denoising loop.
sample_count = 10
prompt = [f"An image in the style of {artist}"]

# The conditioning is identical for every sample, so tokenize and encode it once.
# Row 0 is the prompt, row 1 the empty prompt (same order as the chunk(2) split below).
cfg_prompt = prompt + [""]
token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
embd = pipeline.text_encoder(token)[0]

image = []
for _ in range(sample_count):
    # A new scheduler per sample, as before (presumably so no scheduler state is shared between samples).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    latent = torch.randn((1, 4, 64, 64), device=device, dtype=dtype) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # One UNet call handles both the prompt and the empty-prompt branch.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        # Guidance: move from the empty-prompt prediction towards the prompt prediction with weight 7.5.
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Rescale the latent before decoding (0.18215 presumably matches the VAE scaling factor - TODO confirm).
    latent /= 0.18215
    image.append(pipeline.vae.decode(latent).sample.detach().cpu()[0])

# Map decoder output from [-1, 1] to uint8 channel-last arrays and wrap them as PIL images.
image = np.stack(image)
image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
image = (image * 255).round().astype("uint8")
image = [Image.fromarray(i) for i in image]

# CLIP similarity between every generated image and the prompt.
# Named `clip_input` so the builtin `input` is not shadowed in the kernel.
clip_input = processor(text=prompt * sample_count, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
output = model(**clip_input)
image_embd = output.image_embeds
text_embd = output.text_embeds

score = torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy()
for s in score:
    print(f"{s:.3f}", end=" ")
print(f"\tMean: {score.mean():.3f}\tStd: {score.std(ddof=1):.3f}\t{artist}")

for i in image:
    display(i)

In [1]:
import random
import warnings

warnings.filterwarnings("ignore")

import yaml

import torch
from diffusers import UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer

# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, subfolder="unet").to(device)
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, subfolder="text_encoder").to(device)

# Artist name lists; the prompts below are built from them.
with open("data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    valid_artist = yaml.safe_load(file)
with open("data/invalid_artist.yaml", 'r', encoding='utf-8') as file:
    invalid_artist = yaml.safe_load(file)

# erase_artist = ["Ernie Barnes", "Alain Laboile", "Ron Mueck", "Mickalene Thomas"]
# retain_artist = list(set(valid_artist) - set(erase_artist)) + invalid_artist
# erase_artist = ["Ernie Barnes", "Alain Laboile"]

# erase_artist = ["Ernie Barnes"]
# retain_artist = list(set(valid_artist) - set(erase_artist)) + invalid_artist

erase_artist = valid_artist
retain_artist = []


# prev_prompt[i] is remapped towards new_prompt[i]; retain_prompt entries are mapped onto themselves.
prev_prompt = [f"An image in the style of {a}" for a in erase_artist]
new_prompt = ["Art"] * len(prev_prompt)
retain_prompt = [f"An image in the style of {a}" for a in retain_artist]

# Hyper-parameters of the edit.
lamb = 0.5            # weight of the identity term added to both m2 and m3
erase_scale = 0.5     # weight of the prev -> new prompt terms
preserve_scale = 0.1  # weight of the retain prompt terms
with_key = True       # also edit the to_k projections, not only to_v
batch_size = 300      # number of prompts encoded per text-encoder call


def encode_prompt(batch):
    """Tokenize `batch` (padded/truncated to 77 tokens) and return the text encoder's
    first output with its last two axes swapped."""
    token = tokenizer(batch, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    return text_encoder(token)[0].permute(0, 2, 1)


# Modules whose name ends in "attn2" are collected as the layers to edit.
ca_layer = [module for n, module in unet.named_modules() if n.endswith("attn2")]

value_layer = [layer.to_v for layer in ca_layer]
# Copy the list so appending the key layers does not also grow `value_layer`.
target_layer = list(value_layer)

if with_key:
    key_layer = [layer.to_k for layer in ca_layer]
    target_layer += key_layer

# Square accumulators sized from the text encoder's hidden size instead of a hard-coded 1024.
embd_dim = text_encoder.config.hidden_size
m2 = lamb * torch.eye(embd_dim, device=device)
m3 = lamb * torch.eye(embd_dim, device=device)

for start in range(0, len(prev_prompt), batch_size):
    prev_embd = encode_prompt(prev_prompt[start:start + batch_size])
    new_embd = encode_prompt(new_prompt[start:start + batch_size])
    prev_embd_t = prev_embd.permute(0, 2, 1)

    m2 += (prev_embd @ prev_embd_t).sum(0) * erase_scale
    m3 += (new_embd @ prev_embd_t).sum(0) * erase_scale

# An empty retain list simply yields no batches.
for start in range(0, len(retain_prompt), batch_size):
    retain_embd = encode_prompt(retain_prompt[start:start + batch_size])
    retain_term = (retain_embd @ retain_embd.permute(0, 2, 1)).sum(0) * preserve_scale

    m2 += retain_term
    m3 += retain_term

# New weight = W @ m3 @ inverse(m2). The inverse is the same for every layer, so compute it once.
m2_inv = torch.inverse(m2)
for layer in target_layer:
    m1 = layer.weight @ m3
    layer.weight = torch.nn.Parameter((m1 @ m2_inv).detach())

#
print(f"{lamb}\t{erase_scale}\t{preserve_scale}\t{with_key}")

torch.save(unet.state_dict(), "model/erase.pth")

KeyboardInterrupt: 

In [None]:
import yaml
import warnings

warnings.filterwarnings("ignore")

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image


# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=dtype).to(device)
# Replace the UNet weights with the edited checkpoint. map_location lets a checkpoint
# saved on GPU load on a CPU-only machine.
# NOTE: torch.load unpickles the file - only load checkpoints produced by this project.
pipeline.unet.load_state_dict(torch.load("model/erase.pth", map_location=device))

# CLIP model/processor used at the end of the cell to score image-prompt similarity.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
# artist_idx = 0
# with open(f"data/valid_artist.yaml", 'r', encoding='utf-8') as file:
#     artist = yaml.safe_load(file)
# artist = artist[artist_idx]
artist = "Ernie Barnes"

# Sample `sample_count` images for the artist prompt with a manual denoising loop.
sample_count = 10
prompt = [f"An image in the style of {artist}"]

# The conditioning is identical for every sample, so tokenize and encode it once.
# Row 0 is the prompt, row 1 the empty prompt (same order as the chunk(2) split below).
cfg_prompt = prompt + [""]
token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
embd = pipeline.text_encoder(token)[0]

image = []
for _ in range(sample_count):
    # A new scheduler per sample, as before (presumably so no scheduler state is shared between samples).
    scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
    scheduler.set_timesteps(100)

    latent = torch.randn((1, 4, 64, 64), device=device, dtype=dtype) * scheduler.init_noise_sigma

    for t in scheduler.timesteps:
        # One UNet call handles both the prompt and the empty-prompt branch.
        latent_input = torch.cat([latent] * 2)
        latent_input = scheduler.scale_model_input(latent_input, timestep=t)

        noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
        cond_noise, uncond_noise = noise.chunk(2)
        # Guidance: move from the empty-prompt prediction towards the prompt prediction with weight 7.5.
        noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

        latent = scheduler.step(noise, t, latent).prev_sample

    # Rescale the latent before decoding (0.18215 presumably matches the VAE scaling factor - TODO confirm).
    latent /= 0.18215
    image.append(pipeline.vae.decode(latent).sample.detach().cpu()[0])

# Map decoder output from [-1, 1] to uint8 channel-last arrays and wrap them as PIL images.
image = np.stack(image)
image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
image = (image * 255).round().astype("uint8")
image = [Image.fromarray(i) for i in image]

# CLIP similarity between every generated image and the prompt.
# Named `clip_input` so the builtin `input` is not shadowed in the kernel.
clip_input = processor(text=prompt * sample_count, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
output = model(**clip_input)
image_embd = output.image_embeds
text_embd = output.text_embeds

score = torch.nn.functional.cosine_similarity(image_embd, text_embd).detach().cpu().numpy()
for s in score:
    print(f"{s:.3f}", end=" ")
print(f"\tMean: {score.mean():.3f}\tStd: {score.std(ddof=1):.3f}")

for i in image:
    display(i)

In [1]:
import time

import torch
import open_clip
from transformers import CLIPModel, CLIPProcessor
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, PNDMScheduler, LMSDiscreteScheduler

# Start the wall-clock timer and switch autograd off (inference only).
start_time = time.time()
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Load the half-precision pipeline, move it to the device, then replace its scheduler
# with a PNDM scheduler built from the existing scheduler's config.
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=torch.float16)
pipeline = pipeline.to(device)
pipeline.scheduler = PNDMScheduler.from_config(pipeline.scheduler.config)

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import torch
import open_clip
from transformers import CLIPModel, CLIPProcessor
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, PNDMScheduler, LMSDiscreteScheduler

# Start the wall-clock timer and switch autograd off (inference only).
start_time = time.time()
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=dtype).to(device)
pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)

artist = "Frank Tinsley"
prompt = f"An image in the style of {artist}"

image_count = 5
images = pipeline(prompt, num_inference_steps=50, num_images_per_prompt=image_count).images

torch.cuda.empty_cache()
end_time = time.time()
# Report the scheduler's actual class rather than reading a private config attribute.
print(f"#\nScheduler: {type(pipeline.scheduler).__name__}\nTime: {end_time - start_time:.1f}")
print(f"Artist: {artist}\nScore:")

# Score 1: open_clip ViT-H-14 (the model is not moved to `device`, so this part runs where open_clip created it).
model, _, preprocess = open_clip.create_model_and_transforms('ViT-H-14', pretrained="laion2b_s32b_b79k")
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-H-14')

image_embds = model.encode_image(torch.stack([preprocess(image) for image in images]))
image_embds /= image_embds.norm(dim=1, keepdim=True)

text_embd = model.encode_text(tokenizer(prompt))[0]
text_embd /= text_embd.norm()

# Both sides are unit-normalised, so one matrix-vector product gives every cosine similarity.
scores = image_embds @ text_embd
for score in scores:
    print(f"{score:.3f}", end=" ")
print(f"\tMean: {scores.mean():.3f}\tViT-H-14")

# Score 2: Hugging Face CLIP ViT-B/32 on `device`.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Named `clip_input` so the builtin `input` is not shadowed in the kernel.
clip_input = processor(text=[prompt] * image_count, images=images, return_tensors="pt", padding=True, do_rescale=True).to(device)
output = model(**clip_input)
image_embds = output.image_embeds
text_embds = output.text_embeds

scores = torch.nn.functional.cosine_similarity(image_embds, text_embds).detach().cpu().numpy()
for score in scores:
    print(f"{score:.3f}", end=" ")
print(f"\tMean: {scores.mean():.3f}")

for image in images:
    display(image)

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import torch
import open_clip
from transformers import CLIPModel, CLIPProcessor
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler, PNDMScheduler, LMSDiscreteScheduler

# Inference only: no gradients are needed anywhere in this cell.
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=torch.float16)
pipeline = pipeline.to(device)

artist = "Frank Tinsley"
prompt = f"An image in the style of {artist}"

# Generate `image_count` images for the prompt using the pipeline's default scheduler.
image_count = 5
generation = pipeline(prompt, num_inference_steps=50, num_images_per_prompt=image_count)
images = generation.images

torch.cuda.empty_cache()
print(f"Artist: {artist}\nScore:")

# Score the images against the prompt with open_clip ViT-H-14.
model, _, preprocess = open_clip.create_model_and_transforms('ViT-H-14', pretrained="laion2b_s32b_b79k")
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-H-14')

# Stack the preprocessed images into one batch and unit-normalise each embedding row.
image_batch = torch.stack([preprocess(image) for image in images], dim=0)
image_embds = model.encode_image(image_batch)
image_embds = image_embds / image_embds.norm(dim=1, keepdim=True)

text_embd = model.encode_text(tokenizer(prompt))[0]
text_embd = text_embd / text_embd.norm()

# Dot product of each normalised image embedding with the normalised text embedding.
scores = torch.tensor([torch.sum(image_embd * text_embd).item() for image_embd in image_embds])
print("".join(f"{score:.3f} " for score in scores), end="")
print(f"\tMean: {scores.mean():.3f}")

for image in images:
    display(image)

In [None]:
import random
import warnings

warnings.filterwarnings("ignore")

import yaml

import torch
from diffusers import UNet2DConditionModel
from transformers import CLIPTextModel, CLIPTokenizer

# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
unet = UNet2DConditionModel.from_pretrained(version, subfolder="unet").to(device)
tokenizer = CLIPTokenizer.from_pretrained(version, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(version, subfolder="text_encoder").to(device)

# Shuffle the artist list and write the new order back to the same file.
# NOTE: the shuffle is unseeded and overwrites the file, so every run of this cell
# changes which artists fall into the erased slice below.
with open("data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)
random.shuffle(artist)
with open("data/valid_artist.yaml", 'w', encoding='utf-8') as file:
    yaml.dump(artist, file)

# The first `erase_count` artists are remapped to "art"; all remaining ones are retained.
erase_count = 10
prompt = ["An image in the style of " + a for a in artist]
prev_prompt = prompt[:erase_count]
# Sized from prev_prompt so both batches still match when fewer artists are available.
new_prompt = ["art"] * len(prev_prompt)
retain_prompt = prompt[erase_count:]

# Hyper-parameters of the edit.
lamb = 0.5            # weight of the identity term added to both m2 and m3
erase_scale = 1       # weight of the prev -> new prompt terms
preserve_scale = 0.1  # weight of the retain prompt terms
with_key = True       # also edit the to_k projections, not only to_v
batch_size = 300      # number of prompts encoded per text-encoder call


def encode_prompt(batch):
    """Tokenize `batch` (padded/truncated to 77 tokens) and return the text encoder's
    first output with its last two axes swapped."""
    token = tokenizer(batch, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
    return text_encoder(token)[0].permute(0, 2, 1)


# Modules whose name ends in "attn2" are collected as the layers to edit.
ca_layer = [module for n, module in unet.named_modules() if n.endswith("attn2")]

value_layer = [layer.to_v for layer in ca_layer]
# Copy the list so appending the key layers does not also grow `value_layer`.
target_layer = list(value_layer)

if with_key:
    key_layer = [layer.to_k for layer in ca_layer]
    target_layer += key_layer

# Square accumulators sized from the text encoder's hidden size; both start at lamb * I.
embd_dim = text_encoder.config.hidden_size
m2 = lamb * torch.eye(embd_dim, device=device)
m3 = lamb * torch.eye(embd_dim, device=device)

for start in range(0, len(prev_prompt), batch_size):
    prev_embd = encode_prompt(prev_prompt[start:start + batch_size])
    new_embd = encode_prompt(new_prompt[start:start + batch_size])
    prev_embd_t = prev_embd.permute(0, 2, 1)

    m2 += (prev_embd @ prev_embd_t).sum(0) * erase_scale
    m3 += (new_embd @ prev_embd_t).sum(0) * erase_scale

# Batched so a long retain list never creates one huge per-prompt stack of square matrices.
for start in range(0, len(retain_prompt), batch_size):
    retain_embd = encode_prompt(retain_prompt[start:start + batch_size])
    retain_term = (retain_embd @ retain_embd.permute(0, 2, 1)).sum(0) * preserve_scale

    m2 += retain_term
    m3 += retain_term

# New weight = W @ m3 @ inverse(m2). The inverse is the same for every layer, so compute it once.
m2_inv = torch.inverse(m2)
for layer in target_layer:
    m1 = layer.weight @ m3
    layer.weight = torch.nn.Parameter((m1 @ m2_inv).detach())

torch.save(unet.state_dict(), "model/erase10.pth")

In [1]:
import warnings

warnings.filterwarnings("ignore")

import yaml

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPProcessor, CLIPModel

from PIL import Image

# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

#
version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=dtype).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# Alternative name set (it used to be assigned and then immediately overwritten by the list below):
# artist = ["Liora Jansen", "Esmond Vale", "Alira Keats", "Soren Elwood", "Mirae Lorne",
#           "Tavish Cromwell", "Elara Fawn", "Casper Thorne", "Vera Lysander", "Kaelan Rivers"]
artist = ["Evelyn Hartley", "Liam Whitaker", "Ava Sterling", "Mason Caldwell", "Isla Kensington",
          "Ethan Marlowe", "Zoe Whitman", "Leo Farnsworth", "Clara Beaumont", "Asher Langford"]

#
sample_count = 5


def score_artists(artist_list, sample_count):
    """Generate `sample_count` images per artist with the current `pipeline` and score them with CLIP.

    Uses the module-level `pipeline`, `model`, `processor`, `device` and `dtype`.

    Returns:
        A list with one entry per artist; each entry is a list of `sample_count`
        image-prompt cosine similarities rounded to 4 decimals.
    """
    all_score = []
    for a in artist_list:
        prompt = [f"An image in the style of {a}"] * sample_count

        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
        scheduler.set_timesteps(100)

        # First half of the batch: artist prompt; second half: empty prompt (same order as chunk(2) below).
        cfg_prompt = prompt + [""] * sample_count
        token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
        embd = pipeline.text_encoder(token)[0]

        latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=dtype) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            latent_input = torch.cat([latent] * 2)
            latent_input = scheduler.scale_model_input(latent_input, timestep=t)

            noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
            cond_noise, uncond_noise = noise.chunk(2)
            # Guidance: move from the empty-prompt prediction towards the prompt prediction with weight 7.5.
            noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

            latent = scheduler.step(noise, t, latent).prev_sample

        # Rescale the latent before decoding (0.18215 presumably matches the VAE scaling factor - TODO confirm).
        latent /= 0.18215
        image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
        # Map decoder output from [-1, 1] to uint8 channel-last arrays, then to PIL images.
        image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
        image = (image * 255).round().astype("uint8")
        image = [Image.fromarray(i) for i in image]

        # Named `clip_input` so the builtin `input` is not shadowed.
        clip_input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
        output = model(**clip_input)
        similarity = torch.nn.functional.cosine_similarity(output.image_embeds, output.text_embeds)
        all_score.append(list(similarity.detach().cpu().numpy().round(4)))

        torch.cuda.empty_cache()
    return all_score


# Scores with the original UNet weights.
default_score = score_artists(artist, sample_count)

# Swap in the edited UNet weights (map_location lets a GPU-saved checkpoint load on CPU).
# NOTE: torch.load unpickles the file - only load checkpoints produced by this project.
pipeline.unet.load_state_dict(torch.load("model/erase10.pth", map_location=device))
torch.cuda.empty_cache()

# Scores for the same prompts with the edited weights.
erased_score = score_artists(artist, sample_count)

#
print(default_score)
print(f"{np.mean(default_score[:10]):.4f}    {np.std(default_score[:10]):.4f}")
print(erased_score)
print(f"{np.mean(erased_score[:10]):.4f}    {np.std(erased_score[:10]):.4f}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

[[0.2349, 0.2348, 0.247, 0.237, 0.2179], [0.266, 0.2795, 0.2411, 0.2601, 0.273], [0.2534, 0.2572, 0.2589, 0.2699, 0.2691], [0.2588, 0.2418, 0.2476, 0.2524, 0.2294], [0.2718, 0.2488, 0.2624, 0.2488, 0.2824], [0.2563, 0.2429, 0.2458, 0.2505, 0.2509], [0.2865, 0.2657, 0.2635, 0.2755, 0.2707], [0.2506, 0.2577, 0.2543, 0.2678, 0.2602], [0.2612, 0.2894, 0.2697, 0.2879, 0.2559], [0.246, 0.27, 0.2508, 0.2438, 0.2633]]
0.2576    0.0151
[[0.2494, 0.2456, 0.2497, 0.2534, 0.2565], [0.2511, 0.2582, 0.2654, 0.279, 0.257], [0.26, 0.2479, 0.2624, 0.2558, 0.2692], [0.2256, 0.2252, 0.24, 0.2385, 0.2555], [0.2568, 0.2738, 0.2551, 0.2685, 0.2812], [0.2439, 0.2356, 0.2206, 0.2399, 0.2248], [0.2558, 0.2603, 0.2677, 0.2651, 0.2685], [0.2504, 0.2544, 0.2434, 0.2488, 0.2471], [0.2933, 0.2675, 0.2706, 0.274, 0.288], [0.2462, 0.2379, 0.2374, 0.2586, 0.2567]]
0.2547    0.0156


In [None]:
import warnings

warnings.filterwarnings("ignore")

import yaml

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPProcessor, CLIPModel

from PIL import Image

# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

#
version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=dtype).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

#
artist = ["Augustus Whitaker", "Eleanor Pembroke", "Theodore Langford", "Beatrice Sinclair", "Sebastian Hawthorne",
          "Vivienne Montclair", "Leonard Fairfax", "Cecilia Harrington", "Edmund Ashford", "Isabella Worthington"]
# artist = ["Alaric Thorne", "Beatrice Elmsworth", "Cedric Wycliffe", "Eleanor Ravenscroft", "Fitzgerald Montague", 
#          "Gwendolyn Fairchild", "Horatio Pembroke", "Isolde Ashbourne", "Percival Langley", "Seraphina Blackwood"]

#
sample_count = 5


def score_artists(artist_list, sample_count):
    """Generate `sample_count` images per artist with the current `pipeline` and score them with CLIP.

    Uses the module-level `pipeline`, `model`, `processor`, `device` and `dtype`.

    Returns:
        A list with one entry per artist; each entry is a list of `sample_count`
        image-prompt cosine similarities rounded to 4 decimals.
    """
    all_score = []
    for a in artist_list:
        prompt = [f"An image in the style of {a}"] * sample_count

        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
        scheduler.set_timesteps(100)

        # First half of the batch: artist prompt; second half: empty prompt (same order as chunk(2) below).
        cfg_prompt = prompt + [""] * sample_count
        token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids.to(device)
        embd = pipeline.text_encoder(token)[0]

        latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=dtype) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            latent_input = torch.cat([latent] * 2)
            latent_input = scheduler.scale_model_input(latent_input, timestep=t)

            noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
            cond_noise, uncond_noise = noise.chunk(2)
            # Guidance: move from the empty-prompt prediction towards the prompt prediction with weight 7.5.
            noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

            latent = scheduler.step(noise, t, latent).prev_sample

        # Rescale the latent before decoding (0.18215 presumably matches the VAE scaling factor - TODO confirm).
        latent /= 0.18215
        image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
        # Map decoder output from [-1, 1] to uint8 channel-last arrays, then to PIL images.
        image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
        image = (image * 255).round().astype("uint8")
        image = [Image.fromarray(i) for i in image]

        # Named `clip_input` so the builtin `input` is not shadowed.
        clip_input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
        output = model(**clip_input)
        similarity = torch.nn.functional.cosine_similarity(output.image_embeds, output.text_embeds)
        all_score.append(list(similarity.detach().cpu().numpy().round(4)))

        torch.cuda.empty_cache()
    return all_score


# Scores with the original UNet weights.
default_score = score_artists(artist, sample_count)

# Swap in the edited UNet weights (map_location lets a GPU-saved checkpoint load on CPU).
# NOTE: torch.load unpickles the file - only load checkpoints produced by this project.
pipeline.unet.load_state_dict(torch.load("model/erase10.pth", map_location=device))
torch.cuda.empty_cache()

# Scores for the same prompts with the edited weights.
erased_score = score_artists(artist, sample_count)

#
print(default_score)
print(f"{np.mean(default_score[:10]):.4f}    {np.std(default_score[:10]):.4f}")
print(erased_score)
print(f"{np.mean(erased_score[:10]):.4f}    {np.std(erased_score[:10]):.4f}")

In [1]:
import warnings

warnings.filterwarnings("ignore")

import yaml

import numpy as np

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPProcessor, CLIPModel

from PIL import Image

# Inference only: switch autograd off for everything below.
torch.set_grad_enabled(False)

#
version = "stabilityai/stable-diffusion-2-1-base"

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is only used on GPU; the CPU fallback runs in float32.
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
pipeline = StableDiffusionPipeline.from_pretrained(version, torch_dtype=dtype).to(device)

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

torch.cuda.empty_cache()

# Only the first 20 artists of the file are evaluated.
with open("data/valid_artist.yaml", 'r', encoding='utf-8') as file:
    artist = yaml.safe_load(file)[:20]

#
sample_count = 5

# Token id searched for in every row; presumably the tokenizer's end-of-text id - TODO confirm.
END_TOKEN_ID = 49407
# First masked position; presumably where the artist name starts in the prompt template - TODO confirm.
MASK_START = 7


def score_artists(artist_list, sample_count):
    """Generate `sample_count` images per artist from partially masked prompts and score them with CLIP.

    Before text encoding, token positions MASK_START up to (but excluding) the position
    right before the first END_TOKEN_ID are set to 0 in every row. The CLIP score is
    still computed against the unmasked prompt text.

    Uses the module-level `pipeline`, `model`, `processor`, `device` and `dtype`.

    Returns:
        A list with one entry per artist; each entry is a list of `sample_count`
        image-prompt cosine similarities rounded to 4 decimals.
    """
    all_score = []
    for a in artist_list:
        prompt = [f"An image in the style of {a}"] * sample_count

        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
        scheduler.set_timesteps(100)

        # First half of the batch: artist prompt; second half: empty prompt (same order as chunk(2) below).
        cfg_prompt = prompt + [""] * sample_count
        token = pipeline.tokenizer(cfg_prompt, padding="max_length", max_length=77, truncation=True, return_tensors="pt").input_ids
        for row in token:
            # Rows whose end token comes before MASK_START (e.g. the empty prompt) give an empty slice.
            end_index = row.tolist().index(END_TOKEN_ID)
            row[MASK_START:end_index - 1] = 0
        token = token.to(device)
        embd = pipeline.text_encoder(token)[0]

        latent = torch.randn((sample_count, 4, 64, 64), device=device, dtype=dtype) * scheduler.init_noise_sigma

        for t in scheduler.timesteps:
            latent_input = torch.cat([latent] * 2)
            latent_input = scheduler.scale_model_input(latent_input, timestep=t)

            noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
            cond_noise, uncond_noise = noise.chunk(2)
            # Guidance: move from the empty-prompt prediction towards the prompt prediction with weight 7.5.
            noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)

            latent = scheduler.step(noise, t, latent).prev_sample

        # Rescale the latent before decoding (0.18215 presumably matches the VAE scaling factor - TODO confirm).
        latent /= 0.18215
        image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
        # Map decoder output from [-1, 1] to uint8 channel-last arrays, then to PIL images.
        image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
        image = (image * 255).round().astype("uint8")
        image = [Image.fromarray(i) for i in image]

        # Named `clip_input` so the builtin `input` is not shadowed.
        clip_input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True).to(device)
        output = model(**clip_input)
        similarity = torch.nn.functional.cosine_similarity(output.image_embeds, output.text_embeds)
        all_score.append(list(similarity.detach().cpu().numpy().round(4)))

        torch.cuda.empty_cache()
    return all_score


# Scores with the original UNet weights.
default_score = score_artists(artist, sample_count)

# Swap in the edited UNet weights (map_location lets a GPU-saved checkpoint load on CPU).
# NOTE: torch.load unpickles the file - only load checkpoints produced by this project.
pipeline.unet.load_state_dict(torch.load("model/erase10.pth", map_location=device))
torch.cuda.empty_cache()

# Scores for the same prompts with the edited weights.
erased_score = score_artists(artist, sample_count)

# Statistics are reported separately for the first ten artists and for the rest.
print(default_score)
print(f"{np.mean(default_score[:10]):.4f}    {np.std(default_score[:10]):.4f}")
print(f"{np.mean(default_score[10:]):.4f}    {np.std(default_score[10:]):.4f}")
print(erased_score)
print(f"{np.mean(erased_score[:10]):.4f}    {np.std(erased_score[:10]):.4f}")
print(f"{np.mean(erased_score[10:]):.4f}    {np.std(erased_score[10:]):.4f}")

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

[[0.2403, 0.238, 0.2142, 0.2505, 0.2324], [0.3182, 0.2889, 0.3309, 0.3062, 0.3041], [0.1673, 0.1492, 0.1391, 0.1472, 0.1778], [0.1401, 0.1798, 0.1459, 0.1411, 0.1479], [0.1613, 0.1328, 0.1312, 0.1263, 0.1515], [0.2122, 0.1966, 0.1824, 0.2104, 0.2067], [0.1966, 0.206, 0.2071, 0.2485, 0.2036], [0.289, 0.2721, 0.2724, 0.2984, 0.3256], [0.2325, 0.2176, 0.1817, 0.2285, 0.2037], [0.2448, 0.2584, 0.2646, 0.2385, 0.2291], [0.0715, 0.0741, 0.1111, 0.0483, 0.1058], [0.2927, 0.3145, 0.2716, 0.3167, 0.2559], [0.2357, 0.2326, 0.1875, 0.1858, 0.1907], [0.1627, 0.1814, 0.1924, 0.1861, 0.1666], [0.2328, 0.2259, 0.2333, 0.2299, 0.2177], [0.1663, 0.2252, 0.179, 0.1751, 0.1492], [0.1559, 0.1367, 0.1274, 0.1526, 0.1266], [0.1439, 0.1972, 0.1601, 0.1799, 0.1951], [0.23, 0.2646, 0.2088, 0.2896, 0.2438], [0.1884, 0.2014, 0.1664, 0.1627, 0.2193]]
0.2158    0.0564
0.1914    0.0582
[[0.2462, 0.2604, 0.2428, 0.2222, 0.2172], [0.1997, 0.2053, 0.2404, 0.1879, 0.2027], [0.1363, 0.1571, 0.1485, 0.1444, 0.1627], [0.1