In [None]:
import os
import random
import warnings

warnings.filterwarnings("ignore")

import yaml

import pandas as pd
import numpy as np

from tqdm.auto import tqdm

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPModel, CLIPProcessor

from PIL import Image


#
# Inference only: switch autograd off globally so no graph is ever recorded.
torch.set_grad_enabled(False)

# Use the first CUDA device when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Stable Diffusion pipeline, loaded with half-precision weights and moved to `device`.
pipeline = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base",
    torch_dtype=torch.float16,
)
pipeline = pipeline.to(device)

# CLIP model and its matching pre-processor; both are left on their default device.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Hand cached, unused GPU memory back to the driver after loading.
torch.cuda.empty_cache()


#
# Artist whose name seeds every prompt; it is also printed as the heading of
# the preview at the end of this cell.
artist = "Pablo Picasso"
prompt = [f"An image in the style of {artist}"] * 5 + [artist] * 5

prompt_count = len(prompt)
seed = 5
num_inference_steps = 100
guidance_scale = 7.5

scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    num_train_timesteps=1000,
)
scheduler.set_timesteps(num_inference_steps)

# Append one empty prompt per real prompt: the first half of the batch is the
# conditional branch, the second half the unconditional branch used for
# guidance in the loop below.
prompt = prompt + [""] * prompt_count

# Pad/truncate every prompt to 77 tokens so the batch is rectangular.
token = pipeline.tokenizer(
    prompt,
    padding="max_length",
    max_length=77,
    truncation=True,
    return_tensors="pt",
).input_ids.to(device)
embd = pipeline.text_encoder(token)[0]

# Seeded generator so the starting noise is reproducible. Every prompt gets its
# own noise sample, created in the UNet's dtype and scaled by the scheduler's
# initial sigma.
generator = torch.Generator(device=device).manual_seed(seed)
latent = torch.randn(
    (prompt_count, 4, 64, 64),
    generator=generator,
    device=device,
    dtype=pipeline.unet.dtype,
) * scheduler.init_noise_sigma

for t in scheduler.timesteps:
    # Duplicate the latents so the conditional and unconditional predictions
    # come out of a single UNet call (the embeddings hold both halves).
    latent_input = torch.cat([latent] * 2)
    latent_input = scheduler.scale_model_input(latent_input, timestep=t)

    noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
    cond_noise, uncond_noise = noise.chunk(2)
    # Push the prediction away from the unconditional one, towards the prompt.
    noise = uncond_noise + guidance_scale * (cond_noise - uncond_noise)

    latent = scheduler.step(noise, t, latent).prev_sample

# NOTE(review): 0.18215 is presumably the VAE latent scaling factor for this
# checkpoint -- confirm against pipeline.vae.config before changing models.
latent /= 0.18215
image = pipeline.vae.decode(latent).sample.detach().cpu().numpy()
# Shift/scale by (x + 1) / 2, clamp to [0, 1], and move axis 1 to the end so
# each item can be handed to PIL.
image = ((image + 1) / 2).clip(0, 1).transpose(0, 2, 3, 1)
image = (image * 255).round().astype("uint8")
image = [Image.fromarray(i) for i in image]

torch.cuda.empty_cache()

# Preview: heading followed by a 64x64 thumbnail of every generated image.
print(artist)
for i in image:
    display(i.resize((64, 64)))


#
# One caption per generated image, in image order; caption i is scored against
# image i below.
prompt = ["Pablo Picasso", "An image in the style of Pablo Picasso", "A painting of an artist", "", ""] * 2

# The similarity below is computed row by row, so the two batches must line up.
if len(prompt) != len(image):
    raise ValueError(
        f"expected one caption per image, got {len(prompt)} captions for {len(image)} images"
    )

# Named `clip_input` rather than `input` so the builtin stays usable afterwards.
clip_input = processor(text=prompt, images=image, return_tensors="pt", padding=True, do_rescale=True)
output = model(**clip_input)
image_embd = output.image_embeds
text_embd = output.text_embeds

# Cosine similarity between each image embedding and the text embedding in the
# same row, rounded to three decimals for display.
score = torch.nn.functional.cosine_similarity(image_embd, text_embd).numpy().round(3)

display(pd.DataFrame({"prompt": prompt, "score": score}))