In [1]:
import os
import random
import warnings

warnings.filterwarnings("ignore")

import lpips

from tqdm.auto import tqdm

import numpy as np
import pandas as pd

import torch
from diffusers import StableDiffusionPipeline, LMSDiscreteScheduler
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms

from PIL import Image


df = pd.read_csv("../data/artists1734_prompts.csv")

artists = list(df.artist.unique())
random.shuffle(artists)

prompts = artists[:20]
seeds = [random.randint(0, 5000) for _ in prompts]

prev_prompts = prompts[:10]
new_prompts = ["art"] * 10
retain_prompts = artists[10:]

In [5]:
def delete_pipeline(pipeline):
    del pipeline.vae
    del pipeline.tokenizer
    del pipeline.text_encoder
    del pipeline.unet
    del pipeline
    torch.cuda.empty_cache()

def preprocess_images(images):
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    images_ = []
    for image in images:
        image = preprocess(image)
        images_.append(image)
    images_ = torch.stack(images_)

    return images_

@torch.no_grad()
def compare_images(images1, images2, prompts):

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

    images1 = preprocess_images(images1)
    images2 = preprocess_images(images2)

    inputs = processor(text=prompts, images=images1, return_tensors="pt", padding=True, do_rescale=False)
    outputs = model(**inputs)
    image_embds1 = outputs.image_embeds

    inputs = processor(text=prompts, images=images2, return_tensors="pt", padding=True, do_rescale=False)
    outputs = model(**inputs)
    image_embds2 = outputs.image_embeds

    text_embds = outputs.text_embeds

    clip_score1 = torch.nn.functional.cosine_similarity(image_embds1, text_embds).numpy().round(3)
    clip_score2 = torch.nn.functional.cosine_similarity(image_embds2, text_embds).numpy().round(3)

    loss_function = lpips.LPIPS(net='alex')
    images1 = images1 * 2 - 1
    images2 = images2 * 2 - 1

    return pd.DataFrame({"CLIP 1": clip_score1, "CLIP 2": clip_score2,
                         "CLIP diff": clip_score1 - clip_score2,
                         "LPIPS diff": loss_function(images1, images2).squeeze().detach().numpy().round(3)})

@torch.no_grad()
def generate_images(pipeline, prompts, seeds, step_count=100):

    device = pipeline.device
    images = []
    for idx, prompt in enumerate(tqdm(prompts)):

        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", num_train_timesteps=1000)
        scheduler.set_timesteps(step_count)

        prompt = [prompt] + [""]
        
        token = pipeline.tokenizer(
            prompt,
            padding="max_length",
            max_length=77,
            truncation=True,
            return_tensors="pt"
        ).input_ids.to(device)
        
        embd = pipeline.text_encoder(token)[0]
        
        generator = torch.Generator(device=device).manual_seed(seeds[idx])
        
        latent = torch.randn((1, 4, 64, 64), generator=generator, device=device, dtype=torch.float16)
        latent *= scheduler.init_noise_sigma
        
        for t in scheduler.timesteps:
            latent_input = torch.cat([latent] * 2)
            latent_input = scheduler.scale_model_input(latent_input, timestep=t)
            
            noise = pipeline.unet(latent_input, t, encoder_hidden_states=embd).sample
            cond_noise, uncond_noise = noise.chunk(2)
            noise = uncond_noise + 7.5 * (cond_noise - uncond_noise)
            
            latent = scheduler.step(noise, t, latent).prev_sample
        
        image = pipeline.vae.decode(latent).sample
        image = ((image + 1) / 2).clamp(0, 1).permute(0, 2, 3, 1)
        image = image.detach().cpu().numpy()
        image = (image * 255).round().astype("uint8")
        image = Image.fromarray(image[0])
        images.append(image)
    
    return images


In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
pipeline = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1-base", torch_dtype=torch.float16).to(device)

step100_images = generate_images(pipeline, prompts, seeds)
step50_images = generate_images(pipeline, prompts, seeds, 50)

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [7]:
compare_images(step100_images, step50_images, prompts)

Setting up [LPIPS] perceptual loss: trunk [alex], v[0.1], spatial [off]
Loading model from: c:\Users\yoonj\AppData\Local\Programs\Python\Python312\Lib\site-packages\lpips\weights\v0.1\alex.pth


Unnamed: 0,CLIP 1,CLIP 2,CLIP diff,LPIPS diff
0,0.266,0.267,-0.001,0.003
1,0.317,0.319,-0.002,0.005
2,0.228,0.23,-0.002,0.001
3,0.125,0.127,-0.002,0.002
4,0.288,0.289,-0.001,0.002
5,0.19,0.211,-0.021,0.063
6,0.26,0.267,-0.007,0.05
7,0.246,0.244,0.002,0.012
8,0.278,0.282,-0.004,0.002
9,0.212,0.209,0.003,0.002
