<a href="https://colab.research.google.com/github/vannoordenne/static./blob/main/webcam2interrogator2images.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab contains the main code used for the interactive art installation "static.", which served as a case study for the MSc thesis "Cracking the Code: Interactive Installation Art to increase Non-Expert User Engagement with AI Systems" by Marise van Noordenne.

This program is a combined adaptation of:

Image 2 prompt generation by CLIP Interrogator: 
https://colab.research.google.com/github/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator.ipynb

Prompt 2 image generation by Stable Diffusion: 
https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb

Displaying the generation process by Edan Myer: 
https://colab.research.google.com/drive/1_kbRZPTjnFgViPrmGcUsaszEdYa8XTpq?usp=sharing

In [None]:
#@title setup
# Print the GPU (if any) attached to this runtime.
!nvidia-smi
# Stable Diffusion stack: diffusers plus the transformers-based text encoder
# and its supporting packages.
!pip install diffusers
!pip install transformers scipy ftfy accelerate
# ipywidgets is pinned to the 7.x series.
# NOTE(review): presumably for Colab widget compatibility — confirm before unpinning.
!pip install "ipywidgets>=7,<8"
!pip install gradio
# Packages for the image -> prompt step (CLIP Interrogator and OpenCLIP).
!pip install open_clip_torch
!pip install clip-interrogator

from PIL import Image as Img
from IPython.display import display, Javascript, Image, HTML, clear_output
from google.colab.output import eval_js
from base64 import b64decode, b64encode

import csv
import subprocess
import os
import cv2
import numpy as np
import torch
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()

import gradio as gr
from clip_interrogator import Config, Interrogator

from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler, LMSDiscreteScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from tqdm import tqdm
from tqdm.auto import tqdm
from torch import autocast
from torch.nn import functional as F
from PIL import Image, ImageDraw
from huggingface_hub import notebook_login
from pathlib import Path

caption_model_name = 'blip-large' #@param ["blip-base", "blip-large", "git-large-coco"]
clip_model_name = 'ViT-L-14/openai' #@param ["ViT-L-14/openai", "ViT-H-14/laion2b_s32b_b79k"]

# Build the CLIP Interrogator that turns a webcam image into a text prompt.
config = Config()
config.clip_model_name = clip_model_name
config.caption_model_name = caption_model_name
ci = Interrogator(config)

# SECURITY: an access token must never be committed to the notebook. It is read
# from the HF_TOKEN environment variable instead; when the variable is unset the
# value is None and the download is attempted without an explicit token.
# The token that used to be embedded here was published with the notebook and
# should be revoked in the Hugging Face account settings.
auth_token = os.environ.get("HF_TOKEN") or None

# All generated images are written below this Google Drive folder; the notebook
# expects Drive to be mounted already (os.chdir fails otherwise).
path = r"/content/drive/MyDrive/Colab/imgs"
os.chdir(path)

file_path = "/content/drive/MyDrive/Colab/imgs/image.png"

sd_model_id = "CompVis/stable-diffusion-v1-4"
clip_text_model_id = "openai/clip-vit-large-patch14"

# 1. Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained(sd_model_id, subfolder="vae", use_auth_token=auth_token)
vae = vae.to(torch_device)

# 2. Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained(clip_text_model_id)
text_encoder = CLIPTextModel.from_pretrained(clip_text_model_id)
text_encoder = text_encoder.to(torch_device)

# 3. The UNet model for generating the latents.
unet = UNet2DConditionModel.from_pretrained(sd_model_id, subfolder="unet", use_auth_token=auth_token)
unet = unet.to(torch_device)

# 4. The noise scheduler that drives the denoising loop in produce_latents.
scheduler = DDIMScheduler(
    beta_start=0.00085, beta_end=0.012,
    beta_schedule='scaled_linear', num_train_timesteps=1000)

In [None]:
#@title prompt 2 img

def get_text_embeds(prompt):
  """Encode prompts for classifier-free guidance.

  Args:
    prompt: A single prompt string or a list of prompt strings.

  Returns:
    A tensor holding the unconditional (empty-prompt) embeddings followed by
    the prompt embeddings, concatenated along the first dimension, so its
    batch size is twice the number of prompts.
  """
  # A bare string would make len(prompt) below count characters instead of
  # prompts, producing an unconditional batch that does not match the prompt.
  if isinstance(prompt, str):
    prompt = [prompt]

  tokenizer_kwargs = dict(
      padding='max_length', max_length=tokenizer.model_max_length,
      return_tensors='pt')

  # Tokenize the prompts (truncated to the model's maximum length) and one
  # empty string per prompt for the unconditional branch.
  text_input = tokenizer(prompt, truncation=True, **tokenizer_kwargs)
  uncond_input = tokenizer([''] * len(prompt), **tokenizer_kwargs)

  with torch.no_grad():
    text_embeddings = text_encoder(text_input.input_ids.to(torch_device))[0]
    uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]

  # Order matters: produce_latents splits the UNet output as (uncond, text).
  return torch.cat([uncond_embeddings, text_embeddings])

# test_embeds = get_text_embeds(['portrait photo, Canon 60D, 50mm'])
# print(test_embeds)
# print(test_embeds.shape)


def produce_latents(text_embeddings, height=512, width=512,
                    num_inference_steps=50, guidance_scale=7.5, latents=None,
                    return_all_latents=False, start_step=10):
  """Run the guided denoising loop and return the resulting latents.

  Args:
    text_embeddings: Output of get_text_embeds (unconditional embeddings
      followed by prompt embeddings).
    height: Target image height in pixels; the latent height is height // 8.
    width: Target image width in pixels; the latent width is width // 8.
    num_inference_steps: Number of scheduler timesteps to set up.
    guidance_scale: Weight of the text-conditioned prediction relative to the
      unconditional one.
    latents: Optional starting latents. When None, random latents are drawn.
    return_all_latents: When True, return the starting latents and the latents
      after every step concatenated along dim 0; otherwise only the final ones.
    start_step: Index into the scheduler timesteps at which denoising starts.
      When > 0 the starting latents are first noised to that timestep and the
      earlier timesteps are skipped.

  Returns:
    The final latents, or all intermediate latents when return_all_latents.
  """
  if latents is None:
    # Half of the embedding batch is the unconditional branch, hence // 2.
    # `unet.config.in_channels` replaces the deprecated `unet.in_channels`.
    latents = torch.randn((text_embeddings.shape[0] // 2, unet.config.in_channels,
                           height // 8, width // 8))
  latents = latents.to(torch_device)

  scheduler.set_timesteps(num_inference_steps)

  if start_step > 0:
    start_timestep = scheduler.timesteps[start_step]
    start_timesteps = start_timestep.repeat(latents.shape[0]).long()

    noise = torch.randn_like(latents)
    latents = scheduler.add_noise(latents, noise, start_timesteps)

  # Only keep the per-step history when the caller asked for it.
  latent_list = [latents] if return_all_latents else None

  with autocast('cuda'):
    for t in tqdm(scheduler.timesteps[start_step:]):
      # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
      latent_model_input = torch.cat([latents] * 2)

      # predict the noise residual
      with torch.no_grad():
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=text_embeddings)['sample']

      # perform guidance
      noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
      noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

      # compute the previous noisy sample x_t -> x_t-1
      latents = scheduler.step(noise_pred, t, latents)['prev_sample']
      if latent_list is not None:
        latent_list.append(latents)

    if latent_list is None:
      return latents

    return torch.cat(latent_list, dim=0)


# test_latents = produce_latents(test_embeds)
# print(test_latents)
# print(test_latents.shape)


def decode_img_latents(latents):
  """Decode a batch of latents into a list of PIL images.

  The latents are first multiplied by 1 / 0.18215, undoing the 0.18215 factor
  that encode_img_latents applies, then passed through the VAE decoder. The
  decoder output is mapped from [-1, 1] to [0, 255] uint8 pixels.
  """
  unscale = 1 / 0.18215
  rescaled = latents * unscale

  with torch.no_grad():
    decoded = vae.decode(rescaled)

  # Map the decoder output from [-1, 1] to [0, 1], clipping anything outside.
  unit_range = (decoded.sample / 2 + 0.5).clamp(0, 1)

  # Move to host memory with channels as the last axis, as PIL expects.
  channels_last = unit_range.detach().cpu().permute(0, 2, 3, 1).numpy()
  pixels = (channels_last * 255).round().astype('uint8')

  return list(map(Image.fromarray, pixels))

# imgs = decode_img_latents(test_latents)
# imgs[0]

def encode_img_latents(imgs):
  """Encode one image or a sequence of images into scaled VAE latents.

  Args:
    imgs: A single image, or a list/tuple of images, each convertible with
      np.array. All images must share the same size so they can be stacked.
      # NOTE(review): assumes 3-channel (H, W, C) uint8 images — confirm
      # callers convert RGBA/greyscale inputs to RGB first.

  Returns:
    Latents sampled from the VAE's encoding distribution, multiplied by
    0.18215 (the factor decode_img_latents divides out again).
  """
  # Tuples are treated like lists; anything else is a single image.
  if not isinstance(imgs, (list, tuple)):
    imgs = [imgs]

  img_arr = np.stack([np.array(img) for img in imgs], axis=0)
  img_arr = img_arr / 255.0
  # Channels-last -> channels-first, then map [0, 1] to [-1, 1].
  img_arr = torch.from_numpy(img_arr).float().permute(0, 3, 1, 2)
  img_arr = 2 * (img_arr - 0.5)

  # Inference only: skip autograd bookkeeping, matching decode_img_latents.
  with torch.no_grad():
    latent_dists = vae.encode(img_arr.to(torch_device))[0]
    latent_samples = latent_dists.sample()

  return latent_samples * 0.18215

def prompt_to_img(prompts, height, width, num_inference_steps=50,
                  guidance_scale=7.5, latents=None, return_all_latents=False, batch_size=1, start_step=0, folder=0):
  """Generate images for the given prompt(s) and save each one as a PNG.

  Args:
    prompts: A prompt string or a list of prompt strings.
    height: Image height in pixels.
    width: Image width in pixels.
    num_inference_steps: Number of scheduler timesteps.
    guidance_scale: Classifier-free guidance weight.
    latents: Optional starting latents forwarded to produce_latents.
    return_all_latents: When True, every intermediate denoising state is
      decoded and saved, not only the final one.
    batch_size: Number of latents decoded per VAE call.
    start_step: Scheduler step index at which denoising starts.
    folder: Name of the sub-folder (under the Drive image folder) that receives
      the files; it is created if missing.

  Returns:
    The list of decoded PIL images, in latent order. Image k is also written to
    <folder>/image<k>.png.
  """
  if isinstance(prompts, str):
    prompts = [prompts]

  # Prompts -> text embeds
  text_embeds = get_text_embeds(prompts)

  # Text embeds -> img latents
  latents = produce_latents(
      text_embeds, height=height, width=width, latents=latents,
      num_inference_steps=num_inference_steps, guidance_scale=guidance_scale,
      return_all_latents=return_all_latents, start_step=start_step)

  # Saving used to fail when the per-run folder had not been created by hand.
  out_dir = os.path.join("/content/drive/MyDrive/Colab/imgs", str(folder))
  os.makedirs(out_dir, exist_ok=True)

  # Img latents -> imgs
  all_imgs = []
  for start in tqdm(range(0, len(latents), batch_size)):
    imgs = decode_img_latents(latents[start:start + batch_size])
    # Save every image of the batch; previously only the first image of each
    # batch was written when batch_size > 1.
    for offset, img in enumerate(imgs):
      img.save(os.path.join(out_dir, f"image{start + offset}.png"))
    all_imgs.extend(imgs)

  return all_imgs

def image_to_prompt(image, mode):
    """Describe an image as a text prompt using the CLIP Interrogator.

    `mode` selects the interrogation method: 'best', 'classic', 'fast' or
    'negative'. Any other value yields None. The interrogator's chunk size and
    intermediate flavor count are set to 2048 for the ViT-L-14/openai CLIP
    model and 1024 otherwise before interrogating.
    """
    budget = 2048 if ci.config.clip_model_name == "ViT-L-14/openai" else 1024
    ci.config.chunk_size = budget
    ci.config.flavor_intermediate_count = budget

    rgb_image = image.convert('RGB')

    # Mode name -> name of the Interrogator method that implements it.
    method_for_mode = (
        ('best', 'interrogate'),
        ('classic', 'interrogate_classic'),
        ('fast', 'interrogate_fast'),
        ('negative', 'interrogate_negative'),
    )
    for known_mode, method_name in method_for_mode:
        if mode == known_mode:
            return getattr(ci, method_name)(rgb_image)
    return None

In [None]:
#@title webcam 2 prompt
import time

# --- Cell configuration: constant for the whole run, so it is set once here
# --- instead of being re-assigned on every loop iteration.
folder_path = "/content/drive/MyDrive/Colab/imgs"
prompt_mode = 'fast' #@param ["best","fast","classic","negative"]
output_mode = 'desc.csv'
max_filename_len = 128 #@param {type:"integer"}

height = 600 #@param {type:"integer"}
width = 800 #@param {type:"integer"}
num_inference_steps = 30 #@param {type:"integer"}
guidance_scale = 7 #@param {type:"integer"}

n_images = 5 #@param {type:"integer"}

# Seconds to wait before looking for webcam.png again when none was found.
poll_interval_s = 1.0


def sanitize_for_filename(prompt: str, max_len: int) -> str:
  """Keep only filename-safe characters and leave room for a 4-char extension."""
  name = "".join(c for c in prompt if (c.isalnum() or c in ",._-! "))
  name = name.strip()[:(max_len-4)] # extra space for extension
  return name


ci.config.quiet = True

# Main installation loop: caption the current webcam frame, store the prompt,
# then render n_images generation sequences from it. Runs until interrupted.
# image_to_prompt and prompt_to_img come from the "prompt 2 img" cell.
while True:
  # Only the single file named webcam.png is ever processed.
  webcam_file = os.path.join(folder_path, 'webcam.png')
  files = ['webcam.png'] if os.path.isfile(webcam_file) else []

  prompts = []
  for file in tqdm(files, desc='Generating prompts'):
    # Convert inside the context manager so the file handle is released
    # right away instead of lingering until garbage collection.
    with Image.open(os.path.join(folder_path, file)) as source_image:
      image = source_image.convert('RGB')
    prompt = image_to_prompt(image, prompt_mode)
    prompts.append(prompt)

    print(prompt)
    thumb = image.copy()
    thumb.thumbnail([256, 256])
    display(thumb)

  if not prompts:
    # Previously execution fell through to prompts[0] below and the loop died
    # with an IndexError; now wait briefly and look for a new frame instead.
    print(f"Sorry, I couldn't find any images in {folder_path}")
    time.sleep(poll_interval_s)
    continue

  if output_mode == 'desc.csv':
    csv_path = os.path.join(folder_path, 'desc.csv')
    with open(csv_path, 'w', encoding='utf-8', newline='') as f:
      w = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
      w.writerow(['image', 'prompt'])
      for file, prompt in zip(files, prompts):
        w.writerow([file, prompt])
    print(f"\n\n\n\nGenerated {len(prompts)} prompts and saved to {csv_path}, enjoy!")

    # The latest prompt is also written to a plain text file.
    txt_path = os.path.join("/content/drive/MyDrive/Colab", 'prompt.txt')
    with open(txt_path, 'w', encoding='utf-8') as f:
      f.write(prompts[0])

    print(prompts[0])
  else:
    print(f"\n\n\n\nGenerated {len(prompts)} prompts and renamed your files, enjoy!")

  # Render n_images independent runs; run i saves every intermediate frame
  # into sub-folder i of the image folder.
  for i in range(n_images):
    images = prompt_to_img(prompts[0], height=height, width=width, num_inference_steps=num_inference_steps, guidance_scale=guidance_scale, return_all_latents=True, folder=i)