## Use this notebook to learn about Stable Diffusion or Latent Diffusion Models

In [1]:
## The necessary imports
import util
import numpy as np
from PIL import Image
#from google.colab import files
# from stable_diffusion import util
# from stable_diffusion import pipeline
# from stable_diffusion import model_loader
# models = model_loader.preload_models('cpu') # Assume the device is the CPU

### The Inference step

In [None]:
def generate(prompts,
             unconditional_prompts=None,
             input_images=None,
             strength=0.8,
             do_cfg=True,
             cfg_scale=7.5,
             height=512,
             width=512,
             sampler="k_lms",
             n_inference_steps=5,
             models=None,
             seed=None,
             device=None,
             idle_device=None):
    """Validate the inputs, encode the prompts with CLIP and select a sampler.

    Args:
        prompts: Non-empty list or tuple of prompts.
        unconditional_prompts: Optional list or tuple of prompts used for the
            unconditional branch; defaults to one empty string per prompt.
        input_images: Optional list or tuple with one entry per prompt.
        strength: Value in the closed range [0, 1].
        do_cfg: When true, both the conditional and the unconditional prompts
            are encoded and their contexts are concatenated.
        cfg_scale: Not read by the part of the function shown in this cell.
        height: Must be a multiple of 8.
        width: Must be a multiple of 8.
        sampler: One of "k_lms", "k_euler" or "k_euler_ancestral".
        n_inference_steps: Forwarded to the sampler constructor.
        models: Optional dict; the "clip" entry is used when present,
            otherwise ``model_loader.clip(device)`` is called.
        seed: Seed for the random generator; a fresh seed is drawn when None.
        device: Device used for the tensors and the CLIP model; defaults to
            CUDA when available, otherwise the CPU.
        idle_device: When given, the CLIP model is moved there after use.

    Raises:
        ValueError: If any argument fails the validation described above.
    """
    if models is None:
        models = {}
    with torch.no_grad():
        if not prompts or not isinstance(prompts, (list, tuple)):
            raise ValueError("prompts must be a list or tuple")
        if unconditional_prompts and not isinstance(unconditional_prompts, (list, tuple)):
            raise ValueError("unconditional_prompts must be a list or tuple if provided")
        # Default to one empty unconditional prompt per input prompt
        unconditional_prompts = unconditional_prompts or [""] * len(prompts)

        # Check for input images (the container type of input_images itself)
        if input_images and not isinstance(input_images, (list, tuple)):
            raise ValueError("input_images must be a non-empty list or tuple if provided")
        if input_images and len(prompts) != len(input_images):
            raise ValueError(f"Length of input_images {input_images} must be the same as the length of prompts {prompts}")
        if not 0 <= strength <= 1:
            raise ValueError(f"{strength} must be between 0 and 1")

        # Check for the height and width dimensions
        if height % 8 or width % 8:
            raise ValueError("Height and Width must be multiples of 8")

        # Check for the availability of device
        if device is None:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        if idle_device:
            def to_idle(module):
                return module.to(idle_device)
        else:
            def to_idle(module):
                return module

        # generator (the class is torch.Generator; torch.generator does not exist)
        generator = torch.Generator(device=device)
        if seed is None:
            generator.seed()
        else:
            generator.manual_seed(seed)

        # Text tokenizer
        tokenizer = Tokenizer()
        clip = models.get("clip") or model_loader.clip(device)
        clip.to(device)

        # Use the datatype (dtype) of the model weights as the default dtype
        dtype = clip.embedding.position_value.dtype
        if do_cfg:
            conditional_tokens = tokenizer.encode_batch(prompts)
            conditional_tokens = torch.tensor(conditional_tokens, dtype=torch.long, device=device)
            conditional_context = clip(conditional_tokens)
            unconditional_tokens = tokenizer.encode_batch(unconditional_prompts)
            unconditional_tokens = torch.tensor(unconditional_tokens, dtype=torch.long, device=device)
            unconditional_context = clip(unconditional_tokens)
            # Conditional rows come first, unconditional rows second
            context = torch.cat([conditional_context, unconditional_context])
        else:
            tokens = tokenizer.encode_batch(prompts)
            tokens = torch.tensor(tokens, dtype=torch.long, device=device)
            context = clip(tokens)
        # CLIP is no longer needed: park it on the idle device and drop the references
        to_idle(clip)
        del tokenizer, clip

        # Obtain different samplers based on the input requirements
        if sampler == "k_lms":
            sampler = KLMSSampler(n_inference_steps=n_inference_steps)
        elif sampler == "k_euler":
            sampler = KEulerSampler(n_inference_steps=n_inference_steps)
        elif sampler == "k_euler_ancestral":
            sampler = KEulerAncestralSampler(n_inference_steps=n_inference_steps)
        else:
            raise ValueError("Unknown sampler value")

        # One 4-channel latent per prompt at 1/8 of the requested resolution
        noise_shape = (len(prompts), 4, height//8, width//8)
        # NOTE(review): the function ends here without returning anything, while
        # the notebook indexes the result of generate(...) — the denoising and
        # decoding steps still need to be added.

In [None]:
# A single prompt string; generate() expects a list of strings, so it is
# wrapped exactly once below (wrapping a list again would nest the lists).
prompt = "a photograph of an astronaut riding a horse" #@param {type: "string"}
prompts = [prompt]

# An empty unconditional prompt falls back to generate()'s own default
unconditional_prompt = "" #@param {type: "string"}
unconditional_prompts = [unconditional_prompt] if unconditional_prompt else None

upload_input_image = False #@param {type: "boolean"}
input_images = None
if upload_input_image:
    # NOTE(review): `files` is only available when the commented-out
    # `from google.colab import files` import is enabled.
    print("Upload an input image")
    path = next(iter(files.upload()))
    input_images = [Image.open(path)]

strength = 0.8 #@param {type: "slider", min:0, max:1, step:0.01}
do_cfg = True #@param {type: "boolean"}
# Previously undefined although it is passed to generate(); same value as
# generate()'s default.
cfg_scale = 7.5 #@param {type: "number"}
height = 512 #@param {type: "integer"}
width = 512 #@param {type: "integer"}
sampler = "k_lms" #@param ["k_lms", "k_euler", "k_euler_ancestral"]
n_inference_steps = 50 #@param {type: "integer"}

# Set use_seed to True for reproducible runs
use_seed = False
seed = 42 if use_seed else None

## Generate the image based on the set of prompts given above
# NOTE(review): `models` is not defined in the visible cells — the
# `models = model_loader.preload_models('cpu')` line in the imports cell is
# commented out and must be enabled before running this cell.
images = generate(prompts=prompts, unconditional_prompts=unconditional_prompts,
                  input_images=input_images, strength=strength,
                  do_cfg=do_cfg, cfg_scale=cfg_scale,
                  height=height, width=width, sampler=sampler,
                  n_inference_steps=n_inference_steps, seed=seed,
                  models=models, device='cpu', idle_device='cpu')[0]

### Sampler and Timesteps
- A sampler generates the noise that is added to the latents during the decoding process.
- There are different types of samplers based on performance criteria such as **speed**, **accuracy** and **image quality**.
- Taking the inputs **n_inference_steps** and **n_training_steps**, create a timestep as **(n_training_steps//n_inference_steps)**.
- For each of the above timesteps, obtain an **embedding** of a specified length

In [None]:
class KEulerSampler():
    """Euler sampler that steps latents along a decreasing sigma schedule.

    The schedule is built from ``util.get_alphas_cumprod`` (presumably the
    cumulative product of the alphas, one value per training step — TODO
    confirm against ``util``), interpolated in log space at
    ``n_inference_steps`` evenly spaced timesteps and terminated with 0.
    """

    def __init__(self, n_inference_steps=50, n_training_steps=1000):
        """Build the sigma schedule.

        Args:
            n_inference_steps: Number of timesteps sampled for inference.
            n_training_steps: Number of training timesteps; forwarded to
                ``util.get_alphas_cumprod``.
        """
        # Evenly spaced (generally fractional) timesteps, from the last
        # training step down to 0
        timesteps = np.linspace(n_training_steps - 1, 0, n_inference_steps)
        alphas_cumprod = util.get_alphas_cumprod(n_training_steps=n_training_steps)
        sigmas = self._sigmas_from_alphas_cumprod(alphas_cumprod)
        # Interpolate in log space so fractional timesteps get a sigma too
        log_sigmas = np.interp(timesteps, range(n_training_steps), np.log(sigmas))
        # The trailing 0 is the sigma reached after the final step
        sigmas = np.append(np.exp(log_sigmas), 0)

        self.sigmas = sigmas
        self.initial_scale = sigmas.max()
        self.timesteps = timesteps
        self.n_inference_steps = n_inference_steps
        self.n_training_steps = n_training_steps
        self.step_count = 0

    @staticmethod
    def _sigmas_from_alphas_cumprod(alphas_cumprod):
        """Return ``sqrt((1 - alphas_cumprod) / alphas_cumprod)``.

        The square root must cover the whole ratio.  Writing
        ``(1 - a) / (a) ** 0.5`` binds the power to the denominator only and
        yields ``(1 - a) / sqrt(a)``, which is inconsistent with the
        ``1 / sqrt(sigma ** 2 + 1)`` scaling used by ``get_input_scale``.
        """
        return ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5

    def get_input_scale(self, step_count=None):
        """Return ``1 / sqrt(sigma ** 2 + 1)`` for the given step.

        Args:
            step_count: Index into the sigma schedule; the sampler's current
                step is used when None.
        """
        if step_count is None:
            step_count = self.step_count
        sigma = self.sigmas[step_count]
        return 1 / (sigma ** 2 + 1) ** 0.5

    def set_strength(self, strength=1):
        """Skip the first part of the schedule according to ``strength``.

        ``strength=1`` keeps every step; smaller values start later in the
        schedule.  Updates ``timesteps``, ``initial_scale`` and
        ``step_count``.
        """
        start_step = self.n_inference_steps - int(self.n_inference_steps * strength)
        all_timesteps = np.linspace(self.n_training_steps - 1, 0, self.n_inference_steps)
        self.timesteps = all_timesteps[start_step:]
        self.initial_scale = self.sigmas[start_step]
        self.step_count = start_step

    def step(self, latents, output):
        """Advance one step and return the updated latents.

        ``latents`` is updated in place (``+=``) and also returned; the
        internal step counter is incremented.
        """
        t = self.step_count
        self.step_count += 1
        sigma_from = self.sigmas[t]
        sigma_to = self.sigmas[t + 1]
        latents += output * (sigma_to - sigma_from)
        return latents