Standardize on using image argument in all pipelines (huggingface#1361)

* feat: switch core pipelines to use image arg
* test: update tests for core pipelines
* feat: switch examples to use image arg
* docs: update docs to use image arg
* style: format code using black and doc-builder
* fix: deprecate use of init_image in all pipelines
tcapelle · Dec 12, 2022 · 210bab3 · 210bab3
1 parent 520281a
commit 210bab3
Show file tree

Hide file tree

Showing 27 changed files with 264 additions and 223 deletions.
diff --git a/README.md b/README.md
@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```

diff --git a/docs/source/api/pipelines/cycle_diffusion.mdx b/docs/source/api/pipelines/cycle_diffusion.mdx
@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.8,
@@ -83,7 +83,7 @@ torch.manual_seed(0)
 image = pipe(
     prompt=prompt,
     source_prompt=source_prompt,
-    init_image=init_image,
+    image=init_image,
     num_inference_steps=100,
     eta=0.1,
     strength=0.85,

diff --git a/docs/source/api/pipelines/overview.mdx b/docs/source/api/pipelines/overview.mdx
@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```

diff --git a/docs/source/using-diffusers/custom_pipeline_examples.mdx b/docs/source/using-diffusers/custom_pipeline_examples.mdx
@@ -177,7 +177,7 @@ init_image = download_image(
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 ### Inpainting
 
@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 
 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.

diff --git a/docs/source/using-diffusers/img2img.mdx b/docs/source/using-diffusers/img2img.mdx
@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 images[0].save("fantasy_landscape.png")
 ```

diff --git a/examples/community/README.md b/examples/community/README.md
@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di
 
 prompt = "A fantasy landscape, trending on artstation"
 
-images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
+images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
 
 ### Inpainting
 
@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
 mask_image = download_image(mask_url).resize((512, 512))
 
 prompt = "a cat sitting on a bench"
-images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
+images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
 ```
 
 As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
 init_image = init_image.resize((512, 512))
 res = pipe.train(
     prompt,
-    init_image,
+    image=init_image,
     guidance_scale=7.5,
     num_inference_steps=50,
     generator=generator)

diff --git a/examples/community/imagic_stable_diffusion.py b/examples/community/imagic_stable_diffusion.py
@@ -17,7 +17,7 @@
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
-from diffusers.utils import logging
+from diffusers.utils import deprecate, logging
 
 # TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
@@ -133,7 +133,7 @@ def disable_attention_slicing(self):
     def train(
         self,
         prompt: Union[str, List[str]],
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         height: Optional[int] = 512,
         width: Optional[int] = 512,
         generator: Optional[torch.Generator] = None,
@@ -184,6 +184,10 @@ def train(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
+
         accelerator = Accelerator(
             gradient_accumulation_steps=1,
             mixed_precision="fp16",
@@ -241,14 +245,14 @@ def train(
             lr=embedding_learning_rate,
         )
 
-        if isinstance(init_image, PIL.Image.Image):
-            init_image = preprocess(init_image)
+        if isinstance(image, PIL.Image.Image):
+            image = preprocess(image)
 
         latents_dtype = text_embeddings.dtype
-        init_image = init_image.to(device=self.device, dtype=latents_dtype)
-        init_latent_image_dist = self.vae.encode(init_image).latent_dist
-        init_image_latents = init_latent_image_dist.sample(generator=generator)
-        init_image_latents = 0.18215 * init_image_latents
+        image = image.to(device=self.device, dtype=latents_dtype)
+        init_latent_image_dist = self.vae.encode(image).latent_dist
+        image_latents = init_latent_image_dist.sample(generator=generator)
+        image_latents = 0.18215 * image_latents
 
         progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
         progress_bar.set_description("Steps")
@@ -259,12 +263,12 @@ def train(
         for _ in range(text_embedding_optimization_steps):
             with accelerator.accumulate(text_embeddings):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
 
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
 
                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
@@ -301,12 +305,12 @@ def train(
         for _ in range(model_fine_tuning_optimization_steps):
             with accelerator.accumulate(self.unet.parameters()):
                 # Sample noise that we'll add to the latents
-                noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
-                timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
+                noise = torch.randn(image_latents.shape).to(image_latents.device)
+                timesteps = torch.randint(1000, (1,), device=image_latents.device)
 
                 # Add noise to the latents according to the noise magnitude at each timestep
                 # (this is the forward diffusion process)
-                noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
+                noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
 
                 # Predict the noise residual
                 noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample

diff --git a/examples/community/lpw_stable_diffusion.py b/examples/community/lpw_stable_diffusion.py
@@ -555,7 +555,7 @@ def __call__(
         self,
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
+        image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
         height: int = 512,
         width: int = 512,
@@ -583,11 +583,11 @@ def __call__(
             negative_prompt (`str` or `List[str]`, *optional*):
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -605,11 +605,11 @@ def __call__(
                 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                 usually at the expense of lower image quality.
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
                 The number of images to generate per prompt.
             eta (`float`, *optional*, defaults to 0.0):
@@ -648,6 +648,9 @@ def __call__(
             list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
             (nsfw) content, according to the `safety_checker`.
         """
+        message = "Please use `image` instead of `init_image`."
+        init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
+        image = init_image or image
 
         if isinstance(prompt, str):
             batch_size = 1
@@ -714,7 +717,7 @@ def __call__(
         mask = None
         noise = None
 
-        if init_image is None:
+        if image is None:
             # get the initial random noise unless the user supplied it
 
             # Unlike in other pipelines, latents need to be generated in the target device
@@ -753,11 +756,11 @@ def __call__(
             # scale the initial noise by the standard deviation required by the scheduler
             latents = latents * self.scheduler.init_noise_sigma
         else:
-            if isinstance(init_image, PIL.Image.Image):
-                init_image = preprocess_image(init_image)
+            if isinstance(image, PIL.Image.Image):
+                image = preprocess_image(image)
             # encode the init image into latents and scale the latents
-            init_image = init_image.to(device=self.device, dtype=latents_dtype)
-            init_latent_dist = self.vae.encode(init_image).latent_dist
+            image = image.to(device=self.device, dtype=latents_dtype)
+            init_latent_dist = self.vae.encode(image).latent_dist
             init_latents = init_latent_dist.sample(generator=generator)
             init_latents = 0.18215 * init_latents
             init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
@@ -772,7 +775,7 @@ def __call__(
 
                 # check sizes
                 if not mask.shape == init_latents.shape:
-                    raise ValueError("The mask and init_image should be the same size!")
+                    raise ValueError("The mask and image should be the same size!")
 
             # get the original timestep using init_timestep
             offset = self.scheduler.config.get("steps_offset", 0)
@@ -961,7 +964,7 @@ def text2img(
 
     def img2img(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
         strength: float = 0.8,
@@ -980,7 +983,7 @@ def img2img(
         r"""
         Function for image-to-image generation.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process.
             prompt (`str` or `List[str]`):
@@ -989,11 +992,11 @@ def img2img(
                 The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
                 if `guidance_scale` is less than `1`).
             strength (`float`, *optional*, defaults to 0.8):
-                Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
-                `init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
+                `image` will be used as a starting point, adding more noise to it the larger the `strength`. The
                 number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
                 noise will be maximum and the denoising process will run for the full number of iterations specified in
-                `num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
+                `num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                 expense of slower inference. This parameter will be modulated by `strength`.
@@ -1035,7 +1038,7 @@ def img2img(
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,
             strength=strength,
@@ -1052,7 +1055,7 @@ def img2img(
 
     def inpaint(
         self,
-        init_image: Union[torch.FloatTensor, PIL.Image.Image],
+        image: Union[torch.FloatTensor, PIL.Image.Image],
         mask_image: Union[torch.FloatTensor, PIL.Image.Image],
         prompt: Union[str, List[str]],
         negative_prompt: Optional[Union[str, List[str]]] = None,
@@ -1072,11 +1075,11 @@ def inpaint(
         r"""
         Function for inpaint.
         Args:
-            init_image (`torch.FloatTensor` or `PIL.Image.Image`):
+            image (`torch.FloatTensor` or `PIL.Image.Image`):
                 `Image`, or tensor representing an image batch, that will be used as the starting point for the
                 process. This is the image whose masked region will be inpainted.
             mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
+                `Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
                 replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
                 PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
                 contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
@@ -1088,7 +1091,7 @@ def inpaint(
             strength (`float`, *optional*, defaults to 0.8):
                 Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
                 is 1, the denoising process will be run on the masked area for the full number of iterations specified
-                in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
+                in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
                 noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
             num_inference_steps (`int`, *optional*, defaults to 50):
                 The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
@@ -1131,7 +1134,7 @@ def inpaint(
         return self.__call__(
             prompt=prompt,
             negative_prompt=negative_prompt,
-            init_image=init_image,
+            image=image,
             mask_image=mask_image,
             num_inference_steps=num_inference_steps,
             guidance_scale=guidance_scale,